    Best posts made by 183****0229

    • Further pre-training and fine-tuning on a downstream dataset with the UER-py library

      Environment

      • RTX 3090
      • pytorch 1.8.1
      • python 3.8
      • cuda 11.1

      Workflow

      # Change to the working directory
      cd /hy-tmp
      # Download UER-py and install the other dependencies
      git clone https://hub.fastgit.org/dbiir/UER-py.git
      cd UER-py
      pip install -U six
      pip install transformers
      

      Convert the Hugging Face pre-trained weights into the weight format that UER expects

      convert.py is as follows

      # convert.py
      import torch
      import argparse
      import collections
      
      
      def convert_bert_transformer_encoder_from_huggingface_to_uer(input_model, output_model, layers_num):
          for i in range(layers_num):
              output_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.0.weight"] = input_model["bert.encoder.layer." + str(i) + ".attention.self.query.weight"]
              output_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.0.bias"] = input_model["bert.encoder.layer." + str(i) + ".attention.self.query.bias"]
              output_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.1.weight"] = input_model["bert.encoder.layer." + str(i) + ".attention.self.key.weight"]
              output_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.1.bias"] = input_model["bert.encoder.layer." + str(i) + ".attention.self.key.bias"]
              output_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.2.weight"] = input_model["bert.encoder.layer." + str(i) + ".attention.self.value.weight"]
              output_model["encoder.transformer." + str(i) + ".self_attn.linear_layers.2.bias"] = input_model["bert.encoder.layer." + str(i) + ".attention.self.value.bias"]
              output_model["encoder.transformer." + str(i) + ".self_attn.final_linear.weight"] = input_model["bert.encoder.layer." + str(i) + ".attention.output.dense.weight"]
              output_model["encoder.transformer." + str(i) + ".self_attn.final_linear.bias"] = input_model["bert.encoder.layer." + str(i) + ".attention.output.dense.bias"]
              output_model["encoder.transformer." + str(i) + ".layer_norm_1.gamma"] = input_model["bert.encoder.layer." + str(i) + ".attention.output.LayerNorm.weight"]
              output_model["encoder.transformer." + str(i) + ".layer_norm_1.beta"] = input_model["bert.encoder.layer." + str(i) + ".attention.output.LayerNorm.bias"]
              output_model["encoder.transformer." + str(i) + ".feed_forward.linear_1.weight"] = input_model["bert.encoder.layer." + str(i) + ".intermediate.dense.weight"]
              output_model["encoder.transformer." + str(i) + ".feed_forward.linear_1.bias"] = input_model["bert.encoder.layer." + str(i) + ".intermediate.dense.bias"]
              output_model["encoder.transformer." + str(i) + ".feed_forward.linear_2.weight"] = input_model["bert.encoder.layer." + str(i) + ".output.dense.weight"]
              output_model["encoder.transformer." + str(i) + ".feed_forward.linear_2.bias"] = input_model["bert.encoder.layer." + str(i) + ".output.dense.bias"]
              output_model["encoder.transformer." + str(i) + ".layer_norm_2.gamma"] = input_model["bert.encoder.layer." + str(i) + ".output.LayerNorm.weight"]
              output_model["encoder.transformer." + str(i) + ".layer_norm_2.beta"] = input_model["bert.encoder.layer." + str(i) + ".output.LayerNorm.bias"]
      
      
      def main():
          parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
          parser.add_argument("--input_model_path", type=str, default="models/bert-base-chinese/pytorch_model.bin",
                              help=".")
          parser.add_argument("--output_model_path", type=str, default="models/google_zh_model.bin",
                              help=".")
          parser.add_argument("--layers_num", type=int, default=12, help=".")
          parser.add_argument("--target", choices=["bert", "mlm"], default="bert",
                              help="The training target of the pretraining model.")
      
          args = parser.parse_args()
          
          input_model = torch.load(args.input_model_path, map_location='cpu')
          
          output_model = collections.OrderedDict()
          
          output_model["embedding.word_embedding.weight"] = input_model["bert.embeddings.word_embeddings.weight"]
          output_model["embedding.position_embedding.weight"] = input_model["bert.embeddings.position_embeddings.weight"]
          output_model["embedding.segment_embedding.weight"] = torch.cat((torch.Tensor([[0]*input_model["bert.embeddings.token_type_embeddings.weight"].size()[1]]), input_model["bert.embeddings.token_type_embeddings.weight"]), dim=0)
          output_model["embedding.layer_norm.gamma"] = input_model["bert.embeddings.LayerNorm.weight"]
          output_model["embedding.layer_norm.beta"] = input_model["bert.embeddings.LayerNorm.bias"]
          
          convert_bert_transformer_encoder_from_huggingface_to_uer(input_model, output_model, args.layers_num)
          
          if args.target == "bert":
              output_model["target.nsp_linear_1.weight"] = input_model["bert.pooler.dense.weight"]
              output_model["target.nsp_linear_1.bias"] = input_model["bert.pooler.dense.bias"]
              output_model["target.nsp_linear_2.weight"] = input_model["cls.seq_relationship.weight"]
              output_model["target.nsp_linear_2.bias"] = input_model["cls.seq_relationship.bias"]
          output_model["target.mlm_linear_1.weight"] = input_model["cls.predictions.transform.dense.weight"]
          output_model["target.mlm_linear_1.bias"] = input_model["cls.predictions.transform.dense.bias"]
          output_model["target.layer_norm.gamma"] = input_model["cls.predictions.transform.LayerNorm.weight"]
          output_model["target.layer_norm.beta"] = input_model["cls.predictions.transform.LayerNorm.bias"]
          output_model["target.mlm_linear_2.weight"] = input_model["cls.predictions.decoder.weight"]
          output_model["target.mlm_linear_2.bias"] = input_model["cls.predictions.bias"]
          
          torch.save(output_model, args.output_model_path)
      
      if __name__ == "__main__":
          # Download bert-base-chinese from the Hugging Face hub, save it locally,
          # then convert the saved checkpoint into the UER format.
          from transformers import BertForPreTraining
          model = BertForPreTraining.from_pretrained("bert-base-chinese")
          model.save_pretrained("models/bert-base-chinese")
          main()
      
      
      # Create convert.py and paste the code above into it
      touch convert.py
      # Convert the weights
      python convert.py
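
      To sanity-check the conversion, the snippet below (a minimal check that is not part of UER-py; it only uses plain torch) loads both state dicts and compares a few key shapes:

      # check_convert.py -- optional sanity check
      import torch

      hf_state = torch.load("models/bert-base-chinese/pytorch_model.bin", map_location="cpu")
      uer_state = torch.load("models/google_zh_model.bin", map_location="cpu")

      print("huggingface tensors:", len(hf_state))
      print("uer tensors:", len(uer_state))

      # Spot-check shapes that must match after conversion.
      assert uer_state["embedding.word_embedding.weight"].shape == \
             hf_state["bert.embeddings.word_embeddings.weight"].shape
      assert uer_state["encoder.transformer.0.self_attn.linear_layers.0.weight"].shape == \
             hf_state["bert.encoder.layer.0.attention.self.query.weight"].shape
      print("shapes look consistent")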
      

      # Data preprocessing
      python preprocess.py    --corpus_path corpora/book_review_bert.txt \
                              --vocab_path models/google_zh_vocab.txt \
                              --dataset_path dataset.pt --processes_num 8 \
                              --target bert
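
      The book_review_bert.txt corpus ships with the UER-py repo. According to the UER-py quickstart, a corpus for the bert target has one sentence per line, with documents separated by a blank line; a toy illustration of that layout (the file name is a placeholder):

      # Write a tiny corpus in the format expected by `--target bert`.
      toy_docs = [
          ["这本书很好看。", "强烈推荐。"],
          ["内容一般。", "不太值得买。"],
      ]
      with open("corpora/my_corpus.txt", "w", encoding="utf-8") as f:
          for doc in toy_docs:
              for sentence in doc:
                  f.write(sentence + "\n")
              f.write("\n")  # a blank line ends a document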
      

      # Further pre-training
      python pretrain.py  --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt \
                          --pretrained_model_path models/google_zh_model.bin \
                          --output_model_path models/book_review_model.bin \
                          --gpu_ranks 0 \
                          --total_steps 5000 --save_checkpoint_steps 1000 --batch_size 16 \
                          --embedding word_pos_seg --encoder transformer --mask fully_visible --target bert
      # Rename the saved checkpoint
      mv models/book_review_model.bin-5000 models/book_review_model.bin
      

      # Fine-tuning
      python finetune/run_classifier.py --pretrained_model_path models/book_review_model.bin \
                          --vocab_path models/google_zh_vocab.txt \
                          --train_path datasets/douban_book_review/train.tsv \
                          --dev_path datasets/douban_book_review/dev.tsv \
                          --test_path datasets/douban_book_review/test.tsv \
                          --epochs_num 3 --batch_size 64 \
                          --embedding word_pos_seg --encoder transformer --mask fully_visible
      

      REFERENCE

      https://github.com/dbiir/UER-py/wiki/Quickstart

      posted in Technical Discussion
    • [Tools Part 8] Install code-server and use VS Code right from your browser!

      Result


      Quick steps

      • Copy the install.sh file and run the install
      • Stop the tensorboard service and start the code-server service
      • Look up the code-server login password and log in

      Detailed steps

      • Copy the install.sh file and run the install
        Create an install.sh file and paste in the contents of https://raw.githubusercontent.com/cdr/code-server/main/install.sh.

        Install it with: bash install.sh
      • Stop the tensorboard service with supervisorctl stop tensorboard, then start the code-server service with code-server --bind-addr 0.0.0.0:6006
      • Check the code-server login password with cat ~/.config/code-server/config.yaml, then log in


        Note: the address you log in at is the link previously used for tensorboard.

      Tips:

      • If the instance is restarted, you need to stop the tensorboard service and start the code-server service again

      • Installation guide: https://github.com/cdr/code-server/blob/main/docs/install.md

      posted in Beginner Tutorials
    • [1] Text classification with pytorch_lightning + transformers + torchmetrics + datasets

      1. Environment

      • pytorch 1.8.1
      • python 3.8.1
      • cuda 11.1

      2. Results

      Code               Pretrained weights             Accuracy  Notes
      simpletransformers hfl/chinese-roberta-wwm-ext    66.85%    training result from the earlier article
      this post          bert-base-chinese              66.56%    bert-base-chinese pretrained weights
      this post          junnyu/roformer_chinese_base   67.52%    junnyu/roformer_chinese_base pretrained weights
      this post          hfl/chinese-roberta-wwm-ext    66.83%    hfl/chinese-roberta-wwm-ext pretrained weights
      this post          hfl/chinese-xlnet-base         66.8%     hfl/chinese-xlnet-base pretrained weights
      this post          junnyu/roformer_chinese_base   62.6%     [buggy setup] bert-base-chinese vocab + junnyu/roformer_chinese_base weights

      3. Text classification with pytorch_lightning + transformers + torchmetrics + datasets

      # Change to the working directory
      cd /hy-tmp
      # Download the code
      git clone https://hub.fastgit.org/junnyu/hy_tutorial.git
      cd hy_tutorial
      # Unzip
      unzip tnews_classfication_pl.zip
      cd tnews_classfication_pl
      # Install dependencies
      pip install -r requirements.txt
      

      4. Train with bert-base-chinese

      bash train.sh
      


      5. Train with the junnyu/roformer_chinese_base pretrained model

      bash train_roformer.sh
      


      6. Train with the hfl/chinese-roberta-wwm-ext pretrained model

      bash train_roberta.sh
      


      7. Other

      Result of training with the bert-base-chinese tokenizer together with the roformer weights: even though this pairing is wrong, it still reaches over 60% accuracy.

      8. To be continued. The next few articles will walk through the training code in detail; a rough sketch of how the libraries fit together is shown below.
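
      A minimal, hedged sketch (not the code from the hy_tutorial repo; the model name, label count and hyperparameters are placeholders) of how transformers, pytorch_lightning and torchmetrics typically combine for text classification:

      import pytorch_lightning as pl
      import torch
      import torchmetrics
      from transformers import AutoModelForSequenceClassification

      class LitTextClassifier(pl.LightningModule):
          def __init__(self, model_name="bert-base-chinese", num_labels=15, lr=2e-5):
              super().__init__()
              self.save_hyperparameters()
              self.model = AutoModelForSequenceClassification.from_pretrained(
                  model_name, num_labels=num_labels)
              # torchmetrics >= 0.11 needs the task argument; older versions accept Accuracy().
              self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_labels)

          def training_step(self, batch, batch_idx):
              out = self.model(**batch)  # batch holds input_ids, attention_mask, labels
              self.log("train_loss", out.loss)
              return out.loss

          def validation_step(self, batch, batch_idx):
              out = self.model(**batch)
              preds = out.logits.argmax(dim=-1)
              self.val_acc.update(preds, batch["labels"])
              self.log("val_acc", self.val_acc, on_epoch=True, prog_bar=True)

          def configure_optimizers(self):
              return torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)

      A pl.Trainer(max_epochs=3) plus DataLoaders built from a tokenized datasets dataset would then drive training; the actual scripts (train.sh and friends) live in the repo above.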

      posted in Technical Discussion
    • [3] Interpreting BERT models with the Captum library

      Captum is a model interpretability and understanding library for PyTorch. Captum means comprehension in Latin, and it contains general-purpose implementations of integrated gradients, saliency maps, SmoothGrad, VarGrad and more for PyTorch models. It can quickly integrate with models built on domain-specific libraries such as torchvision, torchtext, and others.
      https://github.com/pytorch/captum/blob/master/tutorials/Bert_SQUAD_Interpret2.ipynb

      Interpreting BertLayer Outputs and Self-Attention Matrices in each Layer

      Now let’s look into the layers of our network. More specifically we would like to look into the distribution of attribution scores for each token across all layers and attribution matrices for each head in all layers in Bert model.
      We do that using one of the layer attribution algorithms, namely, layer conductance. However, we encourage you to try out and compare the results with other algorithms as well.

      Let’s configure InterpretableEmbeddingsBase again, in this case in order to interpret the layers of our model.


      interpretable_embedding = configure_interpretable_embedding_layer(model, 'bert.embeddings.word_embeddings')
      
      c:\users\yujun\appdata\local\programs\python\python37\lib\site-packages\captum\attr\_models\base.py:189: UserWarning: In order to make embedding layers more interpretable they will be replaced with an interpretable embedding layer which wraps the original embedding layer and takes word embedding vectors as inputs of the forward function. This allows us to generate baselines for word embeddings and compute attributions for each embedding dimension. The original embedding layer must be set back by calling `remove_interpretable_embedding_layer` function after model interpretation is finished. 
        "In order to make embedding layers more interpretable they will "
      

      Let’s iterate over all layers and compute the attributions w.r.t. all tokens in the input and attention matrices.

      Note: Since below code is iterating over all layers it can take over 5 seconds. Please be patient!


      layer_attrs_start = []
      layer_attrs_end = []
      
      layer_attn_mat_start = []
      layer_attn_mat_end = []
      
      input_embeddings, ref_input_embeddings = construct_whole_bert_embeddings(input_ids, ref_input_ids, \
                                               token_type_ids=token_type_ids, ref_token_type_ids=ref_token_type_ids, \
                                               position_ids=position_ids, ref_position_ids=ref_position_ids)
      
      for i in range(model.config.num_hidden_layers):
          lc = LayerConductance(squad_pos_forward_func, model.bert.encoder.layer[i])
          layer_attributions_start = lc.attribute(inputs=input_embeddings, baselines=ref_input_embeddings, additional_forward_args=(token_type_ids, position_ids,attention_mask, 0))
          layer_attributions_end = lc.attribute(inputs=input_embeddings, baselines=ref_input_embeddings, additional_forward_args=(token_type_ids, position_ids,attention_mask, 1))
          
          layer_attrs_start.append(summarize_attributions(layer_attributions_start[0]))
          layer_attrs_end.append(summarize_attributions(layer_attributions_end[0]))
      
          layer_attn_mat_start.append(layer_attributions_start[1])
          layer_attn_mat_end.append(layer_attributions_end[1])
      
          
      
      # layer x seq_len
      layer_attrs_start = torch.stack(layer_attrs_start)
      # layer x seq_len
      layer_attrs_end = torch.stack(layer_attrs_end)
      
      # layer x batch x head x seq_len x seq_len
      layer_attn_mat_start = torch.stack(layer_attn_mat_start)
      # layer x batch x head x seq_len x seq_len
      layer_attn_mat_end = torch.stack(layer_attn_mat_end)
      

      As a reminder of Part 1 we visualize the heatmaps of the attributions for the outputs of all 12 layers in the plots below. The outputs of 12 layers are also known as context layer which represents the dot product between the attribution matrices and value vector.

      The plot below represents a heatmap of attributions across all layers and tokens for the start position prediction.

      Note that here we do not have information about different heads. Heads related information will be examined separately when we visualize the attribution scores of the attention matrices w.r.t. the start or end position predictions.

      It is interesting to observe that the question word what gains increasingly high attribution from layer one to ten. In the last two layers that importance is slowly diminishing.
      In contrast to the what token, many other tokens have negative or close to zero attribution in the first 6 layers.

      We start seeing slightly higher attribution in tokens important, us and to. Interestingly token important is also assigned high attribution score which is remarkably high in the fifth and sixth layers.

      Lastly, our correctly predicted token to gains increasingly high positive attribution especially in the last two layers.


      fig, ax = plt.subplots(figsize=(15,5))
      xticklabels=all_tokens
      yticklabels=list(range(1,13))
      ax = sns.heatmap(layer_attrs_start.cpu().detach().numpy(), xticklabels=xticklabels, yticklabels=yticklabels, linewidth=0.2)
      plt.xlabel('Tokens')
      plt.ylabel('Layers')
      plt.show()
      

      Now let’s examine the heat map of the attributions for the end position prediction. In the case of end position prediction we again observe high attribution scores for the token what in the last 11 layers.
      Correctly predicted end token kinds has positive attribution across all layers and it is especially prominent in the last two layers. It’s also interesting to observe that humans token also has relatively high attribution score in the last two layers.


      fig, ax = plt.subplots(figsize=(15,5))
      
      xticklabels=all_tokens
      yticklabels=list(range(1,13))
      ax = sns.heatmap(layer_attrs_end.cpu().detach().numpy(), xticklabels=xticklabels, yticklabels=yticklabels, linewidth=0.2) #, annot=True
      plt.xlabel('Tokens')
      plt.ylabel('Layers')
      
      plt.show()
      

      It is interesting to note that when we compare the heat maps of start and end position, overall the colors for start position prediction on the map have darker intensities. This implies that there are less tokens that attribute positively to the start position prediction and there are more tokens which are negative indicators or signals of start position prediction.


      Interpreting Attribution Scores for Attention Matrices

      In this section we visualize the attribution scores of start and end position predictions w.r.t. attention matrices.
      Note that each layer has 12 heads, hence attention matrices. We will first visualize for a specific layer and head, later we will summarize across all heads in order to gain a bigger picture.


      Below we visualize the attribution scores of 12 heads for selected layer layer for start position prediction.


      visualize_token2token_scores(layer_attn_mat_start[layer].squeeze().cpu().detach().numpy())
      

      As we can see from the visualizations above, in contrary to attention scores the attributions of specific target w.r.t. to those scores are more meaningful and most importantly, they do not attend to [SEP] token or show diagonal patterns. We observe that heads 4, 9, 12 and 2 show strong relationship between what and it tokens when predicting start position, head 10 and 11 between it and it, heads 8 between important and to and head 1 between to and what. Note that to token is the start position of the answer token. It is also important to mention that these observations are for a selected layer. We can change the index of selected layer and examine interesting relationships in other layers.


      In the cell below we visualize the attention attribution scores normalized across the head axis.


      visualize_token2token_scores(norm_fn(layer_attn_mat_start, dim=2).squeeze().detach().cpu().numpy(),
                                   x_label_name='Layer')
      

      By looking at the visualizations above we can see that the model pays attention to very specific handpicked relationships when making a prediction for the start position. Most notably in the layers 10, 7, 11 and 4 it focuses more on the relationships between it and is, important and to.


      Now let’s run the same experiments for the end position prediction. Below we visualize the attribution scores of the attention matrices for the end position prediction for the selected layer.


      visualize_token2token_scores(layer_attn_mat_end[layer].squeeze().cpu().detach().numpy())
      

      As we can see from the visualizations above that for the end position prediction we have stronger attention towards the end of the answer token kinds. Here we can see stronger connection between humans and kinds in the 11th head, it and em, power, and in the 5th, 6th and 8th heads. The connections between it and what are also strong in first couple and 10th heads.


      Similar to start position let’s visualize the norm across all heads for each layer.


      visualize_token2token_scores(norm_fn(layer_attn_mat_end, dim=2).squeeze().detach().cpu().numpy(),
                                   x_label_name='Layer')
      

      As we can see from the visualizations above for the end position prediction there is a relation learnt between [SEP] and . in first and second layers. Also we observe that it token is strongly related to what, important and to.


      Computing and Visualizing Vector Norms

      In this section of the tutorial we will compute Vector norms for activation layers such as ||f(x)||, ||α * f(x)|| and ||Σαf(x)|| as also described in the: https://arxiv.org/pdf/2004.10102.pdf

      As also shown in the paper mentioned above, normalized activations are better indicators of importance scores than the attention scores however they aren’t as indicative as the attribution scores. This is because normalized activations ||f(x)|| and ||α * f(x)|| aren’t attributed to a specific output prediction. From our results we can also see that according to those normalized scores [SEP] tokens are insignificant.


      Below we define / extract all the parameters we need to compute the vector norms.


      output_attentions_all_shape = output_attentions_all.shape
      
      batch = output_attentions_all_shape[1]
      num_heads = output_attentions_all_shape[2]
      head_size = 64       # hidden_size // num_heads for BERT-base (768 / 12)
      all_head_size = 768  # hidden size of BERT-base
      
      

      In order to compute above mentioned norms we need to get access to dense layer’s weights and value vector of the self attention layer.


      Getting Access to Value Activations

      Let’s define the list of all layers for which we would like to access Value Activations.


      layers = [model.bert.encoder.layer[layer].attention.self.value for layer in range(len(model.bert.encoder.layer))]
      

      We use Captum’s LayerActivation algorithm to access the outputs of all layers.


      la = LayerActivation(squad_pos_forward_func, layers)
      
      value_layer_acts = la.attribute(input_embeddings, additional_forward_args=(token_type_ids, position_ids, attention_mask))
      # shape -> layer x batch x seq_len x all_head_size
      value_layer_acts = torch.stack(value_layer_acts)
      

      In the cell below we perform several transformations with the value layer activations and bring it to the shape so that we can compute different norms. The transformations are done the same way as it is described in the original paper and corresponding github implementation.


      new_x_shape = value_layer_acts.size()[:-1] + (num_heads, head_size)
      value_layer_acts = value_layer_acts.view(*new_x_shape)
      
      # layer x batch x num_heads x seq_len x head_size
      value_layer_acts = value_layer_acts.permute(0, 1, 3, 2, 4)
      
      value_layer_acts = value_layer_acts.permute(0, 1, 3, 2, 4).contiguous()
      value_layer_acts_shape = value_layer_acts.size()
      
      # layer x batch x seq_length x num_heads x 1 x head_size
      value_layer_acts = value_layer_acts.view(value_layer_acts_shape[:-1] + (1, value_layer_acts_shape[-1],))
      
      print('value_layer_acts: ', value_layer_acts.shape)
      
      value_layer_acts:  torch.Size([12, 1, 26, 12, 1, 64])
      

      Getting Access to Dense Features

      Now let’s transform dense features so that we can use them to compute ||f(x)|| and ||α * f(x)||.


      dense_acts = torch.stack([dlayer.attention.output.dense.weight for dlayer in model.bert.encoder.layer])
      
      dense_acts = dense_acts.view(len(layers), all_head_size, num_heads, head_size)
      
      # layer x num_heads x head_size x all_head_size
      dense_acts = dense_acts.permute(0, 2, 3, 1).contiguous()
      

      Computing f(x) score by multiplying the value vector with the weights of the dense vector for all layers.


      # layers, batch, seq_length, num_heads, 1, all_head_size
      f_x = torch.stack([value_layer_acts_i.matmul(dense_acts_i) for value_layer_acts_i, dense_acts_i in zip(value_layer_acts, dense_acts)])
      f_x.shape
      
      torch.Size([12, 1, 26, 12, 1, 768])
      
      # layer x batch x seq_length x num_heads x 1 x all_head_size)
      f_x_shape = f_x.size() 
      f_x = f_x.view(f_x_shape[:-2] + (f_x_shape[-1],))
      f_x = f_x.permute(0, 1, 3, 2, 4).contiguous() 
      
      #(layers x batch, num_heads, seq_length, all_head_size)
      f_x_shape = f_x.size() 
      
      #(layers x batch, num_heads, seq_length)
      f_x_norm = norm_fn(f_x, dim=-1)
      
      

      Now let’s visualize ||f(x)|| scores for all layers and examine the distribution of those scores.


      visualize_token2head_scores(f_x_norm.squeeze().detach().cpu().numpy())
      

      When we examine ||f(x)|| scores for all layers we can easily see that the [SEP] token receives the lowest score across all layers. This is one of the conclusions that the original paper came to. In terms of other tokens we can see that the heads in different layers focus on different parts of the input sentence.


      Now let’s compute ||α * f_x||. This computation is performed using the original paper’s technique with the help of einsum operator.


      # layer x batch x num_heads x seq_length x seq_length x all_head_size
      alpha_f_x = torch.einsum('lbhks,lbhsd->lbhksd', output_attentions_all, f_x)
      
      # layer x batch x num_heads x seq_length x seq_length
      alpha_f_x_norm = norm_fn(alpha_f_x, dim=-1)
      

      Let’s now visualize ||α * f_x|| scores for the layer with index layer.


      visualize_token2token_scores(alpha_f_x_norm[layer].squeeze().detach().cpu().numpy())
      

      As we can see from the visualizations above there is no strong attention to [SEP] or [CLS] tokens. Some of the heads show diagonal patterns and some of them show strong attention between specific pairs of tokens.


      Now let’s compute the summed norm across num_heads axis ||Σαf(x)|| and visualize normalized scores for each layer.


      summed_alpha_f_x = alpha_f_x.sum(dim=2)
      
      # layers x batch x seq_length x seq_length
      summed_alpha_f_x_norm = norm_fn(summed_alpha_f_x, dim=-1)
      
      
      visualize_token2token_scores(summed_alpha_f_x_norm.squeeze().cpu().detach().numpy(), x_label_name='Layer')
      

      The visualizations above also confirm that the attention scores aren’t concentrated on tokens such as [CLS], [SEP] and .; however, we see stronger signals along the diagonals and some patches of stronger signals between certain parts of the text, including some tokens in the question part that are relevant to the answer.


      It is important to mention that all experiments were performed for one input sample, namely, sentence. In the papers we often see aggregation of the results across multiple samples. For further analysis and more convincing propositions we recommend to conduct the experiments across multiple input samples. In addition to that it would be also interesting to look into the correlation of heads in layer and across different layers.


      posted in Technical Discussion
    • [2] Interpreting BERT models with the Captum library

      Captum is a model interpretability and understanding library for PyTorch. Captum means comprehension in Latin, and it contains general-purpose implementations of integrated gradients, saliency maps, SmoothGrad, VarGrad and more for PyTorch models. It can quickly integrate with models built on domain-specific libraries such as torchvision, torchtext, and others.
      https://github.com/pytorch/captum/blob/master/tutorials/Bert_SQUAD_Interpret2.ipynb

      Interpreting BERT Models (Part 2)

      In the second part of interpreting Bert models we look into attention matrices, their importance scores, vector norms and compare them with the results that we found in Part 1.

      Similar to Part 1 we use Bert Question Answering model fine-tuned on SQUAD dataset using transformers library from Hugging Face: https://huggingface.co/transformers/

      In order to be able to use the same setup and reproduce the results from Part 1, we will redefine the same setup and helper functions in this tutorial as well.

      In this tutorial we compare attention matrices with their importance scores when we attribute them to a particular class, and vector norms as proposed in paper: https://arxiv.org/pdf/2004.10102.pdf

      We show that the importance scores computed for the attention matrices and specific class are more meaningful than the attention matrices alone or different norm vectors computed for different input activations.

      Note: Before running this tutorial, please install seaborn, pandas and matplotlib, transformers(from hugging face) python packages in addition to Captum and torch libraries.

      This tutorial was built using transformer version 4.3.0.


      import os
      
      import numpy as np
      import pandas as pd
      import seaborn as sns
      import matplotlib.pyplot as plt
      
      import torch
      import torch.nn as nn
      
      from transformers import BertTokenizer, BertForQuestionAnswering, BertConfig
      
      from captum.attr import visualization as viz
      from captum.attr import IntegratedGradients, LayerConductance, LayerIntegratedGradients, LayerActivation
      from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer
      
      device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
      

      The first step is to fine-tune the BERT model on the SQUAD dataset. This can be easily accomplished by following the steps described on Hugging Face’s official website: https://github.com/huggingface/transformers#run_squadpy-fine-tuning-on-squad-for-question-answering

      Note that the fine-tuning is done on a bert-base-cased pre-trained model.


      After we pretrain the model, we can load the tokenizer and pre-trained BERT model using the commands described below.


      # replace <PATH-TO-SAVED-MODEL> with the real path of the saved model
      model_path = 'bert-base-cased-squad2'
      
      # load model
      model = BertForQuestionAnswering.from_pretrained(model_path, output_attentions=True)
      model.to(device)
      model.eval()
      model.zero_grad()
      
      # load tokenizer
      tokenizer = BertTokenizer.from_pretrained(model_path,do_lower_case=False)
      

      A helper function to perform forward pass of the model and make predictions.


      def predict(inputs, token_type_ids=None, position_ids=None, attention_mask=None):
          output = model(inputs, token_type_ids=token_type_ids,
                       position_ids=position_ids, attention_mask=attention_mask, )
          return output.start_logits, output.end_logits, output.attentions
      

      Defining a custom forward function that will allow us to access the start and end positions of our prediction using position input argument.


      def squad_pos_forward_func(inputs, token_type_ids=None, position_ids=None, attention_mask=None, position=0):
          pred = model(inputs_embeds=inputs, token_type_ids=token_type_ids,
                       position_ids=position_ids, attention_mask=attention_mask, )
          pred = pred[position]
          return pred.max(1).values
      

      Let’s define some variables and functions that will help us to compute the attribution of attention matrices for specific output such as start or end positions of the prediction.

      To do so, we need to define baselines / references, numericalize both the baselines and the inputs. We will define helper functions to achieve that.

      The cell below defines numericalized special tokens that will be later used for constructing inputs and corresponding baselines/references.


      ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
      sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
      cls_token_id = tokenizer.cls_token_id # A token used for prepending to the concatenated question-text word sequence
      

      Below we define a set of helper function for constructing references / baselines for word tokens, token types and position ids.


      def construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id):
          question_ids = tokenizer.encode(question, add_special_tokens=False)
          text_ids = tokenizer.encode(text, add_special_tokens=False)
      
          # construct input token ids
          input_ids = [cls_token_id] + question_ids + [sep_token_id] + text_ids + [sep_token_id]
      
          # construct reference token ids 
          ref_input_ids = [cls_token_id] + [ref_token_id] * len(question_ids) + [sep_token_id] + \
              [ref_token_id] * len(text_ids) + [sep_token_id]
      
          return torch.tensor([input_ids], device=device), torch.tensor([ref_input_ids], device=device), len(question_ids)
      
      def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
          seq_len = input_ids.size(1)
          token_type_ids = torch.tensor([[0 if i <= sep_ind else 1 for i in range(seq_len)]], device=device)
          ref_token_type_ids = torch.zeros_like(token_type_ids, device=device)# * -1
          return token_type_ids, ref_token_type_ids
      
      def construct_input_ref_pos_id_pair(input_ids):
          seq_length = input_ids.size(1)
          position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
          # we could potentially also use random permutation with `torch.randperm(seq_length, device=device)`
          ref_position_ids = torch.zeros(seq_length, dtype=torch.long, device=device)
      
          position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
          ref_position_ids = ref_position_ids.unsqueeze(0).expand_as(input_ids)
          return position_ids, ref_position_ids
          
      def construct_attention_mask(input_ids):
          return torch.ones_like(input_ids)
          
      def construct_whole_bert_embeddings(input_ids, ref_input_ids, \
                                          token_type_ids=None, ref_token_type_ids=None, \
                                          position_ids=None, ref_position_ids=None):
          input_embeddings = interpretable_embedding.indices_to_embeddings(input_ids)
          ref_input_embeddings = interpretable_embedding.indices_to_embeddings(ref_input_ids)
          
          return input_embeddings, ref_input_embeddings
      
      

      Let’s define the question - text pair that we’d like to use as an input for our Bert model and interpret what the model was focusing on when predicting an answer to the question from given input text


      question, text = "What is important to us?", "It is important to us to include, empower and support humans of all kinds."
      
      

      Let’s numericalize the question, the input text and generate corresponding baselines / references for all three sub-embeddings (word, token type and position embeddings) types using our helper functions defined above.


      input_ids, ref_input_ids, sep_id = construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id)
      token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(input_ids, sep_id)
      position_ids, ref_position_ids = construct_input_ref_pos_id_pair(input_ids)
      attention_mask = construct_attention_mask(input_ids)
      
      indices = input_ids[0].detach().tolist()
      all_tokens = tokenizer.convert_ids_to_tokens(indices)
      

      Also, let’s define the ground truth for prediction’s start and end positions.


      ground_truth = 'to include, empower and support humans of all kinds'
      
      ground_truth_tokens = tokenizer.encode(ground_truth, add_special_tokens=False)
      ground_truth_end_ind = indices.index(ground_truth_tokens[-1])
      ground_truth_start_ind = ground_truth_end_ind - len(ground_truth_tokens) + 1
      

      Now let’s make predictions using input, token type, position id and a default attention mask.


      start_scores, end_scores, output_attentions = predict(input_ids,
                                         token_type_ids=token_type_ids, \
                                         position_ids=position_ids, \
                                         attention_mask=attention_mask)
      
      
      print('Question: ', question)
      print('Predicted Answer: ', ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
      
      Question:  What is important to us?
      Predicted Answer:  to include , em ##power and support humans of all kinds
      

      Visualizing Attention Matrices

      output_attentions represent attention matrices aka attention probabilities for all 12 layers and all 12 heads. It represents softmax-normalized dot-product between the key and query vectors. In the literature (https://www.aclweb.org/anthology/W19-4828.pdf) it has been used as an importance indicator of how much a token attends / relates to another token in the text. In case of translation for example it is a good indicator of how much a token in one language attends to the corresponding translation in another language. In case of Question Answering model it indicates which tokens attend / relate to each other in question, text or answer segment.

      Since output_attentions contains the layers in a list, we will stack them in order to move everything into a tensor.


      # shape -> layer x batch x head x seq_len x seq_len
      output_attentions_all = torch.stack(output_attentions)
      
      

      A helper function for visualizing Token-To-Token matrices

      The helper function below will be used for visualizing token-to-token relation / attention scores for all heads in a given layer, or for all layers across all heads.


      def visualize_token2token_scores(scores_mat, x_label_name='Head'):
          fig = plt.figure(figsize=(20, 20))
      
          for idx, scores in enumerate(scores_mat):
              scores_np = np.array(scores)
              ax = fig.add_subplot(4, 3, idx+1)
              # append the attention weights
              im = ax.imshow(scores, cmap='viridis')
      
              fontdict = {'fontsize': 10}
      
              ax.set_xticks(range(len(all_tokens)))
              ax.set_yticks(range(len(all_tokens)))
      
              ax.set_xticklabels(all_tokens, fontdict=fontdict, rotation=90)
              ax.set_yticklabels(all_tokens, fontdict=fontdict)
              ax.set_xlabel('{} {}'.format(x_label_name, idx+1))
      
              fig.colorbar(im, fraction=0.046, pad=0.04)
          plt.tight_layout()
          plt.show()
      

      A helper function for visualizing Token-To-Head matrices

      The helper function below will be used for visualizing the importance scores for tokens across all heads in all layers.


      def visualize_token2head_scores(scores_mat):
          fig = plt.figure(figsize=(30, 50))
      
          for idx, scores in enumerate(scores_mat):
              scores_np = np.array(scores)
              ax = fig.add_subplot(6, 2, idx+1)
              # append the attention weights
              im = ax.matshow(scores_np, cmap='viridis')
      
              fontdict = {'fontsize': 20}
      
              ax.set_xticks(range(len(all_tokens)))
              ax.set_yticks(range(len(scores)))
      
              ax.set_xticklabels(all_tokens, fontdict=fontdict, rotation=90)
              ax.set_yticklabels(range(len(scores[0])), fontdict=fontdict)
              ax.set_xlabel('Layer {}'.format(idx+1))
      
              fig.colorbar(im, fraction=0.046, pad=0.04)
          plt.tight_layout()
          plt.show()
      

      Let’s examine a specific layer. For that reason we will define a fixed layer id that will be used for visualization purposes. The users are free to change this layer if they want to examine a different one.


      layer = 11
      

      Visualizing attention matrices for a selected layer layer.


      visualize_token2token_scores(output_attentions_all[layer].squeeze().detach().cpu().numpy())
      

      Based on the visualizations above we observe that there is a high attention set along the diagonals and on an uninformative token such as [SEP]. This is something that was observed in previous papers which indicates that attention matrices aren’t always a good indicator of finding which tokens are more important or which token is related to which. We observe similar pattern when we examine another layer.


      In the cell below we compute and visualize L2 norm across head axis for all 12 layer. This provides a summary for each layer across all heads.


      Defining normalization function depending on pytorch version.


      if torch.__version__ >= '1.7.0':
          norm_fn = torch.linalg.norm
      else:
          norm_fn = torch.norm
      
      visualize_token2token_scores(norm_fn(output_attentions_all, dim=2).squeeze().detach().cpu().numpy(),
                                   x_label_name='Layer')
      

      Based on the visualiziation above we can convince ourselves that attention scores aren’t trustworthy measures of importances for token-to-token relations across all layers. We see strong signal along the diagonal and for the [SEP] and [CLS] tokens. These signals, however, aren’t true indicators of what semantic the model learns.


      Visualizing attribution / importance scores

      In the cells below we visualize the attribution scores of the attention matrices for the start and end position predictions and compare them with the actual attention matrices. To do so, we first compute the attribution scores using the LayerConductance algorithm, similar to Part 1.


      A helper function to summarize attributions for each word token in the sequence.


      def summarize_attributions(attributions):
          attributions = attributions.sum(dim=-1).squeeze(0)
          attributions = attributions / norm_fn(attributions)
          return attributions
      
      posted in Technical Discussion
    • RE: [Prize Topic NO.7] The legendary all-purpose Transformer, have you used it?

      I can’t answer this one, transformer is all you need!

      posted in Prize Topics
    • [Precision Alignment 2] Aligning tf2.x and pytorch model outputs

      Today we continue with the remaining tf2.x vs. pytorch precision-alignment checks:

      • 5、GRU bidirectional
      • 6、nn.BatchNorm1d vs layers.BatchNormalization
      • 7、nn.LayerNorm vs layers.LayerNormalization
      • 8、Conv2d valid padding
      • 9、Conv2d same padding

      Imports and helper utilities

      import torch
      import torch.nn as nn
      import torch.nn.functional as F
      
      import tensorflow as tf
      from tensorflow import keras
      import numpy as np
      import math
      
      # Let TensorFlow allocate GPU memory on demand
      gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
      for gpu in gpus:
          tf.config.experimental.set_memory_growth(device=gpu, enable=True)
      
      def reoder_process(x):
          # PyTorch GRU gates are stored in (reset, update, new) order while Keras uses
          # (update, reset, new), so swap the first two blocks of hidden_size=4 units.
          reoder_index = [4,5,6,7,0,1,2,3,8,9,10,11]
          if len(x.shape)==2:
              return x[:,reoder_index]
          else:
              return x[reoder_index]
      
      def pad(x, kernel_size=3, dilation=1):
          """For stride=1 or stride = 2 or stride = 3"""
          pad_total = dilation * (kernel_size - 1)
          pad_beg = pad_total // 2
          pad_end = pad_total - pad_beg
      
          x_padded = F.pad(
              x, pad=(pad_beg, pad_end, pad_beg, pad_end))
          
          return x_padded
      
      
      def compare_difference(a,b):
          o = np.abs((a.detach().numpy()-b.numpy())).max()
          print(f"max difference is {o}")
          o = np.abs((a.detach().numpy()-b.numpy())).mean()
          print(f"mean difference is {o}")
      
      

      5、GRU bidirectional

      # Build the models
      pt_gru_bi = nn.GRU(input_size=2,hidden_size=4,batch_first=True,num_layers=1,bidirectional=True)
      tf_gru_bi = keras.layers.Bidirectional(layer=keras.layers.GRU(units=4,return_sequences=True,return_state=True),merge_mode='concat')
      tf_gru_bi.build(input_shape=(None,3,2))
      
      # Copy the weights
      forward_input_kernel = reoder_process(pt_gru_bi.weight_ih_l0.T.detach().numpy())
      forward_recur_kernel = reoder_process(pt_gru_bi.weight_hh_l0.T.detach().numpy())
      forward_bias = torch.stack([reoder_process(pt_gru_bi.bias_ih_l0.detach()),reoder_process(pt_gru_bi.bias_hh_l0.detach())]).numpy()
      
      backward_input_kernel = reoder_process(pt_gru_bi.weight_ih_l0_reverse.T.detach().numpy())
      backward_recur_kernel = reoder_process(pt_gru_bi.weight_hh_l0_reverse.T.detach().numpy())
      backward_bias = torch.stack([reoder_process(pt_gru_bi.bias_ih_l0_reverse.detach()),reoder_process(pt_gru_bi.bias_hh_l0_reverse.detach())]).numpy() 
      
      weights = [forward_input_kernel,forward_recur_kernel,forward_bias,backward_input_kernel,backward_recur_kernel,backward_bias]
      
      tf_gru_bi.set_weights(weights=weights)
      
      # Compare
      x = np.random.randn(1,3,2).astype(np.float32)
      pt_x = torch.from_numpy(x)
      tf_x = tf.constant(x)
      
      # pt_outputs holds, for every position in the sequence, the concatenated (forward_hidden, backward_hidden); pt_hidden_states holds the forward hidden state at the last position and the backward hidden state at the first position
      pt_outputs,pt_hidden_states = pt_gru_bi(pt_x)
      tf_outputs,tf_forward_hidden_states,tf_backward_hidden_states = tf_gru_bi(tf_x)
      tf_hidden_states = tf.stack([tf_forward_hidden_states,tf_backward_hidden_states])
      
      compare_difference(pt_outputs,tf_outputs)
      # max difference is 2.9802322387695312e-08
      # mean difference is 1.1253480813877559e-08
      compare_difference(pt_hidden_states,tf_hidden_states)
      # max difference is 2.9802322387695312e-08
      # mean difference is 1.0943040251731873e-08
      

      6、nn.BatchNorm1d vs layers.BatchNormalization

      # Build the models
      # Input: (b, C) or (N, C, L); BatchNorm1d normalizes over the C dimension
      pt_bn = nn.BatchNorm1d(num_features=4,eps=1e-05) # normalizes over dim=1
      tf_bn = keras.layers.BatchNormalization(axis=1,epsilon=1e-05)
      tf_bn.build(input_shape=(None,4))
      
      x = np.random.randn(5,4).astype(np.float32)
      pt_x = torch.from_numpy(x)
      tf_x = tf.constant(x)
      
      a=pt_bn(pt_x)
      b=tf_bn(tf_x,training=True)
      compare_difference(a,b)
      # max difference is 1.4901161193847656e-07
      # mean difference is 2.9802322387695312e-08
      

      7、nn.LayerNorm vs layers.LayerNormalization

      # Build the models
      pt_lm = nn.LayerNorm(normalized_shape=4,eps=1e-05)
      tf_lm = keras.layers.LayerNormalization(axis=-1,epsilon=1e-05)
      tf_lm.build(input_shape=(None,4))
      
      # Compare
      x = np.random.randn(2,4).astype(np.float32)
      pt_x = torch.from_numpy(x)
      tf_x = tf.constant(x)
      a=pt_lm(pt_x)
      b=tf_lm(tf_x)
      compare_difference(a,b)
      # max difference is 2.384185791015625e-07
      # mean difference is 2.9802322387695312e-08
      

      8、Conv2d valid padding

      # Build the models
      pt_conv2d = nn.Conv2d(in_channels=10,out_channels=2,kernel_size=3)
      tf_conv2d = keras.layers.Conv2D(filters=2,kernel_size=3)
      tf_conv2d.build(input_shape=(None,5,5,10))
      # pytorch cnn weight shape [output_channel,input_channel,height,width]
      # tf cnn weight shape [height,width,channel,filter]
      
      # Copy the weights
      weight = pt_conv2d.weight.detach().numpy().transpose(2,3,1,0)
      bias = pt_conv2d.bias.detach().numpy()
      tf_conv2d.set_weights(weights=[weight,bias])
      
      # Compare
      x = np.random.randn(2,5,5,10).astype(np.float32)
      pt_x = torch.from_numpy(x).permute(0,3,1,2)
      tf_x = tf.constant(x)
      a = pt_conv2d(pt_x).permute(0,2,3,1)
      b = tf_conv2d(tf_x)
      compare_difference(a,b)
      # max difference is 2.980232238769531e-07
      # mean difference is 9.045470505952835e-08
      

      9、Conv2d same padding

      # Redefine a pytorch Conv2d that supports "same" padding
      class Conv2d(nn.Conv2d):
          def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                       padding=0, dilation=1, groups=1, bias=True):
              self.padding_type =None
              if padding =="same":
                  self.padding_type = "same"
                  padding =0
              super(Conv2d, self).__init__(
                  in_channels, out_channels, kernel_size, stride,padding, dilation,
                  groups, bias)
      
              nn.init.xavier_uniform_(self.weight)
      
          def forward(self, x):
              if self.padding_type=="same":
                  ih, iw = x.shape[-2:]
                  kh, kw = self.weight.shape[-2:]
                  oh = math.ceil(ih / self.stride[0])
                  ow = math.ceil(iw / self.stride[1])
                  pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
                  pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
                  if pad_h > 0 or pad_w > 0:
                      x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
              out = F.conv2d(x, self.weight, self.bias, self.stride,
                             self.padding, self.dilation, self.groups)
              return out
      
      # Build the models
      kernel_size = 3
      stride = 5
      pt_conv2d = nn.Conv2d(in_channels=3,out_channels=2,kernel_size=kernel_size,stride=stride,padding=0)
      pt_conv2d_same = Conv2d(in_channels=3,out_channels=2,kernel_size=kernel_size,stride=stride,padding="same")
      tf_conv2d = keras.layers.Conv2D(filters=2,kernel_size=kernel_size,padding="same",strides=stride)
      
      tf_conv2d.build(input_shape=(None,26,26,3))
      
      # Copy the weights
      weight = pt_conv2d.weight.data.numpy().transpose(2,3,1,0)
      bias = pt_conv2d.bias.data.numpy()
      tf_conv2d.set_weights(weights=[weight,bias])
      pt_conv2d_same.weight.data = pt_conv2d.weight.data
      pt_conv2d_same.bias.data = pt_conv2d.bias.data
      
      # Compare
      x = np.random.randn(2,26,26,3).astype(np.float32)
      pt_x = torch.from_numpy(x).permute(0,3,1,2)
      tf_x = tf.constant(x)
      # Manually pad the input for the native pytorch conv2d.
      pt_x_padded = pad(pt_x,kernel_size=kernel_size)
      pt_a = pt_conv2d(pt_x_padded).permute(0,2,3,1)
      pt_b = pt_conv2d_same(pt_x).permute(0,2,3,1)
      tf_c = tf_conv2d(tf_x)
      
      compare_difference(pt_a ,tf_c )
      # max difference is 1.341104507446289e-07
      # mean difference is 3.928370517769508e-08
      
      compare_difference(pt_b ,tf_c )
      # max difference is 1.341104507446289e-07
      # mean difference is 3.928370517769508e-08
      
      
      posted in Technical Discussion
    • Some handy libraries worth recommending [NLP]

      Some handy libraries:

      • 1. Models: transformers
        pip install transformers
        🤗 Transformers: State-of-the-art Natural Language Processing for Pytorch, TensorFlow, and JAX.
        This library has been around since 2018: it started out as pytorch_pretrained_bert, was later renamed pytorch_transformer, and finally became transformers.
        If you are new to pytorch and python, it is worth reading the pytorch_pretrained_bert code first, because today's transformers codebase is increasingly complex and layered, which is not very beginner-friendly.

      • 2. Training framework: pytorch-lightning
        pip install pytorch-lightning
        The lightweight PyTorch wrapper for high-performance AI research. Scale your models, not the boilerplate.
        This library separates research code from engineering code and structures PyTorch code so the data flow is easier to follow.
        The result is code that is easier to understand, less error-prone, and far less verbose, which is very friendly to AI researchers.

      • 3. Metrics: torchmetrics
        pip install torchmetrics
        Machine learning metrics for distributed, scalable PyTorch applications.
        TorchMetrics is a collection of 25+ PyTorch metric implementations with an easy-to-use API for building custom metrics. It offers: a standardized interface for better reproducibility, less boilerplate, automatic accumulation over batches, metrics optimized for distributed training, and automatic synchronization across devices.

      • 4. Datasets: datasets
        pip install datasets
        🤗 The largest hub of ready-to-use NLP datasets for ML models with fast, easy-to-use and efficient data manipulation tools.
        Built for large datasets: the Datasets library frees you from RAM limits; all datasets are memory-mapped with an efficient zero-serialization-cost backend (Apache Arrow).
        Smart caching: no need to reprocess a dataset twice.
        A lightweight, fast, transparent and pythonic API (multiprocessing / caching / memory-mapping).
        Interoperability: built-in compatibility with NumPy, pandas, PyTorch, TensorFlow 2 and JAX. (A short usage sketch follows this list.)

      总结:

      • 关于数据读取和处理可以使用huggingface的datasets库进行高效的处理。
      • 关于训练框架可以使用pytorch-lightning库。
      • 关于指标可以使用torchmetric库。
      • 关于模型可以使用transformers库(预训练模型相关)。

      TODO:

      接下来的几篇文章将分别介绍这四个库的使用方法,最终将结合使用这四个库完成一个文本分类任务。
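
      在此之前,先贴一个把这四个库串起来的最小示意(仅是草图:模型名、数据集名只是举例,torchmetrics / pytorch-lightning 的接口在不同版本间略有差异,请以各库当前文档为准):

      import pytorch_lightning as pl
      import torch
      import torchmetrics
      from datasets import load_dataset
      from torch.utils.data import DataLoader
      from transformers import AutoModelForSequenceClassification, AutoTokenizer
      
      model_name = "bert-base-uncased"   # 仅作示例,可替换成任意预训练模型
      tokenizer = AutoTokenizer.from_pretrained(model_name)
      
      # 1. datasets:加载并编码一个文本分类数据集(这里拿 imdb 举例)
      raw = load_dataset("imdb")
      def encode(batch):
          return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)
      dataset = raw.map(encode, batched=True)
      dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
      
      # 2 + 3. pytorch-lightning + torchmetrics:把模型和指标封装进 LightningModule
      class TextClassifier(pl.LightningModule):
          def __init__(self):
              super().__init__()
              self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
              # 旧版本 torchmetrics 可直接写 torchmetrics.Accuracy()
              self.acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
      
          def training_step(self, batch, batch_idx):
              out = self.model(input_ids=batch["input_ids"],
                               attention_mask=batch["attention_mask"],
                               labels=batch["label"])
              self.log("train_loss", out.loss)
              return out.loss
      
          def validation_step(self, batch, batch_idx):
              out = self.model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
              self.acc(out.logits.argmax(dim=-1), batch["label"])
              self.log("val_acc", self.acc, prog_bar=True)
      
          def configure_optimizers(self):
              return torch.optim.AdamW(self.parameters(), lr=2e-5)
      
      train_loader = DataLoader(dataset["train"], batch_size=16, shuffle=True)
      val_loader = DataLoader(dataset["test"], batch_size=16)
      
      # 旧版本 pytorch-lightning 可用 pl.Trainer(gpus=1, ...)
      trainer = pl.Trainer(max_epochs=1, accelerator="auto", devices=1)
      trainer.fit(TextClassifier(), train_loader, val_loader)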

      posted in 技术交流
      183****0229
    • 【1】使用Captum库解释BERT模型

      Captum是PyTorch的模型可解释性和理解库。Captum在拉丁语中表示理解,包含PyTorch模型的集成梯度、显著图、平滑、vargrad等通用实现。它可以快速集成使用特定于领域的库(如torchvision、torchtext等)构建的模型。

      https://github.com/pytorch/captum/blob/master/tutorials/Bert_SQUAD_Interpret.ipynb

      Interpreting BERT Models (Part 1)

      In this notebook we demonstrate how to interpret Bert models using Captum library. In this particular case study we focus on a fine-tuned Question Answering model on SQUAD dataset using transformers library from Hugging Face: https://huggingface.co/transformers/

      We show how to use interpretation hooks to examine and better understand embeddings, sub-embeddings, bert, and attention layers.

      Note: Before running this tutorial, please install seaborn, pandas and matplotlib, transformers(from hugging face, tested on transformer version 4.3.0.dev0) python packages.

      在本笔记本中,我们演示了如何使用 Captum 库解释 Bert 模型。 在这个特殊的案例研究中,我们专注于使用 Hugging Face 的transformers库对 SQUAD 数据集进行微调的问答模型:https://huggingface.co/transformers/

      我们展示了如何使用interpretation hooks来检查和更好地理解embeddings、sub-embeddings、bert 和attention层。

      注意:在运行本教程之前,请安装seaborn、pandas 和matplotlib、transformers(来自hugging face,在transformer 版本4.3.0.dev0 上测试)python 包。

      import numpy as np
      import pandas as pd
      import seaborn as sns
      import matplotlib.pyplot as plt
      
      import torch
      import torch.nn as nn
      
      from transformers import BertTokenizer, BertForQuestionAnswering, BertConfig
      
      from captum.attr import visualization as viz
      from captum.attr import LayerConductance, LayerIntegratedGradients
      
      device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
      

      The first step is to fine-tune a BERT model on the SQUAD dataset. This can be easily accomplished by following the steps described on Hugging Face’s official web site: https://github.com/huggingface/transformers#run_squadpy-fine-tuning-on-squad-for-question-answering

      Note that the fine-tuning is done on a bert-base-cased pre-trained model.

      第一步是在 SQUAD 数据集上微调 BERT 模型。 这可以通过遵循hugging face官方网站中描述的步骤轻松完成:https://github.com/huggingface/transformers#run_squadpy-fine-tuning-on-squad-for-question-answering

      请注意,微调是在bert-base-cased预训练模型上完成的。

      After we pretrain the model, we can load the tokenizer and pre-trained BERT model using the commands described below.

      在我们预训练模型之后,我们可以使用下面描述的命令加载tokenizer和预训练的 BERT 模型。

      # replace <PATH-TO-SAVED-MODEL> with the real path of the saved model
      model_path = 'deepset/bert-base-cased-squad2'
      
      # load model
      model = BertForQuestionAnswering.from_pretrained(model_path)
      model.to(device)
      model.eval()
      model.zero_grad()
      
      # load tokenizer
      tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=False)
      

      A helper function to perform forward pass of the model and make predictions.

      一个辅助函数,用于执行模型的前向传递并进行预测。

      def predict(inputs, token_type_ids=None, position_ids=None, attention_mask=None):
          output = model(inputs, token_type_ids=token_type_ids,
                       position_ids=position_ids, attention_mask=attention_mask, )
          return output.start_logits, output.end_logits
      

      Defining a custom forward function that will allow us to access the start and end positions of our prediction using the position input argument.

      定义一个自定义forward函数,该函数允许我们使用 position 输入参数访问预测的开始和结束位置。

      def squad_pos_forward_func(inputs, token_type_ids=None, position_ids=None, attention_mask=None, position=0):
          pred = predict(inputs,
                         token_type_ids=token_type_ids,
                         position_ids=position_ids,
                         attention_mask=attention_mask)
          pred = pred[position]
          return pred.max(1).values
      
      

      Let’s compute attributions with respect to the BertEmbeddings layer.

      To do so, we need to define baselines / references, numericalize both the baselines and the inputs. We will define helper functions to achieve that.

      The cell below defines numericalized special tokens that will be later used for constructing inputs and corresponding baselines/references.

      让我们计算关于BertEmbeddings层的属性。

      为此,我们需要定义baselines/references,将baselines和inputs数值化。 我们将定义辅助函数来实现这一点。

      下面的单元格定义了数字化的special tokens,稍后将用于构建输入和相应的baselines/references。

      ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
      sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
      cls_token_id = tokenizer.cls_token_id # A token used for prepending to the concatenated question-text word sequence
      

      Below we define a set of helper function for constructing references / baselines for word tokens, token types and position ids. We also provide separate helper functions that allow to construct attention masks and bert embeddings both for input and reference.

      下面我们定义了一组辅助函数,用于为word tokens、token types和position ids构建references/baselines。 我们还提供了单独的辅助函数,允许为input和reference构建attention masks和 bert embeddings。

      def construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id):
          question_ids = tokenizer.encode(question, add_special_tokens=False)
          text_ids = tokenizer.encode(text, add_special_tokens=False)
      
          # construct input token ids
          input_ids = [cls_token_id] + question_ids + [sep_token_id] + text_ids + [sep_token_id]
      
          # construct reference token ids 
          ref_input_ids = [cls_token_id] + [ref_token_id] * len(question_ids) + [sep_token_id] + \
              [ref_token_id] * len(text_ids) + [sep_token_id]
      
          return torch.tensor([input_ids], device=device), torch.tensor([ref_input_ids], device=device), len(question_ids)
      
      def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
          seq_len = input_ids.size(1)
          token_type_ids = torch.tensor([[0 if i <= sep_ind else 1 for i in range(seq_len)]], device=device)
          ref_token_type_ids = torch.zeros_like(token_type_ids, device=device)# * -1
          return token_type_ids, ref_token_type_ids
      
      def construct_input_ref_pos_id_pair(input_ids):
          seq_length = input_ids.size(1)
          position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
          # we could potentially also use random permutation with `torch.randperm(seq_length, device=device)`
          ref_position_ids = torch.zeros(seq_length, dtype=torch.long, device=device)
      
          position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
          ref_position_ids = ref_position_ids.unsqueeze(0).expand_as(input_ids)
          return position_ids, ref_position_ids
          
      def construct_attention_mask(input_ids):
          return torch.ones_like(input_ids)
      
      def construct_whole_bert_embeddings(input_ids, ref_input_ids, \
                                          token_type_ids=None, ref_token_type_ids=None, \
                                          position_ids=None, ref_position_ids=None):
          input_embeddings = model.bert.embeddings(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
          ref_input_embeddings = model.bert.embeddings(ref_input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
          
          return input_embeddings, ref_input_embeddings
      
      

      Let’s define the question - text pair that we’d like to use as an input for our Bert model and interpret what the model was focusing on when predicting an answer to the question from the given input text.

      让我们定义question-text对,我们希望将其用作 Bert 模型的输入,并在从给定的输入文本预测问题的答案时解释模型所强调的内容

      question, text = "What is important to us?", "It is important to us to include, empower and support humans of all kinds."
      

      Let’s numericalize the question, the input text and generate corresponding baselines / references for all three sub-embeddings (word, token type and position embeddings) types using our helper functions defined above.

      让我们使用上面定义的辅助函数对问题、输入文本进行数值化,并为所有三个子嵌入(词、标记类型和位置嵌入)类型生成相应的基线/参考。

      input_ids, ref_input_ids, sep_id = construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id)
      token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(input_ids, sep_id)
      position_ids, ref_position_ids = construct_input_ref_pos_id_pair(input_ids)
      attention_mask = construct_attention_mask(input_ids)
      
      indices = input_ids[0].detach().tolist()
      all_tokens = tokenizer.convert_ids_to_tokens(indices)
      

      Also, let’s define the ground truth for prediction’s start and end positions.

      此外,让我们为预测的开始和结束位置定义真实标签。

      ground_truth = 'to include, empower and support humans of all kinds'
      
      ground_truth_tokens = tokenizer.encode(ground_truth, add_special_tokens=False)
      ground_truth_end_ind = indices.index(ground_truth_tokens[-1])
      ground_truth_start_ind = ground_truth_end_ind - len(ground_truth_tokens) + 1
      

      Now let’s make predictions using input, token type, position id and a default attention mask.

      现在让我们使用input、token type、position id和默认的attention mask进行预测。

      start_scores, end_scores = predict(input_ids, \
                                         token_type_ids=token_type_ids, \
                                         position_ids=position_ids, \
                                         attention_mask=attention_mask)
      
      
      print('Question: ', question)
      print('Predicted Answer: ', ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
      
      Question:  What is important to us?
      Predicted Answer:  to include , em ##power and support humans of all kinds
      

      There are two different ways of computing the attributions for embedding layers. One option is to use LayerIntegratedGradients and compute the attributions with respect to BertEmbedding. The second option is to use LayerIntegratedGradients for each word_embeddings, token_type_embeddings and position_embeddings and compute the attributions w.r.t each embedding vector.

      有两种不同的方法可以计算embedding layers的属性。 一种选择是使用 LayerIntegratedGradients 并计算关于 BertEmbedding 的属性。 第二个选项是对每个 word_embeddings、token_type_embeddings 和 position_embeddings 使用 LayerIntegratedGradients,并计算每个嵌入向量的属性。

      lig = LayerIntegratedGradients(squad_pos_forward_func, model.bert.embeddings)
      
      attributions_start, delta_start = lig.attribute(inputs=input_ids,
                                        baselines=ref_input_ids,
                                        additional_forward_args=(token_type_ids, position_ids, attention_mask, 0),
                                        return_convergence_delta=True)
      attributions_end, delta_end = lig.attribute(inputs=input_ids, baselines=ref_input_ids,
                                      additional_forward_args=(token_type_ids, position_ids, attention_mask, 1),
                                      return_convergence_delta=True)
      

      A helper function to summarize attributions for each word token in the sequence.

      一个辅助函数,用于提取序列中每个单词标记的属性。

      def summarize_attributions(attributions):
          attributions = attributions.sum(dim=-1).squeeze(0)
          attributions = attributions / torch.norm(attributions)
          return attributions
      
      attributions_start_sum = summarize_attributions(attributions_start)
      attributions_end_sum = summarize_attributions(attributions_end)
      
      # storing couple samples in an array for visualization purposes
      start_position_vis = viz.VisualizationDataRecord(
                              attributions_start_sum,
                              torch.max(torch.softmax(start_scores[0], dim=0)),
                              torch.argmax(start_scores),
                              torch.argmax(start_scores),
                              str(ground_truth_start_ind),
                              attributions_start_sum.sum(),       
                              all_tokens,
                              delta_start)
      
      end_position_vis = viz.VisualizationDataRecord(
                              attributions_end_sum,
                              torch.max(torch.softmax(end_scores[0], dim=0)),
                              torch.argmax(end_scores),
                              torch.argmax(end_scores),
                              str(ground_truth_end_ind),
                              attributions_end_sum.sum(),       
                              all_tokens,
                              delta_end)
      
      print('\033[1m', 'Visualizations For Start Position', '\033[0m')
      viz.visualize_text([start_position_vis])
      
      print('\033[1m', 'Visualizations For End Position', '\033[0m')
      viz.visualize_text([end_position_vis])
      

      From the results above we can tell that for predicting start position our model is focusing more on the question side. More specifically on the tokens what and important. It has also slight focus on the token sequence to us in the text side.

      In contrast to that, for predicting end position, our model focuses more on the text side and has relative high attribution on the last end position token kinds.

      从上面的结果我们可以看出,为了预测开始位置,我们的模型更多地关注问题方面。 更具体地说,关于tokens what和important。 它还略微关注文本侧的标记序列to us。

      与此相反,对于预测结束位置,我们的模型更侧重于文本方面,并且对最后结束位置标记kinds具有相对较高的归因。

      Multi-Embedding attribution

      Now let’s look into the sub-embeddings of BertEmbeddings and try to understand the contributions and roles of each of them for both start and end predicted positions.

      To do so, we will use LayerIntegratedGradients for all three layer: word_embeddings, token_type_embeddings and position_embeddings.

      现在让我们看看BertEmbeddings的sub-embeddings,并尝试了解每个sub-embeddings对开始和结束预测位置的贡献和作用。

      为此,我们将对所有三层使用LayerIntegratedGradients:word_embeddings、token_type_embeddings和position_embeddings。

      Now let’s create an instance of LayerIntegratedGradients and compute the attributions with respect to all those embeddings both for the start and end positions and summarize them for each word token.

      现在让我们创建一个 LayerIntegratedGradients 的实例,并计算所有这些嵌入的开始和结束位置的属性,并为每个单词标记总结它们。

      lig2 = LayerIntegratedGradients(squad_pos_forward_func, \
                                      [model.bert.embeddings.word_embeddings, \
                                       model.bert.embeddings.token_type_embeddings, \
                                       model.bert.embeddings.position_embeddings])
      
      attributions_start = lig2.attribute(inputs=(input_ids, token_type_ids, position_ids),
                                        baselines=(ref_input_ids, ref_token_type_ids, ref_position_ids),
                                        additional_forward_args=(attention_mask, 0))
      attributions_end = lig2.attribute(inputs=(input_ids, token_type_ids, position_ids),
                                        baselines=(ref_input_ids, ref_token_type_ids, ref_position_ids),
                                        additional_forward_args=(attention_mask, 1))
      
      attributions_start_word = summarize_attributions(attributions_start[0])
      attributions_end_word = summarize_attributions(attributions_end[0])
      
      attributions_start_token_type = summarize_attributions(attributions_start[1])
      attributions_end_token_type = summarize_attributions(attributions_end[1])
      
      attributions_start_position = summarize_attributions(attributions_start[2])
      attributions_end_position = summarize_attributions(attributions_end[2])
      
      
      /usr/local/lib/python3.8/dist-packages/captum/attr/_core/layer/layer_integrated_gradients.py:103: UserWarning: Multiple layers provided. Please ensure that each layer is **not** solely dependent on the outputs of another layer. Please refer to the documentation for more detail.
        warnings.warn(
      

      An auxiliary function that will help us to compute the topk attributions and corresponding indices.

      一个辅助函数,将帮助我们计算 topk 属性和相应的索引

      def get_topk_attributed_tokens(attrs, k=5):
          values, indices = torch.topk(attrs, k)
          top_tokens = [all_tokens[idx] for idx in indices]
          return top_tokens, values, indices
      

      Removing interpretation hooks from all layers after finishing attribution.

      完成attribution后,从所有层中删除interpretation hooks。

      Computing topk attributions for all sub-embeddings and placing them in pandas dataframes for better visualization.

      计算所有sub-embeddings的 topk 属性并将它们放置在 Pandas 数据框中以获得更好的可视化。

      top_words_start, top_words_val_start, top_word_ind_start = get_topk_attributed_tokens(attributions_start_word)
      top_words_end, top_words_val_end, top_words_ind_end = get_topk_attributed_tokens(attributions_end_word)
      
      top_token_type_start, top_token_type_val_start, top_token_type_ind_start = get_topk_attributed_tokens(attributions_start_token_type)
      top_token_type_end, top_token_type_val_end, top_token_type_ind_end = get_topk_attributed_tokens(attributions_end_token_type)
      
      top_pos_start, top_pos_val_start, pos_ind_start = get_topk_attributed_tokens(attributions_start_position)
      top_pos_end, top_pos_val_end, pos_ind_end = get_topk_attributed_tokens(attributions_end_position)
      
      df_start = pd.DataFrame({'Word(Index), Attribution': ["{} ({}), {}".format(word, pos, round(val.item(),2)) for word, pos, val in zip(top_words_start, top_word_ind_start, top_words_val_start)],
                         'Token Type(Index), Attribution': ["{} ({}), {}".format(ttype, pos, round(val.item(),2)) for ttype, pos, val in zip(top_token_type_start, top_token_type_ind_start, top_token_type_val_start)],
                         'Position(Index), Attribution': ["{} ({}), {}".format(position, pos, round(val.item(),2)) for position, pos, val in zip(top_pos_start, pos_ind_start, top_pos_val_start)]})
      df_start.style.apply(['cell_ids: False'])
      
      df_end = pd.DataFrame({'Word(Index), Attribution': ["{} ({}), {}".format(word, pos, round(val.item(),2)) for word, pos, val in zip(top_words_end, top_words_ind_end, top_words_val_end)],
                         'Token Type(Index), Attribution': ["{} ({}), {}".format(ttype, pos, round(val.item(),2)) for ttype, pos, val in zip(top_token_type_end, top_token_type_ind_end, top_token_type_val_end)],
                         'Position(Index), Attribution': ["{} ({}), {}".format(position, pos, round(val.item(),2)) for position, pos, val in zip(top_pos_end, pos_ind_end, top_pos_val_end)]})
      df_end.style.apply(['cell_ids: False'])
      
      ['{}({})'.format(token, str(i)) for i, token in enumerate(all_tokens)]
      
      ['[CLS](0)',
       'What(1)',
       'is(2)',
       'important(3)',
       'to(4)',
       'us(5)',
       '?(6)',
       '[SEP](7)',
       'It(8)',
       'is(9)',
       'important(10)',
       'to(11)',
       'us(12)',
       'to(13)',
       'include(14)',
       ',(15)',
       'em(16)',
       '##power(17)',
       'and(18)',
       'support(19)',
       'humans(20)',
       'of(21)',
       'all(22)',
       'kinds(23)',
       '.(24)',
       '[SEP](25)']
      

      Below we can see top 5 attribution results from all three embedding types in predicting start positions.

      下面我们可以看到所有三种嵌入类型在预测起始位置时的前 5 个归因结果。

      Top 5 attributed embeddings for start position

      Word embeddings help to focus more on the surrounding tokens of the predicted answer’s start position to such as em, ##power and ,. It also has high attribution for the tokens in the question such as what and ?.

      In contrast to word embedding, token embedding type focuses more on the tokens in the text part such as important, em and start token to.

      Position embedding also has high attribution score for the tokens surrounding to such as us and important. In addition to that, similar to word embedding we observe important tokens from the question.

      We can perform similar analysis, and visualize top 5 attributed tokens for all three embedding types, also for the end position prediction.

      词嵌入有助于更多地关注预测答案的起始位置的周围标记,例如 em、##power 和 ,。 它对问题中的令牌也有很高的归因,例如什么和?。

      与词嵌入相比,标记嵌入类型更关注文本部分中的标记,例如important,em 和start token to。

      位置嵌入对于诸如 us 和 important 之类的标记也具有很高的归因分数。 除此之外,与词嵌入类似,我们从问题中观察到重要的标记。

      我们可以执行类似的分析,并将所有三种嵌入类型的前 5 个属性标记可视化,也可以用于结束位置预测。

      Top 5 attributed embeddings for end position

      It is interesting to observe high concentration of highly attributed tokens such as of, kinds, support and ##power for end position prediction.

      The token kinds, which is the correctly predicted token, appears to have a high attribution score according to both word and position embeddings.

      观察高度集中的高属性标记(例如用于结束位置预测的of、kinds, support 和 ##power)很有趣。

      根据词嵌入和位置嵌入,作为正确预测的token,kinds似乎具有较高的属性得分。

      Interpreting Bert Layers

      Now let’s look into the layers of our network. More specifically we would like to look into the distribution of attribution scores for each token across all layers in Bert model and dive deeper into specific tokens.
      We do that using one of layer attribution algorithms, namely, layer conductance. However, we encourage you to try out and compare the results with other algorithms as well.

      现在让我们看看我们的网络层。 更具体地说,我们希望研究 Bert 模型中所有层中每个token的归因分数分布,并深入研究特定的tokens。
      我们使用层归因算法之一来做到这一点,即layer conductance。 但是,我们鼓励您尝试并将结果与其他算法进行比较。

      Let’s define another version of the squad forward function that takes embeddings as an input argument. This is necessary for the LayerConductance algorithm.

      让我们定义另一个版本的forward函数,它将embeddings作为输入参数。 这对于LayerConductance算法是必要的。

      def squad_pos_forward_func2(input_emb, attention_mask=None, position=0):
          pred = model(inputs_embeds=input_emb, attention_mask=attention_mask, )
          pred = pred[position]
          return pred.max(1).values
      

      Let’s iterate over all layers and compute the attributions for all tokens. In addition to that let’s also choose a specific token that we would like to examine in detail, specified by an id token_to_explain and store related information in a separate array.

      Note: Since below code is iterating over all layers it can take over 5 seconds. Please be patient!

      让我们迭代所有层并计算所有tokens的属性。 除此之外,让我们还选择一个我们想要详细检查的特定token,由 id token_to_explain 指定,并将相关信息存储在单独的数组中。

      注意:由于下面的代码迭代所有层,因此可能需要 5 秒以上。 请耐心等待!

      layer_attrs_start = []
      layer_attrs_end = []
      
      # The token that we would like to examine separately.
      token_to_explain = 23 # the index of the token that we would like to examine more thoroughly
      layer_attrs_start_dist = []
      layer_attrs_end_dist = []
      
      input_embeddings, ref_input_embeddings = construct_whole_bert_embeddings(input_ids, ref_input_ids, \
                                               token_type_ids=token_type_ids, ref_token_type_ids=ref_token_type_ids, \
                                               position_ids=position_ids, ref_position_ids=ref_position_ids)
      
      for i in range(model.config.num_hidden_layers):
          lc = LayerConductance(squad_pos_forward_func2, model.bert.encoder.layer[i])
          layer_attributions_start = lc.attribute(inputs=input_embeddings, baselines=ref_input_embeddings, additional_forward_args=(attention_mask, 0))
          layer_attributions_end = lc.attribute(inputs=input_embeddings, baselines=ref_input_embeddings, additional_forward_args=(attention_mask, 1))
          layer_attrs_start.append(summarize_attributions(layer_attributions_start).cpu().detach().tolist())
          layer_attrs_end.append(summarize_attributions(layer_attributions_end).cpu().detach().tolist())
      
          # storing attributions of the token id that we would like to examine in more detail in token_to_explain
          layer_attrs_start_dist.append(layer_attributions_start[0,token_to_explain,:].cpu().detach().tolist())
          layer_attrs_end_dist.append(layer_attributions_end[0,token_to_explain,:].cpu().detach().tolist())
      
      

      The plot below represents a heat map of attributions across all layers and tokens for the start position prediction.
      It is interesting to observe that the question word what gains increasingly high attribution from layer one to layer nine. In the last three layers that importance is slowly diminishing.
      In contrast to the what token, many other tokens have negative or close to zero attribution in the first 6 layers.

      We start seeing slightly higher attribution for the tokens important, us and to. Interestingly, the token em is also assigned a high attribution score, which is remarkably high in the last three layers.
      And lastly, our correctly predicted token to for the start position gains increasingly positive attribution and has relatively high attribution especially in the last two layers.

      下图代表了开始位置预测的所有层和标记的属性热图。
      有趣的是,问题词what从第一层到第九层获得越来越高的归因。 在最后三层中,重要性正在慢慢降低。
      与what令牌相反,许多其他令牌在前 6 层中具有负面或接近于零的属性。

      我们开始看到标记important、us和to的归因略高。 有趣的是,标记em也被分配了高归因分数,这在最后三层非常高。
      最后,我们正确预测的标记to的起始位置增益越来越积极的归因具有相对较高的归因,尤其是在最后两层。

      fig, ax = plt.subplots(figsize=(15,5))
      xticklabels=all_tokens
      yticklabels=list(range(1,13))
      ax = sns.heatmap(np.array(layer_attrs_start), xticklabels=xticklabels, yticklabels=yticklabels, linewidth=0.2)
      plt.xlabel('Tokens')
      plt.ylabel('Layers')
      plt.show()
      

      Now let’s examine the heat map of the attributions for the end position prediction. In the case of end position prediction we again observe high attribution scores for the token what in the last 11 layers.
      The correctly predicted end token kinds has positive attribution across all layers and it is especially prominent in the last two layers.

      现在让我们检查结束位置预测的属性热图。 在结束位置预测的情况下,我们再次观察到标记what在最后 11 层中的高归因分数。
      正确预测的结束标记kinds在所有层中都具有正属性,并且在最后两层中尤为突出。

      fig, ax = plt.subplots(figsize=(15,5))
      
      xticklabels=all_tokens
      yticklabels=list(range(1,13))
      ax = sns.heatmap(np.array(layer_attrs_end), xticklabels=xticklabels, yticklabels=yticklabels, linewidth=0.2) #, annot=True
      plt.xlabel('Tokens')
      plt.ylabel('Layers')
      
      plt.show()
      

      It is interesting to note that when we compare the heat maps of start and end position, overall the colors for start position prediction on the map have darker intensities. This implies that there are less tokens that attribute positively to the start position prediction and there are more tokens which are negative indicators or signals of start position prediction.

      有趣的是,当我们比较开始和结束位置的热图时,总体上地图上开始位置预测的颜色具有更暗的强度。 这意味着正面归因于开始位置预测的tokens较少,而作为开始位置预测的负面指标或信号的tokens较多。

      Now let’s dig deeper into specific tokens and look into the distribution of attributions per layer for the token kinds in the start and end positions. The box plot diagram below shows the presence of outliers especially in the first four layers and in layer 8. We also observe that for start position prediction the interquartile range slowly decreases as we go deeper into the layers and finally diminishes.

      现在让我们深入研究特定的标记,并查看每层在开始和结束位置的标记 kinds 的属性分布。 下面的箱线图显示了异常值的存在,尤其是在前四层和第 8 层。我们还观察到,开始位置预测的四分位间距随着我们深入层而缓慢减小,最后逐渐减小。

      fig, ax = plt.subplots(figsize=(20,10))
      ax = sns.boxplot(data=layer_attrs_start_dist)
      plt.xlabel('Layers')
      plt.ylabel('Attribution')
      plt.show()
      

      Now let’s plot same distribution but for the prediction of the end position. Here attribution has larger positive values across all layers and the interquartile range doesn’t change much when moving deeper into the layers.

      现在让我们绘制相同的分布,但用于预测结束位置。 在这里,归因在所有层中都有更大的正值,当深入到层中时,四分位距变化不大。

      fig, ax = plt.subplots(figsize=(20,10))
      ax = sns.boxplot(data=layer_attrs_end_dist)
      plt.xlabel('Layers')
      plt.ylabel('Attribution')
      plt.show()
      

      In addition to that we can also look into the distribution of attributions in each layer for any input token. This will help us to better understand and compare the distributional patterns of attributions across multiple layers. We can for example represent attributions as a probability density function (pdf) and compute the entropy of it in order to estimate the entropy of attributions in each layer. This can be easily computed using a histogram.

      除此之外,我们还可以查看任何输入令牌在每一层中的属性分布。 这将帮助我们更好地理解和比较跨多个层的归因分布模式。 例如,我们可以将属性表示为概率密度函数 (pdf) 并计算它的熵以估计每一层中属性的熵。 这可以使用直方图轻松计算。

      def pdf_attr(attrs, bins=100):
          return np.histogram(attrs, bins=bins, density=True)[0]
      

      In this particular case let’s compute the pdf for the attributions at end positions kinds. We can however do it for all tokens.

      We will compute and visualize the pdfs and entropies using Shannon’s Entropy measure for each layer for token kinds.

      在这种特殊情况下,让我们计算结束位置kinds的属性的 pdf。 但是,我们可以为所有令牌执行此操作。

      我们将使用香农的熵度量来计算和可视化 pdf 和熵,用于标记kinds的每一层。

      layer_attrs_end_pdf = map(lambda layer_attrs_end_dist: pdf_attr(layer_attrs_end_dist), layer_attrs_end_dist)
      layer_attrs_end_pdf = np.array(list(layer_attrs_end_pdf))
      
      # summing attribution along embedding dimension for each layer
      # size: #layers
      attr_sum = np.array(layer_attrs_end_dist).sum(-1)
      
      # size: #layers
      layer_attrs_end_pdf_norm = np.linalg.norm(layer_attrs_end_pdf, axis=-1, ord=1)
      
      #size: #bins x #layers
      layer_attrs_end_pdf = np.transpose(layer_attrs_end_pdf)
      
      #size: #bins x #layers
      layer_attrs_end_pdf = np.divide(layer_attrs_end_pdf, layer_attrs_end_pdf_norm, where=layer_attrs_end_pdf_norm!=0)
      

      The plot below visualizes the probability mass function (pmf) of attributions for each layer for the end position token kinds. From the plot we can observe that the distributions are taking bell-curved shapes with different means and variances.
      We can now use attribution pdfs to compute entropies in the next cell.

      下图显示了终端位置标记kinds的每一层属性的概率质量函数 (pmf)。 从图中我们可以观察到分布呈钟形曲线,具有不同的均值和方差。
      我们现在可以使用归因 pdf 来计算下一个单元格中的熵。

      fig, ax = plt.subplots(figsize=(20,10))
      plt.plot(layer_attrs_end_pdf)
      plt.xlabel('Bins')
      plt.ylabel('Density')
      plt.legend(['Layer '+ str(i) for i in range(1,13)])
      plt.show()
      

      Below we calculate and visualize attribution entropies based on Shannon entropy measure where the x-axis corresponds to the number of layers and the y-axis corresponds to the total attribution in that layer. The size of the circles for each (layer, total_attribution) pair correspond to the normalized entropy value at that point.

      In this particular example, we observe that the entropy doesn’t change much from layer to layer, however in a general case entropy can provide us an intuition about the distributional characteristics of attributions in each layer and can be useful especially when comparing it across multiple tokens.

      下面我们基于Shannon熵度量计算和可视化属性熵,其中x轴对应层数,y轴对应该层的总属性。每个(layer, total_attribution)对的圆的大小对应于该点的归一化熵值。

      在这个特定的例子中,我们观察到,熵在每一层之间变化不大,但是在一般情况下,熵可以为我们提供关于属性在每一层的分布特征的直观认识,尤其是在跨多个标记进行比较时,熵非常有用。

      fig, ax = plt.subplots(figsize=(20,10))
      
      # replacing 0s with 1s. np.log(1) = 0 and np.log(0) = -inf
      layer_attrs_end_pdf[layer_attrs_end_pdf == 0] = 1
      layer_attrs_end_pdf_log = np.log2(layer_attrs_end_pdf)
      
      # size: #layers
      entropies= -(layer_attrs_end_pdf * layer_attrs_end_pdf_log).sum(0)
      
      plt.scatter(np.arange(12), attr_sum, s=entropies * 100)
      plt.xlabel('Layers')
      plt.ylabel('Total Attribution')
      plt.show()
      

      In Part 2 of this tutorial we will go deeper into the attention layers and heads, compare the attributions with the attention weight matrices, and study and discuss related statistics.

      在本教程的第2部分中,我们将深入研究注意力层、头部,并将其属性与注意力权重矩阵进行比较,学习和讨论相关的统计数据。






      posted in 技术交流
      183****0229
    • V100训练初体验【VS 3090】

      今天平台里上新了V100的服务器,为了尝尝鲜,于是乎我决定在V100上训练一个文本分类任务。

      配置信息对比

      配置 3090 V100
      价格 4.5(可使用代金券) 5.5(不可使用代金券)
      内存 64G / GPU 64G / GPU
      CPU 8核 / GPU 8核 / GPU
      网络 下行带宽:50MB/s 下行带宽:10MB/s
      显存 24 GB 32GB
      硬盘 1.6T SSD 800.0G HDD
      有无nas 有 无

      可以发现V100虽然显存比3090大,但是性价比不如3090高(不可使用代金券,下行速度慢,硬盘不是SSD,没有Nas)。

      V100 训练(使用fp16)

      可以看到训练batch size 能调到32,此时使用率将近100%


      RTX3090 训练(使用fp16)

      而3090,batch size则不能调到32,会出现OOM,只能调到24,使用率也将近100%
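
      文中没有贴训练脚本;如果想复现类似的 fp16 混合精度训练,一种常见写法是用 torch.cuda.amp(下面只是一个自包含的小示意,模型和数据都是随手造的占位):

      import torch
      import torch.nn as nn
      
      model = nn.Linear(128, 2).cuda()                      # 占位模型,实际换成自己的网络
      optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
      criterion = nn.CrossEntropyLoss()
      scaler = torch.cuda.amp.GradScaler()                  # fp16 梯度缩放
      
      for step in range(10):
          x = torch.randn(32, 128, device="cuda")
          y = torch.randint(0, 2, (32,), device="cuda")
          optimizer.zero_grad()
          with torch.cuda.amp.autocast():                   # 前向计算自动使用半精度
              loss = criterion(model(x), y)
          scaler.scale(loss).backward()                     # 对缩放后的 loss 反向传播
          scaler.step(optimizer)
          scaler.update()

      如果用的是 transformers 的 Trainer,在 TrainingArguments 里设置 fp16=True 也能达到同样的效果。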

      结果对比

      指标 3090 V100
      训练时间 39m38s 35m20s
      accuracy 64.39 68.61

      Tips:

      这是个不严谨的测试。请自行判断结果是否可信。

      posted in 技术交流
      183****0229

    Latest posts made by 183****0229

    • RE: 删除nas数据

      @180-9163 ➕1,我好不容易删除了,已经扣了我几百多rmb了,太坑了

      posted in 不懂就问❓在线等
      183****0229
    • RE: 【有奖话题NO.13】 天临四年,肝论文的炼丹师们,大家还好吗?

      分母你好,我是分子。

      posted in 有奖话题
      183****0229
    • RE: 【有奖话题NO.7】传说中的万能Transformer,你有用过吗?

      这题我不会,transformer is all you need!

      posted in 有奖话题
      183****0229
    • 【抢钱活动】PaddlePaddleHackathon

      飞桨黑客马拉松活动


      昨天偶然刷了下github的issue,突然发现多了很多奇怪的东西,于是就进去看了一下。
      发现又有新一轮的抢钱活动~

      乍一看发现是1万,5千,1千RMB。可仔细一看发现端倪,竟然是10块、5块和1块~(前端背锅,竟然用点号)。活动是昨天也就是9月23号开始的,还有好多题目可以做(好多钱可以抢),不多说了我继续去抢钱了~。

      posted in 聊一会吧
      183****0229
    • 【思考】Byte-Pair-Encoding tokenizer 的offset mapping实现

      huggingface的tokenizers库只有rust版本的get offset mapping,没有python版本的,因此我试着找个方法实现一下这个功能。

      准备

      安装pip install fastcore

      # 安装fastcore
      # !pip install fastcore
      
      from paddlenlp.transformers import XLNetTokenizer, GPTTokenizer
      from fastcore.all import patch_to
      from paddle.utils import try_import
      from transformers import XLNetTokenizerFast, GPT2TokenizerFast
      
      @patch_to(GPTTokenizer)
      def get_offset_mapping(self, text):
          token_mapping = []
          global_offsets = 0
          re = try_import("regex")
          for token in re.findall(self.pat, text):
              newtokens = ""
              char2bpe = []
              for char_index, each_element in enumerate(token):
                  for b in each_element.encode("utf-8"):
                      newtokens += self.byte_encoder[b]
                      char2bpe.append(char_index)
      
              cum_bpe_offset = 0
              for bpe_token in self.bpe(newtokens).split(" "):
                  start = newtokens.index(bpe_token) + cum_bpe_offset
                  end = start + len(bpe_token)
                  new_start = char2bpe[start] + global_offsets
                  new_end = char2bpe[end - 1] + global_offsets + 1
                  if bpe_token[0] == "Ġ":
                      new_start += 1
                  token_mapping.append((new_start, new_end))
                  cum_bpe_offset += len(bpe_token)
                  newtokens = newtokens[len(bpe_token) :]
      
              global_offsets += len(token)
      
          return token_mapping
      
      # 数据来自squad1.1
      data = {'id': '56f7c651aef2371900625bf5',
       'title': 'Martin_Luther',
       'context': "Martin Luther (/ˈluːθər/ or /ˈluːðər/; German: [ˈmaɐ̯tiːn ˈlʊtɐ] ( listen); 10 November 1483 – 18 February 1546) was a German professor of theology, composer, priest, former monk and a seminal figure in the Protestant Reformation. Luther came to reject several teachings and practices of the Late Medieval Catholic Church. He strongly disputed the claim that freedom from God's punishment for sin could be purchased with money. He proposed an academic discussion of the power and usefulness of indulgences in his Ninety-Five Theses of 1517. His refusal to retract all of his writings at the demand of Pope Leo X in 1520 and the Holy Roman Emperor Charles V at the Diet of Worms in 1521 resulted in his excommunication by the Pope and condemnation as an outlaw by the Emperor.",
       'question': 'Of what nationality was Martin Luther?',
       'answers': ['German', 'German', 'German'],
       'answer_starts': [39, 119, 119],
       'is_impossible': False}
      text = data["context"]
      
      # 实例化两个tokenizer(注:预训练权重名称仅作示例,请按实际环境调整)
      gpt2_pdtokenizer = GPTTokenizer.from_pretrained("gpt2-en")
      gpt2_hgtokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
      
      # 比较gpt2的offset mapping。
      # huggingface版本
      for a,b in zip(gpt2_hgtokenizer(text,return_offsets_mapping=True)["offset_mapping"],gpt2_hgtokenizer.tokenize(text)):
          print(text[a[0]:a[1]],"======",b)
      # paddle版本
      for a,b in zip(gpt2_pdtokenizer.get_offset_mapping(text),gpt2_pdtokenizer.tokenize(text)):
          print(text[a[0]:a[1]],"======",b)
      

      注:

      • gpt2有关空格的offset mapping,本方法与huggingface略有不同。
      • 可能存在其他问题,仅供参考~
      posted in 技术交流
      183****0229
    • 【思考】SentencePiece tokenizer 的offset mapping实现

      huggingface的tokenizers库只有rust版本的get offset mapping,没有python版本的,因此我试着找个方法实现一下这个功能。

      准备

      安装pip install fastcore

      # 安装fastcore
      # !pip install fastcore
      
      from paddlenlp.transformers import XLNetTokenizer, GPTTokenizer
      from fastcore.all import patch_to
      from paddle.utils import try_import
      from transformers import XLNetTokenizerFast, GPT2TokenizerFast
      
      SENTENCEPIECE_UNDERLINE = "▁"
      import unicodedata
      
      @patch_to(XLNetTokenizer)
      def get_offset_mapping(self, text):
          text = text.replace("``", '"').replace("''", '"')
          normalized_text, char_mapping = '', []
          for i, ch in enumerate(text):
              if not self.keep_accents:
                  ch = unicodedata.normalize("NFKD", ch)
                  if not unicodedata.combining(ch):
                      normalized_text += ch
                      char_mapping.extend([i] * len(ch))
              else:
                  normalized_text += ch
                  char_mapping.extend([i] * len(ch))
      
          if self.do_lower_case:
              normalized_text = normalized_text.lower()
      
          text, token_mapping, offset = normalized_text, [], 0
          split_tokens = self.tokenize(text)
          if split_tokens[0] == SENTENCEPIECE_UNDERLINE:
              split_tokens = split_tokens[1:]
              token_mapping.append((0,0))
      
          for token in split_tokens:
              if token[0] == SENTENCEPIECE_UNDERLINE:
                  token=token[1:]
      
              if len(token) == 0:
                  length = 1
              else:
                  length = len(token)
                  
              start = text[offset:].index(token) + offset
              end = start + length
      
              token_mapping.append(
                  (char_mapping[start], char_mapping[end - 1] + 1))            
              offset = end
      
          return token_mapping
      xlnet_pdtokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
      xlnet_hgtokenizer = XLNetTokenizerFast.from_pretrained("xlnet-base-cased")
      
      # 数据来自squad1.1
      data = {'id': '56f7c651aef2371900625bf5',
       'title': 'Martin_Luther',
       'context': "Martin Luther (/ˈluːθər/ or /ˈluːðər/; German: [ˈmaɐ̯tiːn ˈlʊtɐ] ( listen); 10 November 1483 – 18 February 1546) was a German professor of theology, composer, priest, former monk and a seminal figure in the Protestant Reformation. Luther came to reject several teachings and practices of the Late Medieval Catholic Church. He strongly disputed the claim that freedom from God's punishment for sin could be purchased with money. He proposed an academic discussion of the power and usefulness of indulgences in his Ninety-Five Theses of 1517. His refusal to retract all of his writings at the demand of Pope Leo X in 1520 and the Holy Roman Emperor Charles V at the Diet of Worms in 1521 resulted in his excommunication by the Pope and condemnation as an outlaw by the Emperor.",
       'question': 'Of what nationality was Martin Luther?',
       'answers': ['German', 'German', 'German'],
       'answer_starts': [39, 119, 119],
       'is_impossible': False}
      text = data["context"]
      
      # 比较xlnet的offset mapping。
      # huggingface版本
      for a,b in zip(xlnet_hgtokenizer(text,return_offsets_mapping=True)["offset_mapping"],xlnet_hgtokenizer.tokenize(text)):
          print(text[a[0]:a[1]],"======",b)
      # paddle版本
      for a,b in zip(xlnet_pdtokenizer.get_offset_mapping(text),xlnet_pdtokenizer.tokenize(text)):
          print(text[a[0]:a[1]],"======",b)
      

      注:

      • xlnet有关空格的offset mapping,本方法与huggingface略有不同。
      • 可能存在其他问题,仅供参考~
      posted in 技术交流
      183****0229
    • 【记录】pytorch_tabular

      地址:https://github.com/manujosephv/pytorch_tabular
      介绍:一个为表格数据构建深度学习模型的标准框架(基于 PyTorch 和 PyTorch Lightning)。
      PyTorch Tabular 旨在让表格形式数据的深度学习变得容易,并且可供现实案例和研究使用。
      安装: pip install pytorch_tabular[all]
      文档:https://pytorch-tabular.readthedocs.io/en/latest/
      可用模型:

      • FeedForward Network with Category Embedding is a simple FF network, but with an Embedding layers for the categorical columns.
      • Neural Oblivious Decision Ensembles for Deep Learning on Tabular Data is a model presented in ICLR 2020 and according to the authors have beaten well-tuned Gradient Boosting models on many datasets.
      • TabNet: Attentive Interpretable Tabular Learning is another model coming out of Google Research which uses Sparse Attention in multiple steps of decision making to model the output.
      • Mixture Density Networks is a regression model which uses gaussian components to approximate the target function and provide a probabilistic prediction out of the box.
      • AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks is a model which tries to learn interactions between the features in an automated way and create a better representation and then use this representation in downstream task
      • TabTransformer is an adaptation of the Transformer model for Tabular Data which creates contextual representations for categorical features.
      • FT Transformer from Revisiting Deep Learning Models for Tabular Data

      使用:

      from pytorch_tabular import TabularModel
      from pytorch_tabular.models import CategoryEmbeddingModelConfig
      from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
      
      data_config = DataConfig(
          target=['target'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
          continuous_cols=num_col_names,
          categorical_cols=cat_col_names,
      )
      trainer_config = TrainerConfig(
          auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
          batch_size=1024,
          max_epochs=100,
          gpus=1, #index of the GPU to use. 0, means CPU
      )
      optimizer_config = OptimizerConfig()
      
      model_config = CategoryEmbeddingModelConfig(
          task="classification",
          layers="1024-512-512",  # Number of nodes in each layer
          activation="LeakyReLU", # Activation between each layers
          learning_rate = 1e-3
      )
      
      tabular_model = TabularModel(
          data_config=data_config,
          model_config=model_config,
          optimizer_config=optimizer_config,
          trainer_config=trainer_config,
      )
      tabular_model.fit(train=train, validation=val)
      result = tabular_model.evaluate(test)
      pred_df = tabular_model.predict(test)
      tabular_model.save_model("examples/basic")
      loaded_model = TabularModel.load_from_checkpoint("examples/basic")
      
      posted in 技术交流
      183****0229
    • 【记录】x-transformers库

      地址:https://github.com/lucidrains/x-transformers
      安装:pip install x-transformers
      优点:

      • 这里包含了很多transformer的变种模型,感觉很全。
      • 可以通过这里的代码学习一下pytorch和python知识。
      • 可以学习一下rearrange(来自einops库)的使用,下文附了一个小例子。

      缺点:

      • 无法直接加载预训练权重,因为适配起来需要花费不少精力。
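
      补充:上面优点里提到的 rearrange 来自 einops 库,这里先给一个最小示意(只演示多头拆分/合并的写法,与 x-transformers 本身无关),后面再看 x-transformers 自带的例子:

      import torch
      from einops import rearrange
      
      x = torch.randn(2, 1024, 512)                     # (batch, seq_len, dim)
      q = rearrange(x, 'b n (h d) -> b h n d', h=8)     # 拆成多头,相当于 view + permute
      y = rearrange(q, 'b h n d -> b n (h d)')          # 再合并回原形状
      assert y.shape == x.shape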

      例子:

      Full encoder / decoder

      import torch
      from x_transformers import XTransformer
      
      model = XTransformer(
          dim = 512,
          enc_num_tokens = 256,
          enc_depth = 6,
          enc_heads = 8,
          enc_max_seq_len = 1024,
          dec_num_tokens = 256,
          dec_depth = 6,
          dec_heads = 8,
          dec_max_seq_len = 1024,
          tie_token_emb = True      # tie embeddings of encoder and decoder
      )
      
      src = torch.randint(0, 256, (1, 1024))
      src_mask = torch.ones_like(src).bool()
      tgt = torch.randint(0, 256, (1, 1024))
      tgt_mask = torch.ones_like(tgt).bool()
      
      loss = model(src, tgt, src_mask = src_mask, tgt_mask = tgt_mask) # (1, 1024, 512)
      loss.backward()
      

      GPT

      import torch
      from x_transformers import TransformerWrapper, Decoder
      
      model = TransformerWrapper(
          num_tokens = 20000,
          max_seq_len = 1024,
          attn_layers = Decoder(
              dim = 512,
              depth = 12,
              heads = 8
          )
      ).cuda()
      
      x = torch.randint(0, 256, (1, 1024)).cuda()
      
      model(x) # (1, 1024, 20000)
      

      END

      更多的内容可以看下这个作者的其他仓库,感觉写的都非常好!!!

      posted in 技术交流
      183****0229
    • 【记录】rotary-embedding-torch

      旋转位置编码
      地址:https://github.com/lucidrains/rotary-embedding-torch
      安装:pip install rotary-embedding-torch

      代码(可以发现这里使用到了 einops 的 rearrange)

      from inspect import isfunction
      from math import pi, log
      
      import torch
      from torch import nn, einsum
      
      from einops import rearrange, repeat
      
      # helper functions
      
      def exists(val):
          return val is not None
      
      def broadcat(tensors, dim = -1):
          num_tensors = len(tensors)
          shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
          assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
          shape_len = list(shape_lens)[0]
      
          dim = (dim + shape_len) if dim < 0 else dim
          dims = list(zip(*map(lambda t: list(t.shape), tensors)))
      
          expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
          assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatentation'
          max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
          expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
          expanded_dims.insert(dim, (dim, dims[dim]))
          expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
          tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
          return torch.cat(tensors, dim = dim)
      
      # rotary embedding helper functions
      
      def rotate_half(x):
          x = rearrange(x, '... (d r) -> ... d r', r = 2)
          x1, x2 = x.unbind(dim = -1)
          x = torch.stack((-x2, x1), dim = -1)
          return rearrange(x, '... d r -> ... (d r)')
      
      def apply_rotary_emb(freqs, t, start_index = 0):
          rot_dim = freqs.shape[-1]
          end_index = start_index + rot_dim
          assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
          t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
          t = (t * freqs.cos()) + (rotate_half(t) * freqs.sin())
          return torch.cat((t_left, t, t_right), dim = -1)
      
      # learned rotation helpers
      
      def apply_learned_rotations(rotations, t, start_index = 0, freq_ranges = None):
          if exists(freq_ranges):
              rotations = einsum('..., f -> ... f', rotations, freq_ranges)
              rotations = rearrange(rotations, '... r f -> ... (r f)')
      
          rotations = repeat(rotations, '... n -> ... (n r)', r = 2)
          return apply_rotary_emb(rotations, t, start_index = start_index)
      
      # classes
      
      class RotaryEmbedding(nn.Module):
          def __init__(
              self,
              dim,
              custom_freqs = None,
              freqs_for = 'lang',
              theta = 10000,
              max_freq = 10,
              num_freqs = 1,
              learned_freq = False
          ):
              super().__init__()
              if exists(custom_freqs):
                  freqs = custom_freqs
              elif freqs_for == 'lang':
                  freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
              elif freqs_for == 'pixel':
                  freqs = torch.logspace(0., log(max_freq / 2) / log(2), dim // 2, base = 2) * pi
              elif freqs_for == 'constant':
                  freqs = torch.ones(num_freqs).float()
              else:
                  raise ValueError(f'unknown modality {freqs_for}')
      
              self.cache = dict()
      
              if learned_freq:
                  self.freqs = nn.Parameter(freqs)
              else:
                  self.register_buffer('freqs', freqs)
      
          def forward(self, t, cache_key = None):
              if exists(cache_key) and cache_key in self.cache:
                  return self.cache[cache_key]
      
              if isfunction(t):
                  t = t()
      
              freqs = self.freqs
      
              freqs = torch.einsum('..., f -> ... f', t.type(freqs.dtype), freqs)
              freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
      
              if exists(cache_key):
                  self.cache[cache_key] = freqs
      
              return freqs
      

      END

      这里的实现更为优雅,比我之前实现的要美观多了!!!
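
      补充一个用法示意,直接基于上面贴出的 RotaryEmbedding 和 apply_rotary_emb(张量形状只是举例;较新版本的库还提供了 rotate_queries_or_keys 这类便捷方法,具体以其 README 为准):

      import torch
      
      rotary = RotaryEmbedding(dim = 32)            # 只旋转每个 head 的前 32 维
      q = torch.randn(1, 8, 1024, 64)               # (batch, heads, seq_len, head_dim)
      k = torch.randn(1, 8, 1024, 64)
      
      freqs = rotary(torch.arange(1024))            # (seq_len, 32)
      q = apply_rotary_emb(freqs, q)                # 旋转后形状不变
      k = apply_rotary_emb(freqs, k)
      print(q.shape, k.shape)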

      posted in 技术交流
      183****0229
    • 【记录】pytorch_scatter工具

      地址:https://github.com/rusty1s/pytorch_scatter
      pytorch_scatter 是一个小型扩展库,提供了高度优化的稀疏更新操作(scatter 和 segment),这些操作目前在 PyTorch 主包中是缺失的。

      安装:

      pip

      pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+${CUDA}.html

      conda

      conda install pytorch-scatter -c rusty1s

      例子:

      import torch
      from torch_scatter import scatter_max
      
      src = torch.tensor([[2, 0, 1, 4, 3], [0, 2, 1, 3, 4]])
      index = torch.tensor([[4, 5, 4, 2, 3], [0, 0, 2, 2, 1]])
      
      out, argmax = scatter_max(src, index, dim=-1)
      
      print(out)
      tensor([[0, 0, 4, 3, 2, 0],
              [2, 4, 3, 0, 0, 0]])
      
      print(argmax)
      tensor([[5, 5, 3, 4, 0, 1],
              [1, 4, 3, 5, 5, 5]])
      

      END:

      用好 scatter 这类方法,可以很方便地按索引对最终结果做分组聚合(max、mean、sum 等)操作。
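
      比如,一个常见用法是按索引做分组平均池化(scatter_mean 同样来自 torch_scatter,以下仅为示意):

      import torch
      from torch_scatter import scatter_mean
      
      x = torch.randn(6, 16)                      # 6 个节点,每个 16 维特征
      batch = torch.tensor([0, 0, 0, 1, 1, 2])    # 每个节点所属的样本编号
      pooled = scatter_mean(x, batch, dim=0)      # (3, 16):按样本编号求平均
      print(pooled.shape)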

      posted in 技术交流
      183****0229