    [Notes] The einops Tool


      Repository: https://github.com/arogozhnikov/einops
      Install: pip install einops
      The einops library gives you readable, reliable code through flexible and powerful tensor operations.
      It supports numpy, pytorch, tensorflow, and more.
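
      To get a feel for the library, here is a small sketch of my own (not from the original post) showing the three core operations, rearrange, reduce, and repeat, on a NumPy array:

      import numpy as np
      from einops import rearrange, reduce, repeat

      x = np.random.rand(2, 3, 32, 32)  # e.g. a small batch of 3-channel 32x32 images

      # split / merge axes with a readable pattern instead of reshape + transpose
      y = rearrange(x, 'b c h w -> b h w c')                                        # (2, 32, 32, 3)
      patches = rearrange(x, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=8, p2=8)  # (2, 16, 192)

      # reductions name every axis, including the ones being reduced
      pooled = reduce(x, 'b c (h 2) (w 2) -> b c h w', 'mean')                      # 2x2 average pooling

      # repeating along a new axis is explicit as well
      tiled = repeat(x[0], 'c h w -> n c h w', n=4)                                 # (4, 3, 32, 32)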

      Implementing scaled dot-product attention

      Code without einops:

      import numpy as np
      import torch
      import torch.nn as nn

      class ScaledDotProductAttention(nn.Module):
          ''' Scaled Dot-Product Attention '''
      
          def __init__(self, temperature, attn_dropout=0.1):
              super().__init__()
              self.temperature = temperature
              self.dropout = nn.Dropout(attn_dropout)
              self.softmax = nn.Softmax(dim=2)
      
          def forward(self, q, k, v, mask=None):
      
              attn = torch.bmm(q, k.transpose(1, 2))
              attn = attn / self.temperature
      
              if mask is not None:
                  attn = attn.masked_fill(mask, -np.inf)
      
              attn = self.softmax(attn)
              attn = self.dropout(attn)
              output = torch.bmm(attn, v)
      
              return output, attn
      
      
      
      class MultiHeadAttentionOld(nn.Module):
          ''' Multi-Head Attention module '''
      
          def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
              super().__init__()
      
              self.n_head = n_head
              self.d_k = d_k
              self.d_v = d_v
      
              self.w_qs = nn.Linear(d_model, n_head * d_k)
              self.w_ks = nn.Linear(d_model, n_head * d_k)
              self.w_vs = nn.Linear(d_model, n_head * d_v)
              nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
              nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
              nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
      
              self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
              self.layer_norm = nn.LayerNorm(d_model)
      
              self.fc = nn.Linear(n_head * d_v, d_model)
              nn.init.xavier_normal_(self.fc.weight)
      
              self.dropout = nn.Dropout(dropout)
      
      
          def forward(self, q, k, v, mask=None):
              
              d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
              
              sz_b, len_q, _ = q.size()
              sz_b, len_k, _ = k.size()
              sz_b, len_v, _ = v.size()
              
              residual = q
              
              q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
              k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
              v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
              
              q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
              k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
              v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv
              
              if mask is not None:  # mask is optional, so only broadcast it when given
                  mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
              output, attn = self.attention(q, k, v, mask=mask)
              
              output = output.view(n_head, sz_b, len_q, d_v)
              output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv)
              
              output = self.dropout(self.fc(output))
              output = self.layer_norm(output + residual)
              
              return output, attn
      

      Code with einops (you can see the line count drops a lot, mainly in the axis-permuting and axis-splitting parts!)

      from einops import rearrange

      class MultiHeadAttentionNew(nn.Module):
          def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
              super().__init__()
              self.n_head = n_head
              
              self.w_qs = nn.Linear(d_model, n_head * d_k)
              self.w_ks = nn.Linear(d_model, n_head * d_k)
              self.w_vs = nn.Linear(d_model, n_head * d_v)
              
              nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
              nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
              nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
              
              self.fc = nn.Linear(n_head * d_v, d_model)
              nn.init.xavier_normal_(self.fc.weight)
              self.dropout = nn.Dropout(p=dropout)
              self.layer_norm = nn.LayerNorm(d_model)
      
          def forward(self, q, k, v, mask=None):
              residual = q
              q = rearrange(self.w_qs(q), 'b l (head k) -> head b l k', head=self.n_head)
              k = rearrange(self.w_ks(k), 'b t (head k) -> head b t k', head=self.n_head)
              v = rearrange(self.w_vs(v), 'b t (head v) -> head b t v', head=self.n_head)
              attn = torch.einsum('hblk,hbtk->hblt', [q, k]) / np.sqrt(q.shape[-1])
              if mask is not None:
                  attn = attn.masked_fill(mask[None], -np.inf)
              attn = torch.softmax(attn, dim=3)
              output = torch.einsum('hblt,hbtv->hblv', [attn, v])
              output = rearrange(output, 'head b l v -> b l (head v)')
              output = self.dropout(self.fc(output))
              output = self.layer_norm(output + residual)
              return output, attn
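
      A quick sanity check of my own (not from the original post; the sizes below are arbitrary assumptions) to confirm that both versions accept the same inputs and return a (batch, len_q, d_model) output:

      n_head, d_model, d_k, d_v = 8, 512, 64, 64
      old = MultiHeadAttentionOld(n_head, d_model, d_k, d_v)
      new = MultiHeadAttentionNew(n_head, d_model, d_k, d_v)

      q = torch.randn(2, 10, d_model)   # batch of 2, query length 10
      k = torch.randn(2, 12, d_model)   # key/value length 12
      v = torch.randn(2, 12, d_model)

      out_old, attn_old = old(q, k, v)
      out_new, attn_new = new(q, k, v)

      print(out_old.shape, out_new.shape)   # torch.Size([2, 10, 512]) for both
      print(attn_old.shape)                 # torch.Size([16, 10, 12])  - heads folded into the batch axis
      print(attn_new.shape)                 # torch.Size([8, 2, 10, 12]) - heads kept as a separate axis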
      

      Note:

      On one hand, einops makes axis splitting, axis permutation, and similar operations convenient; on the other hand, and most importantly, it is far more readable, so we can better understand what each dimension actually means.
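
      As a concrete illustration (a sketch of my own, not from the post), here is the permute/contiguous/view chain from the old forward pass next to its einops equivalent:

      import torch
      from einops import rearrange

      sz_b, len_q, n_head, d_k = 2, 10, 8, 64
      q = torch.randn(sz_b, len_q, n_head, d_k)

      # plain PyTorch: permute, force contiguity, then fold the head axis into the batch axis
      q_plain = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k)   # (n*b) x lq x dk

      # einops: the same operation, with every axis named in the pattern
      q_einops = rearrange(q, 'b l head k -> (head b) l k')

      print(torch.equal(q_plain, q_einops))   # True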
