[Notes] Getting the offset_mapping of a BERT-style tokenizer in Python
The code here is mainly based on: https://github.com/bojone/bert4keras/blob/master/bert4keras/tokenizers.py#L372
The reference code:
def rematch(self, text, tokens):
    """Give the mapping between the original text and the tokens produced by tokenization."""
    if is_py2:
        text = unicode(text)

    if self._do_lower_case:
        text = text.lower()

    normalized_text, char_mapping = '', []
    for i, ch in enumerate(text):
        if self._do_lower_case:
            ch = unicodedata.normalize('NFD', ch)
            ch = ''.join([c for c in ch if unicodedata.category(c) != 'Mn'])
        ch = ''.join([
            c for c in ch
            if not (ord(c) == 0 or ord(c) == 0xfffd or self._is_control(c))
        ])
        normalized_text += ch
        char_mapping.extend([i] * len(ch))

    text, token_mapping, offset = normalized_text, [], 0
    for token in tokens:
        if self._is_special(token):
            token_mapping.append([])
        else:
            token = self.stem(token)
            start = text[offset:].index(token) + offset
            end = start + len(token)
            token_mapping.append(char_mapping[start:end])
            offset = end

    return token_mapping
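For context, here is a minimal, hedged sketch of how rematch is typically called in bert4keras; the vocab path, input text, and the example output in the comments are placeholders. rematch returns, for each token, the list of character indices it covers in the original text (an empty list for special tokens such as [CLS] and [SEP]):

from bert4keras.tokenizers import Tokenizer

# 'vocab.txt' is a placeholder path to a BERT vocabulary file.
tokenizer = Tokenizer('vocab.txt', do_lower_case=True)

text = 'Hôtel du Nord'
tokens = tokenizer.tokenize(text)          # includes [CLS] and [SEP]
mapping = tokenizer.rematch(text, tokens)  # e.g. [[], [0, 1, 2, 3, 4], ...]
for token, char_ids in zip(tokens, mapping):
    print(token, char_ids)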
The modified code (simplifies the API and fixes a bug in the original):
def get_offset_mapping(self, text):
    """
    Returns the map of tokens and the start and end index of their start and end character.
    Modified from https://github.com/bojone/bert4keras/blob/master/bert4keras/tokenizers.py#L372
    Args:
        text (str): Input text.
    Returns:
        list: The offset map of input text.
    """
    split_tokens = []
    for token in self.basic_tokenizer.tokenize(text):
        for sub_token in self.wordpiece_tokenizer.tokenize(token):
            split_tokens.append(sub_token if sub_token != self.unk_token else token)

    normalized_text, char_mapping = '', []
    for i, ch in enumerate(text):
        # The do_lower_case handling has to be done here, per character; if it is
        # applied to the whole text up front as in the original, accented characters
        # make the annotated offsets shift.
        if self.basic_tokenizer.do_lower_case:
            ch = ch.lower()
            ch = unicodedata.normalize('NFD', ch)
            ch = ''.join([c for c in ch if unicodedata.category(c) != 'Mn'])
        ch = ''.join([
            c for c in ch
            if not (ord(c) == 0 or ord(c) == 0xfffd or _is_control(c))
        ])
        normalized_text += ch
        char_mapping.extend([i] * len(ch))

    text, token_mapping, offset = normalized_text, [], 0
    for token in split_tokens:
        if token[:2] == '##':
            token = token[2:]
        start = text[offset:].index(token) + offset
        end = start + len(token)
        token_mapping.append(
            (char_mapping[start], char_mapping[end - 1] + 1))
        offset = end

    return token_mapping
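As a quick sanity check, here is a hedged sketch of bolting get_offset_mapping onto a slow BertTokenizer from transformers (which exposes basic_tokenizer, wordpiece_tokenizer, and unk_token). The _is_control helper is reimplemented locally following BERT's convention rather than imported, the model name is only an example, and the function above is assumed to be defined in the same module:

import types
import unicodedata

from transformers import BertTokenizer  # the slow, pure-Python tokenizer


def _is_control(ch):
    # BERT's convention: tab, newline and carriage return count as whitespace,
    # everything in the Cc/Cf Unicode categories counts as a control character.
    if ch in ('\t', '\n', '\r'):
        return False
    return unicodedata.category(ch) in ('Cc', 'Cf')


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Bind the get_offset_mapping function defined above to this tokenizer instance.
tokenizer.get_offset_mapping = types.MethodType(get_offset_mapping, tokenizer)

text = 'Hôtel du Nord'
tokens = tokenizer.tokenize(text)
offsets = tokenizer.get_offset_mapping(text)
for token, (start, end) in zip(tokens, offsets):
    # Each span indexes into the ORIGINAL text, accents and casing included.
    print(token, (start, end), repr(text[start:end]))

With an uncased model, 'Hôtel' tokenizes to 'hotel', but the reported span still points at 'Hôtel' in the original text.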
Note: In Hugging Face transformers, only the fast (Rust-backed) tokenizers can produce an offset_mapping (by passing return_offsets_mapping=True when encoding); the pure-Python tokenizers cannot.
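For comparison, a minimal sketch of the fast-tokenizer route (the model name is just an example):

from transformers import AutoTokenizer

# use_fast=True loads the Rust-backed tokenizer, which supports offsets natively.
fast_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)
encoding = fast_tokenizer('Hôtel du Nord', return_offsets_mapping=True)
print(encoding['offset_mapping'])
# Special tokens such as [CLS] and [SEP] are reported as (0, 0) spans.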