Mapping chars and words to tokens

Reason is the light and the light of life.

Jerry Su Feb 22, 2022 3 mins

How to Convert Between Characters, Words, and Tokens

1. Mapping characters to tokens.

from transformers import LongformerTokenizerFast
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-large-4096')
text = "Phones\n\nModern humans today are always on their phone. "
text_encoded = tokenizer(text, return_offsets_mapping=True, max_length=512, truncation=True)
input_ids = text_encoded['input_ids']
tokens = tokenizer.convert_ids_to_tokens(input_ids)
offset_mapping = text_encoded['offset_mapping']
print(f"input_ids:\t {input_ids}, len: {len(input_ids)}")
print(f"tokens:\t\t {tokens}, len: {len(tokens)}")
print(f"offset_mapping:  {offset_mapping}, len: {len(offset_mapping)}")
text
input_ids:   [0, 48083, 50118, 50118, 39631, 5868, 452, 32, 460, 15, 49, 1028, 4, 1437, 2], len: 15
tokens:      ['<s>', 'Phones', 'Ċ', 'Ċ', 'Modern', 'Ġhumans', 'Ġtoday', 'Ġare', 'Ġalways', 'Ġon', 'Ġtheir', 'Ġphone', '.', 'Ġ', '</s>'], len: 15
offset_mapping:  [(0, 0), (0, 6), (6, 7), (7, 8), (8, 14), (15, 21), (22, 27), (28, 31), (32, 38), (39, 41), (42, 47), (48, 53), (53, 54), (55, 55), (0, 0)], len: 15





'Phones\n\nModern humans today are always on their phone. '

offset_mapping: each entry corresponds to a token and gives that token's start and end positions in the original text. The span [start, end) is half-open (start inclusive, end exclusive), e.g. 'Phones' -> start: 0, end: 6.
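As a quick check of the half-open convention, slicing the original text with an offset pair recovers exactly that token's text:

text[0:6]   # 'Phones'
text[8:14]  # 'Modern'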

new_span = []
for start, end in offset_mapping:
    if start == end:
        # zero-width spans, e.g. <s>, </s> and the trailing empty token
        new_span.append([])
    elif start + 1 == end:
        # single-character span: keep only the start index
        new_span.append([start])
    else:
        # multi-character span: keep [start, end)
        new_span.append([start, end])
new_span
[[],
 [0, 6],
 [6],
 [7],
 [8, 14],
 [15, 21],
 [22, 27],
 [28, 31],
 [32, 38],
 [39, 41],
 [42, 47],
 [48, 53],
 [53],
 [],
 []]


tokens_with_offset_mapping = [text[ele[0]:ele[1]] for ele in offset_mapping]
print(f"tokens_with_offset_mapping: {tokens_with_offset_mapping}, len: {len(tokens_with_offset_mapping)}")
tokens_with_offset_mapping: ['', 'Phones', '\n', '\n', 'Modern', 'humans', 'today', 'are', 'always', 'on', 'their', 'phone', '.', '', ''], len: 15
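Fast tokenizers also expose char_to_token on the returned encoding, which maps a character position in the original text straight to a token index; a minimal sketch using the text_encoded from above:

print(text_encoded.char_to_token(0))   # 1  -> 'Phones'
print(text_encoded.char_to_token(8))   # 4  -> 'Modern'
print(text_encoded.char_to_token(16))  # 5  -> 'Ġhumans'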
text = "Phones\n\nModern humans today are always on their phone. " 
# Modern为实体,则(8, 14)

j[0] is the start offset and j[-1] the end offset. The -1 in end_mapping below is needed because the annotated data uses a closed (inclusive) end index, while offset_mapping returns [start, end) with an exclusive end; subtracting 1 aligns the two conventions.

start_mapping = {j[0]: i for i, j in enumerate(offset_mapping) if j != (0, 0)}
end_mapping = {j[-1] - 1: i for i, j in enumerate(offset_mapping) if j != (0, 0)}
print(start_mapping)
print(end_mapping)
{0: 1, 6: 2, 7: 3, 8: 4, 15: 5, 22: 6, 28: 7, 32: 8, 39: 9, 42: 10, 48: 11, 53: 12, 55: 13}
{5: 1, 6: 2, 7: 3, 13: 4, 20: 5, 26: 6, 30: 7, 37: 8, 40: 9, 46: 10, 52: 11, 53: 12, 54: 13}
# test "Modern humans"
char_start, char_end = 8, 20
entity_label = text[char_start:char_end]
print(f"entity_label: {entity_label}")

token_start, token_end = start_mapping[char_start], end_mapping[char_end]
entity_token = tokens[token_start:token_end+1]
print(f"entity_token; {entity_token}")
entity_label: Modern human
entity_token; ['Modern', 'Ġhumans']
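As a minimal sketch, the two lookup tables can be wrapped into a single helper; char_span_to_token_span is a hypothetical name, and it assumes the same closed-interval character annotations as above:

def char_span_to_token_span(offset_mapping, char_start, char_end):
    # char_end is inclusive; offset_mapping ends are exclusive, hence the -1
    start_mapping = {j[0]: i for i, j in enumerate(offset_mapping) if j != (0, 0)}
    end_mapping = {j[-1] - 1: i for i, j in enumerate(offset_mapping) if j != (0, 0)}
    return start_mapping[char_start], end_mapping[char_end]

char_span_to_token_span(offset_mapping, 8, 20)
(4, 5)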



2. Mapping words to tokens.

text = "Phones\n\nModern. humans. today are always on their phone. " 
print(f"text split: {text.split()}")
encoding = tokenizer(text.split(), is_split_into_words=True, truncation=True, max_length=512)
input_ids = encoding['input_ids']
print(f"input_ids: {input_ids}, len: {len(input_ids)}")
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(f"token: {tokens}, len: {len(tokens)}")
word_ids = encoding.word_ids() 
print(f"word_ids: {word_ids}, len: {len(word_ids)}")
text split: ['Phones', 'Modern.', 'humans.', 'today', 'are', 'always', 'on', 'their', 'phone.']
input_ids: [0, 48083, 39631, 4, 44734, 4, 34375, 1322, 30035, 261, 25017, 17283, 4, 2], len: 14
token: ['<s>', 'Phones', 'Modern', '.', 'humans', '.', 'today', 'are', 'always', 'on', 'their', 'phone', '.', '</s>'], len: 14
word_ids: [None, 0, 1, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, None], len: 14
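word_ids has one entry per token: None for special tokens, otherwise the index of the word the token came from. As a small sketch, the token indices belonging to each word can be grouped directly from it:

from collections import defaultdict
word_to_token_idx = defaultdict(list)
for token_idx, word_id in enumerate(word_ids):
    if word_id is not None:
        word_to_token_idx[word_id].append(token_idx)
dict(word_to_token_idx)
{0: [1], 1: [2, 3], 2: [4, 5], 3: [6], 4: [7], 5: [8], 6: [9], 7: [10], 8: [11, 12]}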
print(text.split())
['Phones', 'Modern.', 'humans.', 'today', 'are', 'always', 'on', 'their', 'phone.']
word_start, word_end = 1, 3  # half-open interval [word_start, word_end)
text.split()[word_start:word_end]
['Modern.', 'humans.']
token_start = word_ids.index(word_start)
token_start
2
def mapping_word_to_token(word_ids, word_start, word_end):
    # word_start and word_end are word indices, both inclusive here;
    # returns the first token of word_start and the last token of word_end.
    token_start, token_end = -1, -1
    for idx, word_id in enumerate(word_ids):
        if word_id == word_start:
            token_start = idx
            break
    for idx, word_id in enumerate(word_ids):
        if word_id == word_end:
            token_end = idx
    return token_start, token_end

mapping_word_to_token(word_ids, 1, 2)    
(2, 5)
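Fast tokenizers also provide word_to_tokens on the encoding, which returns a token span with an exclusive end, so the loop above is mostly for illustration; a quick sketch with the encoding from above:

print(encoding.word_to_tokens(1))  # TokenSpan(start=2, end=4)  -> ['Modern', '.']
print(encoding.word_to_tokens(2))  # TokenSpan(start=4, end=6)  -> ['humans', '.']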
def mapping_token_to_word(word_ids, token_start, token_end):
    # look up which word each boundary token belongs to
    return word_ids[token_start], word_ids[token_end]

mapping_token_to_word(word_ids, 1, 5)
(0, 2)
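The encoding also has a matching token_to_word method for the reverse direction (sketch):

print(encoding.token_to_word(2))  # 1  -> token 'Modern' belongs to word 'Modern.'
print(encoding.token_to_word(5))  # 2  -> token '.' belongs to word 'humans.'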



