"""Byte-Pair Encoding: Subword-based tokenization algorithm used
by state-of-the-art language models.

Wikipedia: https://en.wikipedia.org/wiki/Byte_pair_encoding"""

import itertools
from collections import OrderedDict


def get_byte_pair_counts(ids: list[int]) -> dict[tuple[int, int], int]:
    """Count consecutive byte-pairs of an encoded string.

    >>> ids = [73, 32, 97, 109, 32, 74, 111, 110, 83, 110, 111, 119, 46]
    >>> get_byte_pair_counts(ids)
    {(73, 32): 1, (32, 97): 1, (97, 109): 1, (109, 32): 1, (32, 74): 1, (74, 111): 1, (111, 110): 1, (110, 83): 1, (83, 110): 1, (110, 111): 1, (111, 119): 1, (119, 46): 1}
    >>> ids = [2, 3, 6, 2, 3, 6, 2, 5]
    >>> get_byte_pair_counts(ids)
    {(2, 3): 2, (3, 6): 2, (6, 2): 2, (2, 5): 1}
    """  # noqa: E501
    counts: dict[tuple[int, int], int] = {}
    for pair in itertools.pairwise(ids):
        counts[pair] = counts.get(pair, 0) + 1
    return counts
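
# A design note, not used above: the same counts can be computed with the
# standard library in one line, e.g. collections.Counter(itertools.pairwise(ids));
# the explicit loop is kept so the counting step stays easy to follow.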


def merge(ids: list[int], pair: tuple[int, int], idx: int) -> list[int]:
    """Replace every occurrence of the most frequent byte pair with a new
    token id that does not occur in the data. For UTF-8 input the raw bytes
    are 0-255, so new token ids start at 256.

    >>> ids = [2, 3, 6, 2, 3, 6, 2, 5]
    >>> pair = (2, 3)
    >>> idx = 256
    >>> merge(ids, pair, idx)
    [256, 6, 256, 6, 2, 5]
    """
    new_ids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i] == pair[0] and ids[i + 1] == pair[1]):
            # found the pair: emit the new token and skip both elements
            new_ids.append(idx)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids
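
# Note: occurrences are merged left to right without overlap, so merging the
# pair (1, 1) with idx 256 turns [1, 1, 1] into [256, 1], not [1, 256].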


class Tokenizer:
    """Tokenize a string using the byte-pair encoding algorithm."""

    def __init__(self, num_merges: int = 20, verbose: bool = False):
        self.num_merges = num_merges
        self.merges: dict = {}
        self.verbose = verbose

    def encode(self, text: str) -> list[int]:
        """Convert a string to tokens (bytes).

        >>> t = Tokenizer()
        >>> text = "I am JonSnow."
        >>> t.encode(text)
        [73, 32, 97, 109, 32, 74, 111, 110, 83, 110, 111, 119, 46]

        >>> t = Tokenizer()
        >>> text = ""
        >>> t.encode(text)
        []
        """
        text_b = text.encode("utf-8")  # raw bytes
        tokens = list(map(int, text_b))  # convert to a list of integers

        if self.verbose:
            print(f"Input text: {text}")
            print(f"Tokens: {tokens}")

        ids = list(tokens)  # work on a copy of tokens
        self.merges = OrderedDict()  # store a mapping of merges (int, int) -> int
        max_merges = len(tokens) - 1
        num_merges = min(self.num_merges, max_merges)
        # repeatedly merge the most frequently occurring byte pair
        for i in range(num_merges):
            counts = get_byte_pair_counts(ids)
            pair = max(counts, key=counts.get)

            if counts[pair] == 1:
                # no pair occurs more than once, so further merging is
                # pointless; stop early rather than looping in place
                break

            idx = 256 + i  # create a new token for every merge step
            if self.verbose:
                print(f"Merging {pair} into a new token {idx}")
            ids = merge(ids, pair, idx)
            self.merges[pair] = idx

        return ids
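
    # Note: encode() learns its merge table from the input text itself, so
    # the ids it returns can only be decoded by the same instance that
    # produced them (decode() reads self.merges).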

    def decode(self, ids: list[int]) -> str:
        """Convert a list of tokens to the original string.

        >>> t = Tokenizer()
        >>> ids = [73, 32, 97, 109, 32, 74, 111, 110, 83, 110, 111, 119, 46]
        >>> t.decode(ids)
        'I am JonSnow.'

        >>> t = Tokenizer()
        >>> ids = []
        >>> t.decode(ids)
        ''
        """
        vocab = {idx: bytes([idx]) for idx in range(256)}  # original vocabulary
        # Items must be iterated in insertion order so that each merged token
        # is defined before any later token that builds on it. Plain dicts
        # already guarantee this in Python 3.7+, but an OrderedDict makes the
        # intent explicit.
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]

        if self.verbose:
            print(f"Vocabulary (after merging): {vocab}")

        tokens = b"".join(vocab[idx] for idx in ids)
        # handle invalid byte sequences by substituting the Unicode
        # replacement character instead of raising UnicodeDecodeError
        text = tokens.decode("utf-8", errors="replace")
        return text


if __name__ == "__main__":
    import doctest

    doctest.testmod()
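
    # A minimal usage sketch beyond the doctests: round-trip a repetitive
    # string (so at least one merge actually fires) and exercise the
    # errors="replace" path with an invalid UTF-8 start byte.
    t = Tokenizer()
    sample = "I am JonSnow. I am JonSnow."
    assert t.decode(t.encode(sample)) == sample
    assert Tokenizer().decode([255]) == "\ufffd"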