openai/tiktoken
Public · mirrored from https://github.com/openai/tiktoken · Available
tiktoken_ext/openai_public.py
162 lines · mode: code
| 1 | from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe |
| 2 | |
# Literal text of the special tokens referenced by the encodings in this module.
# The integer id each one maps to is chosen per encoding, in that encoding's
# "special_tokens" table.

# End-of-sequence style markers.
ENDOFTEXT = "<|endoftext|>"
ENDOFPROMPT = "<|endofprompt|>"

# FIM prefix / middle / suffix markers.
FIM_PREFIX = "<|fim_prefix|>"
FIM_MIDDLE = "<|fim_middle|>"
FIM_SUFFIX = "<|fim_suffix|>"
| 8 | |
# Split pattern shared by the gpt2, r50k_base, p50k_base and p50k_edit encodings.
#
# The pattern in the original GPT-2 release is:
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# The form below is equivalent, but executes faster:
r50k_pat_str = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s"""
| 15 | |
| 16 | |
def gpt2():
    """Return the settings dict for the ``gpt2`` encoding.

    The merge ranks come from ``data_gym_to_mergeable_bpe_ranks``, which is
    given the URLs of the published ``vocab.bpe`` and ``encoder.json`` files
    together with an expected hash for each (presumably used to check the
    downloaded content; confirm in ``tiktoken.load``).

    Returns:
        A dict with the keys ``name``, ``explicit_n_vocab``, ``pat_str``,
        ``mergeable_ranks`` and ``special_tokens``.
    """
    return {
        "name": "gpt2",
        "explicit_n_vocab": 50257,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": data_gym_to_mergeable_bpe_ranks(
            vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
            encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
            vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
            encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
        ),
        # The only special token; it takes the last id below explicit_n_vocab.
        "special_tokens": {ENDOFTEXT: 50256},
    }
| 31 | |
| 32 | |
def r50k_base():
    """Return the settings dict for the ``r50k_base`` encoding.

    Uses the same split pattern, vocabulary size and special-token table as
    ``gpt2``, but reads its merge ranks from a ``.tiktoken`` file through
    ``load_tiktoken_bpe``.

    Returns:
        A dict with the keys ``name``, ``explicit_n_vocab``, ``pat_str``,
        ``mergeable_ranks`` and ``special_tokens``.
    """
    ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
        expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
    )
    settings = {
        "name": "r50k_base",
        "explicit_n_vocab": 50257,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }
    return settings
| 45 | |
| 46 | |
def p50k_base():
    """Return the settings dict for the ``p50k_base`` encoding.

    Shares the r50k split pattern and the single ``ENDOFTEXT`` special token,
    but declares a vocabulary size of 50281 and loads its own rank file.

    Returns:
        A dict with the keys ``name``, ``explicit_n_vocab``, ``pat_str``,
        ``mergeable_ranks`` and ``special_tokens``.
    """
    return {
        "name": "p50k_base",
        "explicit_n_vocab": 50281,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": load_tiktoken_bpe(
            "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
            expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
        ),
        "special_tokens": {ENDOFTEXT: 50256},
    }
| 59 | |
| 60 | |
def p50k_edit():
    """Return the settings dict for the ``p50k_edit`` encoding.

    Loads the same rank file as ``p50k_base`` and adds the three FIM special
    tokens on top of ``ENDOFTEXT``. Unlike ``p50k_base``, the returned dict
    carries no ``explicit_n_vocab`` entry.

    Returns:
        A dict with the keys ``name``, ``pat_str``, ``mergeable_ranks`` and
        ``special_tokens``.
    """
    ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
    )
    specials = {
        ENDOFTEXT: 50256,
        FIM_PREFIX: 50281,
        FIM_MIDDLE: 50282,
        FIM_SUFFIX: 50283,
    }
    return {
        "name": "p50k_edit",
        "pat_str": r50k_pat_str,
        "mergeable_ranks": ranks,
        "special_tokens": specials,
    }
| 73 | |
| 74 | |
def cl100k_base():
    """Return the settings dict for the ``cl100k_base`` encoding.

    This encoding has its own split pattern (it does not reuse
    ``r50k_pat_str``) and five special tokens. The special-token ids are not
    contiguous: ``ENDOFPROMPT`` sits at 100276 while the others occupy
    100257-100260.

    Returns:
        A dict with the keys ``name``, ``pat_str``, ``mergeable_ranks`` and
        ``special_tokens``.
    """
    split_pattern = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s"""
    return {
        "name": "cl100k_base",
        "pat_str": split_pattern,
        "mergeable_ranks": load_tiktoken_bpe(
            "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
            expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
        ),
        "special_tokens": {
            ENDOFTEXT: 100257,
            FIM_PREFIX: 100258,
            FIM_MIDDLE: 100259,
            FIM_SUFFIX: 100260,
            ENDOFPROMPT: 100276,
        },
    }
| 93 | |
| 94 | |
def o200k_base():
    """Return the settings dict for the ``o200k_base`` encoding.

    The split pattern is assembled by joining seven alternatives with ``|``;
    their order is kept exactly as listed because it determines the order of
    the alternatives in the final pattern.

    Returns:
        A dict with the keys ``name``, ``pat_str``, ``mergeable_ranks`` and
        ``special_tokens``.
    """
    # This regex could be made more efficient. If I was the one working on this encoding, I would
    # have done a few other things differently too, e.g. I think you can allocate tokens more
    # efficiently across languages.
    alternatives = (
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""\p{N}{1,3}""",
        r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
        r"""\s*[\r\n]+""",
        r"""\s+(?!\S)""",
        r"""\s+""",
    )
    ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
    )
    return {
        "name": "o200k_base",
        "pat_str": "|".join(alternatives),
        "mergeable_ranks": ranks,
        "special_tokens": {ENDOFTEXT: 199999, ENDOFPROMPT: 200018},
    }
| 121 | |
| 122 | |
def o200k_harmony():
    """Return the settings dict for the ``o200k_harmony`` encoding.

    Reuses the split pattern and merge ranks of ``o200k_base`` and extends its
    special-token table with the named tokens below, plus ``<|reserved_N|>``
    placeholders for the ids in ``range(200013, 201088)``.

    ``o200k_base`` already assigns id 200018 to ``ENDOFPROMPT``. A placeholder
    is therefore only generated for ids that no named token holds, so every
    id maps to exactly one special-token string.

    Returns:
        A dict with the keys ``name``, ``pat_str``, ``mergeable_ranks`` and
        ``special_tokens``.
    """
    base_enc = o200k_base()
    named_tokens = {
        **base_enc["special_tokens"],
        "<|startoftext|>": 199998,
        "<|endoftext|>": 199999,
        "<|reserved_200000|>": 200000,
        "<|reserved_200001|>": 200001,
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reserved_200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        "<|reserved_200009|>": 200009,
        "<|reserved_200010|>": 200010,
        "<|reserved_200011|>": 200011,
        "<|call|>": 200012,
    }
    # Skip ids that a named token (e.g. ENDOFPROMPT at 200018) already holds;
    # otherwise two different strings would share one id.
    taken_ids = set(named_tokens.values())
    reserved_tokens = {
        f"<|reserved_{i}|>": i
        for i in range(200013, 201088)
        if i not in taken_ids
    }
    return {
        "name": "o200k_harmony",
        "pat_str": base_enc["pat_str"],
        "mergeable_ranks": base_enc["mergeable_ranks"],
        "special_tokens": {**named_tokens, **reserved_tokens},
    }
| 152 | |
| 153 | |
# Maps each encoding name to the zero-argument function defined in this module
# that builds its settings dict. Every key equals the "name" entry of the dict
# its function returns.
ENCODING_CONSTRUCTORS = dict(
    gpt2=gpt2,
    r50k_base=r50k_base,
    p50k_base=p50k_base,
    p50k_edit=p50k_edit,
    cl100k_base=cl100k_base,
    o200k_base=o200k_base,
    o200k_harmony=o200k_harmony,
)
| 163 | |