openai/tiktoken

Public

mirrored from https://github.com/openai/tiktoken · Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
08a5f3b2c987ada4fc5aa1f16c643c203fa8acaa

Branches

Tags

  • No tags available.
0 Branches · 0 Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

tiktoken_ext/openai_public.py

162 lines · mode: code

1from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
2
# Text of the special tokens shared by the encodings defined in this module.
ENDOFTEXT = "<|endoftext|>"
FIM_PREFIX = "<|fim_prefix|>"
FIM_MIDDLE = "<|fim_middle|>"
FIM_SUFFIX = "<|fim_suffix|>"
ENDOFPROMPT = "<|endofprompt|>"

# Split pattern shared by the gpt2, r50k_base, p50k_base and p50k_edit encodings.
#
# The pattern in the original GPT-2 release is:
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# This is equivalent, but executes faster:
r50k_pat_str = (
    r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s"""
)
15
16
def gpt2():
    """Build the parameter dict for the ``gpt2`` encoding.

    The merge ranks are produced by ``data_gym_to_mergeable_bpe_ranks`` from the
    ``vocab.bpe`` and ``encoder.json`` files at the URLs below. Each URL is
    passed together with a hex digest (presumably verified by the loader --
    confirm in ``tiktoken.load``).

    Returns:
        A dict with the keys ``name``, ``explicit_n_vocab``, ``pat_str``,
        ``mergeable_ranks`` and ``special_tokens``.
    """
    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
        vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
        encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
        vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
        encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
    )
    return {
        "name": "gpt2",
        "explicit_n_vocab": 50257,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }
31
32
def r50k_base():
    """Build the parameter dict for the ``r50k_base`` encoding.

    The merge ranks come from ``load_tiktoken_bpe`` on the ``r50k_base.tiktoken``
    URL, passed together with an ``expected_hash`` hex digest.

    Returns:
        A dict with the keys ``name``, ``explicit_n_vocab``, ``pat_str``,
        ``mergeable_ranks`` and ``special_tokens``.
    """
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
        expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
    )
    return {
        "name": "r50k_base",
        "explicit_n_vocab": 50257,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }
45
46
def p50k_base():
    """Build the parameter dict for the ``p50k_base`` encoding.

    The merge ranks come from ``load_tiktoken_bpe`` on the ``p50k_base.tiktoken``
    URL, passed together with an ``expected_hash`` hex digest.

    Returns:
        A dict with the keys ``name``, ``explicit_n_vocab``, ``pat_str``,
        ``mergeable_ranks`` and ``special_tokens``.
    """
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
    )
    return {
        "name": "p50k_base",
        "explicit_n_vocab": 50281,
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": {ENDOFTEXT: 50256},
    }
59
60
def p50k_edit():
    """Build the parameter dict for the ``p50k_edit`` encoding.

    Uses the same ``p50k_base.tiktoken`` rank file and hash as ``p50k_base``,
    and adds the three fill-in-the-middle special tokens at ids 50281-50283.
    Unlike ``p50k_base``, the returned dict carries no ``explicit_n_vocab``.

    Returns:
        A dict with the keys ``name``, ``pat_str``, ``mergeable_ranks`` and
        ``special_tokens``.
    """
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
    )
    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
    return {
        "name": "p50k_edit",
        "pat_str": r50k_pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
73
74
def cl100k_base():
    """Build the parameter dict for the ``cl100k_base`` encoding.

    The merge ranks come from ``load_tiktoken_bpe`` on the
    ``cl100k_base.tiktoken`` URL, passed together with an ``expected_hash`` hex
    digest. This encoding has its own split pattern (written inline below)
    rather than the shared ``r50k_pat_str``.

    Returns:
        A dict with the keys ``name``, ``pat_str``, ``mergeable_ranks`` and
        ``special_tokens``.
    """
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
        expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
    )
    special_tokens = {
        ENDOFTEXT: 100257,
        FIM_PREFIX: 100258,
        FIM_MIDDLE: 100259,
        FIM_SUFFIX: 100260,
        ENDOFPROMPT: 100276,
    }
    return {
        "name": "cl100k_base",
        "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""",
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
93
94
def o200k_base():
    """Build the parameter dict for the ``o200k_base`` encoding.

    The merge ranks come from ``load_tiktoken_bpe`` on the
    ``o200k_base.tiktoken`` URL, passed together with an ``expected_hash`` hex
    digest. The split pattern is assembled by joining seven alternatives with
    ``|``, in the order listed below.

    Returns:
        A dict with the keys ``name``, ``pat_str``, ``mergeable_ranks`` and
        ``special_tokens``.
    """
    mergeable_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
    )
    special_tokens = {ENDOFTEXT: 199999, ENDOFPROMPT: 200018}
    # This regex could be made more efficient. If I was the one working on this encoding, I would
    # have done a few other things differently too, e.g. I think you can allocate tokens more
    # efficiently across languages.
    pat_str = "|".join(
        [
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )
    return {
        "name": "o200k_base",
        "pat_str": pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
121
122
def o200k_harmony():
    """Build the parameter dict for the ``o200k_harmony`` encoding.

    Reuses the split pattern and merge ranks returned by ``o200k_base()`` and
    extends its special tokens: the named and reserved tokens for ids
    199998-200012 are listed explicitly, and ids 200013-201087 are filled with
    generated ``<|reserved_N|>`` names.

    Returns:
        A dict with the keys ``name``, ``pat_str``, ``mergeable_ranks`` and
        ``special_tokens``.
    """
    base_enc = o200k_base()
    special_tokens = {
        # Start from the base tokens; "<|endoftext|>" below restates the
        # inherited id 199999.
        **base_enc["special_tokens"],
        "<|startoftext|>": 199998,
        "<|endoftext|>": 199999,
        "<|reserved_200000|>": 200000,
        "<|reserved_200001|>": 200001,
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reserved_200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        "<|reserved_200009|>": 200009,
        "<|reserved_200010|>": 200010,
        "<|reserved_200011|>": 200011,
        "<|call|>": 200012,
        # NOTE(review): this range includes 200018, which o200k_base already
        # assigns to "<|endofprompt|>", so two token strings share that id.
        # Left as-is to keep the token table unchanged -- confirm intent.
    } | {f"<|reserved_{i}|>": i for i in range(200013, 201088)}
    return {
        "name": "o200k_harmony",
        "pat_str": base_enc["pat_str"],
        "mergeable_ranks": base_enc["mergeable_ranks"],
        "special_tokens": special_tokens,
    }
152
153
# Maps each encoding name to the zero-argument function in this module that
# builds that encoding's parameter dict.
ENCODING_CONSTRUCTORS = {
    "gpt2": gpt2,
    "r50k_base": r50k_base,
    "p50k_base": p50k_base,
    "p50k_edit": p50k_edit,
    "cl100k_base": cl100k_base,
    "o200k_base": o200k_base,
    "o200k_harmony": o200k_harmony,
}
163