openai/tiktoken

Public

mirrored from https://github.com/openai/tiktoken (Available)

Watch 0 · Fork 0 · Star 0

Code Commits Issues Pull requests Actions Insights Security

main

Find a branch or tag

Branches

main

Clone

HTTPS

Download ZIP

tiktoken/tests

tests/test_encoding.py

264lines · modecode

Raw Download

Latest commit unavailable.

unknown

1	`# Note that there are more actual tests, they're just not currently public :-)`
2
3	`from typing import Callable`
4
5	`import hypothesis`
6	`import hypothesis.strategies as st`
7	`import pytest`
8
9	`import tiktoken`
10
11	`from .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES`
12
13
def test_simple():
    """Smoke-test encode/decode on gpt2 and cl100k_base, then single-token round-trips.

    The final loop visits every encoding reported by
    ``tiktoken.list_encoding_names()`` and checks that the first tokens
    (at most 10_000) survive decode-to-bytes followed by encode.
    """
    enc = tiktoken.get_encoding("gpt2")
    assert enc.encode("hello world") == [31373, 995]
    assert enc.decode([31373, 995]) == "hello world"
    # The literal must be exactly "<|endoftext|>": a backslash before a pipe
    # is kept by Python and the text would no longer match the special token.
    assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]

    enc = tiktoken.get_encoding("cl100k_base")
    assert enc.encode("hello world") == [15339, 1917]
    assert enc.decode([15339, 1917]) == "hello world"
    assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]

    for enc_name in tiktoken.list_encoding_names():
        enc = tiktoken.get_encoding(enc_name)
        # Cap the sweep so the test stays quick on large vocabularies.
        for token in range(min(10_000, enc.max_token_value - 1)):
            assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
29
30
def test_simple_repeated():
    """Check the gpt2 tokenisation of runs of "0" from length 1 through 17."""
    enc = tiktoken.get_encoding("gpt2")

    # Entry i holds the tokens expected for "0" repeated (i + 1) times.
    expected_by_length = [
        [15],
        [405],
        [830],
        [2388],
        [20483],
        [10535],
        [24598],
        [8269],
        [10535, 830],
        [8269, 405],
        [8269, 830],
        [8269, 2388],
        [8269, 20483],
        [8269, 10535],
        [8269, 24598],
        [25645],
        [8269, 10535, 830],
    ]
    for zero_count, expected_tokens in enumerate(expected_by_length, start=1):
        assert enc.encode("0" * zero_count) == expected_tokens
50
51
def test_large_repeated():
    """Encode a one-million-character run of "x" with o200k_base."""
    encoding = tiktoken.get_encoding("o200k_base")
    huge_input = "x" * 1_000_000

    # Large inputs should be handled without raising; a truthy result means
    # the call returned at least one token.
    result = encoding.encode(huge_input)
    assert result
58
59
def test_simple_regex():
    """Check cl100k_base splitting around apostrophes and newline/space runs."""
    enc = tiktoken.get_encoding("cl100k_base")
    assert enc.encode("rer") == [38149]
    assert enc.encode("'rer") == [2351, 81]
    assert enc.encode("today\n ") == [31213, 198, 220]
    assert enc.encode("today\n \n") == [31213, 27907]
    # Two spaces between the newlines: this input must differ from the
    # single-space case above, otherwise the two assertions contradict
    # each other.
    assert enc.encode("today\n  \n") == [31213, 14211]
67
68
def test_basic_encode():
    """Check "hello world" on r50k_base, p50k_base and cl100k_base, plus one \\x85 case."""
    # r50k_base and p50k_base are expected to give the same tokens here.
    for name in ("r50k_base", "p50k_base"):
        legacy = tiktoken.get_encoding(name)
        assert legacy.encode("hello world") == [31373, 995]

    cl100k = tiktoken.get_encoding("cl100k_base")
    assert cl100k.encode("hello world") == [15339, 1917]
    assert cl100k.encode(" \x850") == [220, 126, 227, 15]
79
80
def test_encode_empty():
    """Encoding the empty string with r50k_base yields an empty token list."""
    no_tokens = tiktoken.get_encoding("r50k_base").encode("")
    assert no_tokens == []
84
85
def test_encode_bytes():
    """Check ``_encode_bytes`` on a fixed byte string and on runs of 0x80."""
    codec = tiktoken.get_encoding("cl100k_base")
    assert codec._encode_bytes(b" \xec\x8b\xa4\xed") == [62085]

    # Runs of zero to nine 0x80 bytes must survive encode -> decode_bytes.
    for length in range(10):
        raw = b"\x80" * length
        restored = codec.decode_bytes(codec._encode_bytes(raw))
        assert restored == raw
92
93
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
@hypothesis.given(bytestring=st.binary())
@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_encode_bytes(make_enc: Callable[[], tiktoken.Encoding], bytestring: bytes):
    """Property test: generated byte strings round-trip through ``_encode_bytes``/``decode_bytes``."""
    codec = make_enc()
    token_ids = codec._encode_bytes(bytestring)
    assert codec.decode_bytes(token_ids) == bytestring
100
101
def test_encode_surrogate_pairs():
    """Check cl100k_base encoding of an emoji, its surrogate pair, and a lone surrogate."""
    codec = tiktoken.get_encoding("cl100k_base")
    thumbs_up_tokens = [9468, 239, 235]

    assert codec.encode("👍") == thumbs_up_tokens
    # surrogate pair gets converted to codepoint
    assert codec.encode("\ud83d\udc4d") == thumbs_up_tokens

    # lone surrogate just gets replaced
    replacement_tokens = codec.encode("�")
    assert codec.encode("\ud83d") == replacement_tokens
111
112
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
def test_catastrophically_repetitive(make_enc: Callable[[], tiktoken.Encoding]):
    """Round-trip 10_000-fold repetitions of several short strings.

    Each repetition is checked three times: as-is, with a leading space,
    and with both a leading space and a trailing newline.
    """
    codec = make_enc()
    for unit in ["^", "0", "a", "'s", " ", "\n"]:
        repeated = unit * 10_000
        assert repeated == codec.decode(codec.encode(repeated))

        space_prefixed = " " + repeated
        assert space_prefixed == codec.decode(codec.encode(space_prefixed))

        newline_suffixed = space_prefixed + "\n"
        assert newline_suffixed == codec.decode(codec.encode(newline_suffixed))
125
126
127	`# ====================`
128	`# Roundtrip`
129	`# ====================`
130
131
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
def test_basic_roundtrip(make_enc):
    """Round-trip a fixed set of strings through ``encode`` and ``encode_ordinary``.

    The samples cover no, single and double trailing spaces, each with and
    without a leading space, plus a multi-word and a non-ASCII string.
    """
    enc = make_enc()
    for value in (
        "hello",
        "hello ",
        "hello  ",  # two trailing spaces, distinct from the entry above
        " hello",
        " hello ",
        " hello  ",  # two trailing spaces, distinct from the entry above
        "hello world",
        "请考试我的软件！12345",
    ):
        assert value == enc.decode(enc.encode(value))
        assert value == enc.decode(enc.encode_ordinary(value))
147
148
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
@hypothesis.given(text=st.text())
@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_roundtrip(make_enc: Callable[[], tiktoken.Encoding], text):
    """Property test: generated text is unchanged by ``encode`` followed by ``decode``."""
    codec = make_enc()
    token_ids = codec.encode(text)
    assert text == codec.decode(token_ids)
156
157
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
def test_single_token_roundtrip(make_enc: Callable[[], tiktoken.Encoding]):
    """Every token id below ``n_vocab`` that decodes must encode back to itself."""
    codec = make_enc()

    for token_id in range(codec.n_vocab):
        try:
            raw = codec.decode_single_token_bytes(token_id)
        except KeyError:
            # Ids for which decoding raises KeyError are skipped.
            continue
        else:
            assert codec.encode_single_token(raw) == token_id
168
169
170	`# ====================`
171	`# Special tokens`
172	`# ====================`
173
174
def test_special_token():
    """Check how ``allowed_special`` and ``disallowed_special`` affect cl100k_base.

    Covers three things: text containing special tokens raises ``ValueError``
    unless the check is disabled, special tokens are encoded as ordinary text
    when not allowed, and only the tokens named in ``allowed_special`` are
    emitted as special token ids.
    """
    enc = tiktoken.get_encoding("cl100k_base")

    # Special-token literals are written with bare pipes; a backslash before
    # a pipe would stay in the string and no longer name the special token.
    eot = enc.encode_single_token("<|endoftext|>")
    assert eot == enc.eot_token
    fip = enc.encode_single_token("<|fim_prefix|>")
    fim = enc.encode_single_token("<|fim_middle|>")

    text = "<|endoftext|> hello <|fim_prefix|>"
    # With the disallowed check switched off, the text is encoded as ordinary text.
    assert eot not in enc.encode(text, disallowed_special=())
    # Default, "all", and each explicitly disallowed token must be rejected.
    with pytest.raises(ValueError):
        enc.encode(text)
    with pytest.raises(ValueError):
        enc.encode(text, disallowed_special="all")
    with pytest.raises(ValueError):
        enc.encode(text, disallowed_special={"<|endoftext|>"})
    with pytest.raises(ValueError):
        enc.encode(text, disallowed_special={"<|fim_prefix|>"})

    text = "<|endoftext|> hello <|fim_prefix|> there <|fim_middle|>"
    tokens = enc.encode(text, disallowed_special=())
    assert eot not in tokens
    assert fip not in tokens
    assert fim not in tokens

    tokens = enc.encode(text, allowed_special="all", disallowed_special=())
    assert eot in tokens
    assert fip in tokens
    assert fim in tokens

    # Allowing everything still yields all three ids when "all" is also disallowed.
    tokens = enc.encode(text, allowed_special="all", disallowed_special="all")
    assert eot in tokens
    assert fip in tokens
    assert fim in tokens

    # Allowing a single token yields only that token's special id.
    tokens = enc.encode(text, allowed_special={"<|fim_prefix|>"}, disallowed_special=())
    assert eot not in tokens
    assert fip in tokens
    assert fim not in tokens

    tokens = enc.encode(text, allowed_special={"<|endoftext|>"}, disallowed_special=())
    assert eot in tokens
    assert fip not in tokens
    assert fim not in tokens

    tokens = enc.encode(text, allowed_special={"<|fim_middle|>"}, disallowed_special=())
    assert eot not in tokens
    assert fip not in tokens
    assert fim in tokens
224
225
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
@hypothesis.given(text=st.text())
@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_special_ordinary(make_enc, text: str):
    """Property test: ``encode_ordinary`` matches ``encode`` with ``disallowed_special=()``."""
    codec = make_enc()
    ordinary_ids = codec.encode_ordinary(text)
    unchecked_ids = codec.encode(text, disallowed_special=())
    assert ordinary_ids == unchecked_ids
232
233
234	`# ====================`
235	`# Batch encoding`
236	`# ====================`
237
238
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
def test_batch_encode(make_enc: Callable[[], tiktoken.Encoding]):
    """Batch encoders must agree with per-string encoding for one and two inputs."""
    codec = make_enc()
    first = "hello world"
    second = "goodbye world"

    assert codec.encode_batch([first]) == [codec.encode(first)]
    assert codec.encode_batch([first, second]) == [codec.encode(first), codec.encode(second)]

    assert codec.encode_ordinary_batch([first]) == [codec.encode_ordinary(first)]
    expected_pair = [codec.encode_ordinary(first), codec.encode_ordinary(second)]
    assert codec.encode_ordinary_batch([first, second]) == expected_pair
253
254
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
@hypothesis.given(batch=st.lists(st.text()))
@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_batch_roundtrip(make_enc: Callable[[], tiktoken.Encoding], batch):
    """Property test: ``encode_batch`` matches per-item ``encode`` and ``decode_batch`` inverts it."""
    codec = make_enc()

    batch_ids = codec.encode_batch(batch, allowed_special="all")
    per_item_ids = [codec.encode(item, allowed_special="all") for item in batch]
    assert batch_ids == per_item_ids

    assert codec.decode_batch(batch_ids) == batch
265

openai/tiktoken

Branches

Tags

Clone