openai/tiktoken
Public · mirrored from https://github.com/openai/tiktoken · Available
tests/test_simple_public.py
42 lines · mode: code
| 1 | import subprocess |
| 2 | import sys |
| 3 | |
| 4 | import tiktoken |
| 5 | |
| 6 | |
def test_simple():
    """Check known token ids for two encodings, then round-trip single tokens."""
    # Only this small sample is public; more actual tests exist but are not
    # currently published :-)
    expectations = (
        # (encoding name, ids for "hello world", ids for "hello <|endoftext|>")
        ("gpt2", [31373, 995], [31373, 220, 50256]),
        ("cl100k_base", [15339, 1917], [15339, 220, 100257]),
    )
    for encoding_name, plain_ids, special_ids in expectations:
        codec = tiktoken.get_encoding(encoding_name)
        assert codec.encode("hello world") == plain_ids
        assert codec.decode(plain_ids) == "hello world"
        # allowed_special="all" is passed so the special-token text is accepted
        # and encoded to the expected trailing id.
        assert codec.encode("hello <|endoftext|>", allowed_special="all") == special_ids

    # For every name reported by list_encoding_names(), each of the first
    # 10,000 token ids must survive a bytes round trip unchanged.
    for encoding_name in tiktoken.list_encoding_names():
        codec = tiktoken.get_encoding(encoding_name)
        for token_id in range(10_000):
            token_bytes = codec.decode_single_token_bytes(token_id)
            assert codec.encode_single_token(token_bytes) == token_id
| 23 | |
| 24 | |
def test_encoding_for_model():
    """Check that encoding_for_model returns the expected encoding name per model."""
    model_to_encoding = (
        ("gpt2", "gpt2"),
        ("text-davinci-003", "p50k_base"),
        ("text-davinci-edit-001", "p50k_edit"),
        ("gpt-3.5-turbo-0301", "cl100k_base"),
    )
    for model_name, expected_encoding in model_to_encoding:
        resolved = tiktoken.encoding_for_model(model_name)
        assert resolved.name == expected_encoding
| 34 | |
| 35 | |
def test_optional_blobfile_dependency():
    """Check in a fresh interpreter that importing tiktoken does not load blobfile."""
    # The check runs in a child process so that modules already imported by
    # this test session cannot affect what ends up in sys.modules.
    script = (
        "\n"
        "import tiktoken\n"
        "import sys\n"
        'assert "blobfile" not in sys.modules\n'
    )
    command = [sys.executable, "-c", script]
    # check_call raises if the child exits non-zero, i.e. if its assert fails.
    subprocess.check_call(command)
| 43 | |