openai/tiktoken

Public

0.8.0

Find a branch or tag

HTTPS

scripts/benchmark.py

39lines · modecode

unknown

1	`import base64`
2	`import functools`
3	`import gzip`
4	`import json`
5	`import os`
6	`import random`
7	`import time`
8	`from typing import Any, cast`
9
10	`import blobfile`
11
12	`import tiktoken`
13
14
def benchmark_batch(documents: list[str]) -> None:
    """Print batch-encoding throughput for tiktoken and Hugging Face.

    The thread count is read from the ``RAYON_NUM_THREADS`` environment
    variable and the size of ``documents`` is measured in encoded bytes.
    Each tokenizer is called once on a short string before its timed run,
    then the whole batch is encoded and the rate is printed in bytes per
    second.

    Args:
        documents: The texts to encode as a single batch.

    Raises:
        KeyError: If ``RAYON_NUM_THREADS`` is not set.
        ValueError: If ``RAYON_NUM_THREADS`` is not an integer.
    """
    thread_count = int(os.environ["RAYON_NUM_THREADS"])
    total_bytes = sum(len(str.encode(doc)) for doc in documents)
    print(f"num_threads: {thread_count}, num_bytes: {total_bytes}")

    # tiktoken: GPT-2 encoding, one untimed call first.
    tt_encoding = tiktoken.get_encoding("gpt2")
    tt_encoding.encode("warmup")

    tt_begin = time.perf_counter_ns()
    tt_encoding.encode_ordinary_batch(documents, num_threads=thread_count)
    tt_finish = time.perf_counter_ns()
    # Timestamps are in nanoseconds; scale by 1e9 to report bytes per second.
    tt_rate = total_bytes / (tt_finish - tt_begin) * 1e9
    print(f"tiktoken \t{tt_rate} bytes / s")

    # Imported here, after the tiktoken measurement has been reported.
    import transformers

    # cast(Any, ...) sidesteps static type checking of the attribute lookup.
    tokenizer_cls = cast(Any, transformers).GPT2TokenizerFast
    hf_tokenizer = tokenizer_cls.from_pretrained("gpt2")
    # silence! (presumably the over-length-input warning; confirm)
    hf_tokenizer.model_max_length = 1e30
    hf_tokenizer.encode("warmup")

    hf_begin = time.perf_counter_ns()
    hf_tokenizer(documents)
    hf_finish = time.perf_counter_ns()
    hf_rate = total_bytes / (hf_finish - hf_begin) * 1e9
    print(f"huggingface \t{hf_rate} bytes / s")
38
39
40