openai/tiktoken

Public

0.7.0

Find a branch or tag

HTTPS

scripts/benchmark.py

39 lines · mode: blame

unknown

`a1a9f168`Shantanu Jain3 years ago	1	`import base64`
	2	`import functools`
	3	`import gzip`
	4	`import json`
	5	`import os`
	6	`import random`
	7	`import time`
	8	`from typing import Any, cast`
	9
	10	`import blobfile`
	11
	12	`import tiktoken`
	13
	14
	15	`def benchmark_batch(documents: list[str]) -> None:`
	16	`num_threads = int(os.environ["RAYON_NUM_THREADS"])`
	17	`num_bytes = sum(map(len, map(str.encode, documents)))`
	18	`print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")`
	19
	20	`enc = tiktoken.get_encoding("gpt2")`
	21	`enc.encode("warmup")`
	22
	23	`start = time.perf_counter_ns()`
	24	`enc.encode_ordinary_batch(documents, num_threads=num_threads)`
	25	`end = time.perf_counter_ns()`
	26	`print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")`
	27
	28	`import transformers`
	29
	30	`hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")`
	31	`hf_enc.model_max_length = 1e30 # silence!`
	32	`hf_enc.encode("warmup")`
	33
	34	`start = time.perf_counter_ns()`
	35	`hf_enc(documents)`
	36	`end = time.perf_counter_ns()`
	37	`print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")`
	38
	39