openai/tiktoken

Public

mirrored from https://github.com/openai/tiktoken Available

CodeCommitsIssuesPull requestsActionsInsightsSecurity
0.8.0

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

scripts/benchmark.py

39lines · modecode

1import base64
2import functools
3import gzip
4import json
5import os
6import random
7import time
8from typing import Any, cast
9
10import blobfile
11
12import tiktoken
13
14
def benchmark_batch(documents: list[str]) -> None:
    """Time batch encoding of *documents* with tiktoken and Hugging Face.

    Prints the thread count and total encoded byte size first, then one
    throughput line (bytes per second) for tiktoken's
    ``encode_ordinary_batch`` and one for ``GPT2TokenizerFast``.

    The thread count is read from the ``RAYON_NUM_THREADS`` environment
    variable and is passed explicitly only to the tiktoken call.

    Args:
        documents: The texts to encode in a single batch.

    Raises:
        KeyError: If ``RAYON_NUM_THREADS`` is not set.
        ValueError: If ``RAYON_NUM_THREADS`` is not an integer string.
    """
    thread_count = int(os.environ["RAYON_NUM_THREADS"])
    # Throughput is reported against the encoded byte size of the input.
    total_bytes = sum(len(doc.encode()) for doc in documents)
    print(f"num_threads: {thread_count}, num_bytes: {total_bytes}")

    # --- tiktoken ---
    tik_encoder = tiktoken.get_encoding("gpt2")
    # One throwaway call so first-use work stays out of the timed region.
    tik_encoder.encode("warmup")

    tik_began = time.perf_counter_ns()
    tik_encoder.encode_ordinary_batch(documents, num_threads=thread_count)
    tik_elapsed_ns = time.perf_counter_ns() - tik_began
    print(f"tiktoken \t{total_bytes / tik_elapsed_ns * 1e9} bytes / s")

    # --- Hugging Face ---
    # Imported here rather than at module level, so the import only runs
    # once the tiktoken measurement above has been printed.
    import transformers

    # cast(Any, ...) turns off static type checking for the attribute access.
    hf_module = cast(Any, transformers)
    hf_tokenizer = hf_module.GPT2TokenizerFast.from_pretrained("gpt2")
    # NOTE(review): presumably silences the tokenizer's over-length warning
    # (original comment: "silence!") -- confirm.
    hf_tokenizer.model_max_length = 1e30
    hf_tokenizer.encode("warmup")

    hf_began = time.perf_counter_ns()
    hf_tokenizer(documents)
    hf_elapsed_ns = time.perf_counter_ns() - hf_began
    print(f"huggingface \t{total_bytes / hf_elapsed_ns * 1e9} bytes / s")
38
39
40