openai/tiktoken

Public

mirrored from https://github.com/openai/tiktoken · Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
0.7.0

Branches

Tags

  • No tags available.
0 Branches · 0 Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

scripts/benchmark.py

39lines · modeblame

a1a9f168Shantanu Jain3 years ago1import base64
2import functools
3import gzip
4import json
5import os
6import random
7import time
8from typing import Any, cast
9
10import blobfile
11
12import tiktoken
13
14
def benchmark_batch(documents: list[str]) -> None:
    """Print batch-encoding throughput for tiktoken and a Hugging Face tokenizer.

    Both tokenizers use the "gpt2" vocabulary and are timed on a single pass
    over ``documents``. Throughput is reported in encoded bytes per second.

    Args:
        documents: The texts to encode as one batch.

    Raises:
        KeyError: If the ``RAYON_NUM_THREADS`` environment variable is unset.
        ValueError: If ``RAYON_NUM_THREADS`` is not an integer literal.
    """
    # The thread count is taken from the environment rather than a parameter.
    num_threads = int(os.environ["RAYON_NUM_THREADS"])
    # Size of the workload in encoded bytes (str.encode defaults to UTF-8).
    num_bytes = sum(map(len, map(str.encode, documents)))
    print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")

    enc = tiktoken.get_encoding("gpt2")
    # Encode once before timing so one-off setup cost is not measured.
    enc.encode("warmup")

    start = time.perf_counter_ns()
    enc.encode_ordinary_batch(documents, num_threads=num_threads)
    end = time.perf_counter_ns()
    # Elapsed time is in nanoseconds; scale by 1e9 to get bytes per second.
    print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")

    # Imported here, not at module level, so it is only loaded when this
    # function actually runs.
    import transformers

    # cast(Any, ...) turns off static type checking for the attribute access.
    hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
    # NOTE(review): presumably suppresses the tokenizer's warning about inputs
    # longer than the model's maximum length — confirm.
    hf_enc.model_max_length = 1e30  # silence!
    hf_enc.encode("warmup")

    start = time.perf_counter_ns()
    hf_enc(documents)
    end = time.perf_counter_ns()
    print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")
38
39