openai/gpt-oss

Public

Mirrored from https://github.com/openai/gpt-oss (available)

Code · Commits · Issues · Pull requests · Actions · Insights · Security
main

Branches

Tags

  • No tags available.
0 Branches · 0 Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

compatibility-test/runCase.ts

331 lines · mode: code

1import {
2 Agent,
3 Runner,
4 OpenAIResponsesModel,
5 OpenAIChatCompletionsModel,
6 RunResult,
7 StreamedRunResult,
8 FunctionTool,
9 setTracingDisabled,
10} from "@openai/agents";
11import { Ajv } from "ajv";
12import { OpenAI } from "openai";
13import { PROVIDERS } from "./providers";
14import { TOOLS_MAP } from "./tools";
15
// Module-wide JSON-schema validator; testToolCall compiles each tool's
// parameter schema with it to validate the arguments a model produced.
const ajv = new Ajv();

// Switch off tracing in the agents SDK for everything run from this module.
setTracingDisabled(true);
19
/**
 * One compatibility test case: which tool the model should call, the prompt
 * that should trigger it, and the arguments the call is expected to carry.
 */
export type Case = {
  /** Name of the tool under test; used as the agent name and as the TOOLS_MAP key. */
  tool_name: string;
  /** Input handed to the runner for this case. */
  input: string;
  /** JSON-encoded value the tool-call arguments are compared against. */
  expected_arguments: string;
  /** Optional instructions passed to the agent. */
  instructions?: string;
};
26
/**
 * Result of running one case against one API type of a provider.
 */
export type RunCaseSummary = {
  /** API type this summary belongs to (compared against "chat" and "responses" in this file). */
  apiType: string;
  /** Overall verdict: tool call and response checks passed, plus event checks when streaming. */
  success: boolean;
  /** Whether the raw provider response passed the reasoning-shape checks. */
  validResponse: boolean;
  /** Whether the streamed events passed their checks. */
  validEvents?: boolean;
  /** Merged detail flags from the response and event checks. */
  details: Record<string, any>;
  /** `providerData` of every raw response collected during the run. */
  history: any[];
  /** Whether the tool-call check passed. */
  successToolCall: boolean;
  /** Detail flags and diagnostics from the tool-call check. */
  toolCallingDetails: Record<string, any>;
};
38
/**
 * Runs one test case against every API type configured for a provider.
 *
 * For each API type a fresh Runner is created, the agent is run (optionally
 * streaming), and three checks are applied: the tool call, the shape of the
 * raw response, and — only when streaming — the streamed model events.
 *
 * @param provider - Key into PROVIDERS selecting the provider configuration.
 * @param caseData - The case to execute; its `tool_name` must exist in TOOLS_MAP.
 * @param options - `maxTurns` is forwarded to the runner, `streaming` selects
 *   the streaming code path, `strict` makes an argument mismatch fail the
 *   tool-call check.
 * @returns One summary per entry of the provider's `apiType` list, in order.
 * @throws Error when the provider or the case's tool is not configured.
 */
export async function runCase(
  provider: string,
  caseData: Case,
  {
    maxTurns,
    streaming,
    strict,
  }: { maxTurns: number; streaming: boolean; strict: boolean }
): Promise<RunCaseSummary[]> {
  const config = PROVIDERS[provider];
  if (!config) {
    throw new Error(
      `Provider ${provider} not found. Valid providers are: ${Object.keys(
        PROVIDERS
      ).join(", ")}`
    );
  }

  // Fail fast with a readable message rather than handing `undefined` to the
  // Agent as a tool (mirrors the provider lookup above).
  const tool = TOOLS_MAP[caseData.tool_name];
  if (!tool) {
    throw new Error(
      `Tool ${caseData.tool_name} not found. Valid tools are: ${Object.keys(
        TOOLS_MAP
      ).join(", ")}`
    );
  }

  const agent = new Agent({
    name: caseData.tool_name,
    instructions: caseData.instructions,
    tools: [tool],
  });

  const client = new OpenAI({
    apiKey: config.apiKey,
    baseURL: config.apiBaseUrl,
  });

  const summaries: RunCaseSummary[] = [];

  for (const apiType of config.apiType) {
    // Anything other than "responses" is served through Chat Completions.
    const runner = new Runner({
      model:
        apiType === "responses"
          ? new OpenAIResponsesModel(client, config.modelName)
          : new OpenAIChatCompletionsModel(client, config.modelName),
      modelSettings: {
        providerData: config.providerDetails ?? {},
      },
    });

    let result: RunResult<any, any> | StreamedRunResult<any, any>;
    let streamedEvents: any[] | undefined = undefined;
    if (streaming) {
      result = await runner.run(agent, caseData.input, {
        stream: streaming,
        maxTurns: maxTurns,
      });
      if (result instanceof StreamedRunResult) {
        // Keep only the provider's own events: the `event` payload of
        // raw_model_stream_event items whose data is of type "model".
        streamedEvents = [];
        for await (const event of result) {
          if (event.type === "raw_model_stream_event") {
            if (event.data.type === "model") {
              streamedEvents.push(event.data.event);
            }
          }
        }
        await result.completed;
      }
    } else {
      result = await runner.run(agent, caseData.input, {
        maxTurns: maxTurns,
      });
    }

    const { success: successToolCall, details: toolCallingDetails } =
      testToolCall(apiType, caseData, result, strict);

    const { validResponse, details } = testOutputData(
      apiType,
      result.rawResponses,
      streaming
    );

    // When no events were collected, check an empty list so the event test
    // reports a failure instead of iterating over `undefined`.
    const { validEvents, details: eventsDetails } = streaming
      ? testEvents(apiType, streamedEvents ?? [])
      : { validEvents: true, details: {} };

    let success = successToolCall && validResponse;
    if (streaming) {
      success = success && validEvents;
    }

    const summary: RunCaseSummary = {
      apiType,
      success,
      validResponse,
      validEvents,
      details: {
        ...details,
        ...eventsDetails,
      },
      history: result.rawResponses.map((entry) => entry.providerData),
      successToolCall,
      toolCallingDetails,
    };

    summaries.push(summary);
  }

  return summaries;
}
142
/**
 * Checks whether the run called the expected tool with schema-valid (and,
 * in strict mode, exactly matching) arguments.
 *
 * Only the first function call whose name equals `caseData.tool_name` is
 * inspected; later calls are ignored.
 *
 * @param apiType - API type being tested (currently unused by the check).
 * @param caseData - Case providing the tool name and the expected arguments (JSON text).
 * @param result - Run result whose `newItems` are searched for the tool call.
 * @param strict - When true, the arguments must deep-equal the expected ones.
 * @returns `success` plus a `details` record with the individual flags and,
 *   on mismatch or unparsable arguments, diagnostic values.
 */
function testToolCall(
  apiType: string,
  caseData: { tool_name: string; expected_arguments: string },
  result: { newItems: any[] },
  strict: boolean
) {
  const details: Record<string, any> = {};

  const toolCall = result.newItems.find(
    (item) =>
      item.type === "tool_call_item" &&
      item.rawItem.type === "function_call" &&
      item.rawItem.name === caseData.tool_name
  );

  if (toolCall) {
    // The arguments are model output: malformed JSON is a failed check to be
    // reported, not an exception that aborts the whole run.
    let parsedArguments: unknown;
    let parseError: string | undefined;
    try {
      parsedArguments = JSON.parse(toolCall.rawItem.arguments);
    } catch (error) {
      parseError = error instanceof Error ? error.message : String(error);
    }

    if (parseError !== undefined) {
      details.calledToolWithRightSchema = false;
      details.calledToolAtLeastOnce = true;
      details.argumentsParseError = parseError;
    } else {
      const validate = ajv.compile(
        (TOOLS_MAP[caseData.tool_name] as FunctionTool).parameters
      );
      details.calledToolWithRightSchema = validate(parsedArguments);
      details.calledToolAtLeastOnce = true;

      if (details.calledToolWithRightSchema) {
        const expectedArguments = JSON.parse(caseData.expected_arguments);
        details.calledToolWithRightArguments = deepEqual(
          parsedArguments,
          expectedArguments
        );
        if (!details.calledToolWithRightArguments) {
          details.warning = `Tool call with wrong arguments but correct schema. Check logs for full details. Not failing this test. Parsed: ${JSON.stringify(
            parsedArguments
          )} Expected: ${JSON.stringify(expectedArguments)}`;
          details.actualArguments = parsedArguments;
          details.expectedArguments = expectedArguments;
        }
      }
    }
  }

  return {
    success:
      !!details.calledToolAtLeastOnce &&
      !!details.calledToolWithRightSchema &&
      (!strict || !!details.calledToolWithRightArguments),
    details,
  };
}
192
/**
 * Checks that the streamed model events contain reasoning output.
 *
 * This is a presence check only; it does not reconstruct the final response
 * from the events or compare it with the completed response.
 *
 * - "chat": at least one chunk must carry a non-empty string in
 *   `choices[0].delta.reasoning`.
 * - "responses": both a `response.reasoning_text.delta` and a
 *   `response.reasoning_text.done` event must be present.
 *
 * Events may be passed either as raw provider events or still wrapped in a
 * `raw_model_stream_event` envelope whose data is of type "model".
 *
 * @param apiType - "chat" or "responses"; any other value yields `validEvents: false`.
 * @param events - Collected stream events; `undefined` is treated as empty.
 * @returns `validEvents` and the individual flags in `details`.
 */
function testEvents(
  apiType: string,
  events: any[] | undefined
): { validEvents: boolean; details: Record<string, boolean> } {
  const details: Record<string, boolean> = {};
  let validEvents = false;
  const collected = events ?? [];

  if (apiType === "chat") {
    // Chunks without choices (or without a delta) simply do not count.
    const hasReasoningDeltas = collected.some((event) => {
      const reasoning = event?.choices?.[0]?.delta?.reasoning;
      return typeof reasoning === "string" && reasoning.length > 0;
    });
    details.hasReasoningDeltas = hasReasoningDeltas;
    validEvents = hasReasoningDeltas;
  }

  if (apiType === "responses") {
    let hasReasoningDeltaEvents = false;
    let hasReasoningDoneEvents = false;

    for (const event of collected) {
      // Unwrap the SDK envelope when present; envelopes whose data is not a
      // "model" event carry no provider event and are skipped.
      let modelEvent = event;
      if (event?.type === "raw_model_stream_event") {
        modelEvent = event.data?.type === "model" ? event.data.event : undefined;
      }

      if (modelEvent?.type === "response.reasoning_text.delta") {
        hasReasoningDeltaEvents = true;
      }
      if (modelEvent?.type === "response.reasoning_text.done") {
        hasReasoningDoneEvents = true;
      }
    }

    details.hasReasoningDeltaEvents = hasReasoningDeltaEvents;
    details.hasReasoningDoneEvents = hasReasoningDoneEvents;
    validEvents = hasReasoningDeltaEvents && hasReasoningDoneEvents;
  }

  return {
    validEvents,
    details,
  };
}
240
/**
 * Checks that the raw provider responses expose reasoning in the expected shape.
 *
 * - "chat": across all responses, at least one assistant message without a
 *   refusal must carry a non-empty string `reasoning`. `details` records
 *   whether a string `reasoning` / `reasoning_content` field was seen at all.
 *   When streaming and a response has no `providerData`, the check is skipped
 *   and reported as valid.
 * - "responses": only the first raw response is inspected. Each output item of
 *   type "reasoning" must have a non-empty `content` array whose entries are
 *   all of type "reasoning_text" with non-empty text; if several reasoning
 *   items exist, the last one determines the result.
 *
 * Responses that lack the expected structure count as not valid instead of
 * throwing.
 *
 * @param apiType - "chat" or "responses"; any other value yields `validResponse: false`.
 * @param rawResponses - Raw responses of the run, each with a `providerData` payload.
 * @param streaming - Whether the run was streamed.
 * @returns `validResponse` and the individual flags in `details`.
 */
function testOutputData(
  apiType: string,
  rawResponses: any[],
  streaming: boolean
): { validResponse: boolean; details: Record<string, boolean> } {
  const details: Record<string, boolean> = {};
  let validResponse = false;

  if (apiType === "chat") {
    for (const response of rawResponses) {
      if (streaming && !response.providerData) {
        // with Chat Completions we don't have a final response object that's native so we skip this test
        return {
          validResponse: true,
          details: {
            skippedBecauseStreaming: true,
          },
        };
      }

      // providerData is the actual HTTP response from the provider.
      const message = response.providerData?.choices?.[0]?.message;
      if (!message) {
        continue;
      }

      if (message.role === "assistant" && !message.refusal) {
        // Not every response has to carry reasoning, so the flags accumulate
        // across responses. The length check below must look at THIS message
        // only: the accumulated flag may be true while this message has no
        // `reasoning` at all.
        const hasReasoning = typeof message.reasoning === "string";

        details.hasReasoningField = details.hasReasoningField || hasReasoning;
        details.hasReasoningContentField =
          details.hasReasoningContentField ||
          typeof message.reasoning_content === "string";

        validResponse =
          validResponse || (hasReasoning && message.reasoning.length > 0);
      }
    }
  } else if (apiType === "responses") {
    // providerData is the actual HTTP response from the provider.
    const output = rawResponses[0]?.providerData?.output;
    const items: any[] = Array.isArray(output) ? output : [];

    for (const item of items) {
      if (item?.type !== "reasoning") {
        continue;
      }

      const content = item.content;
      const isArray = Array.isArray(content);

      details.hasReasoningContentArray = isArray;
      details.hasReasoningContentArrayLength = isArray && content.length > 0;
      details.hasReasoningContentArrayItemType =
        isArray && content.every((part) => part?.type === "reasoning_text");
      details.hasReasoningContentArrayItemText =
        isArray &&
        content.every(
          (part) => typeof part?.text === "string" && part.text.length > 0
        );

      validResponse =
        details.hasReasoningContentArray &&
        details.hasReasoningContentArrayLength &&
        details.hasReasoningContentArrayItemType &&
        details.hasReasoningContentArrayItemText;
    }
  }

  return {
    validResponse,
    details,
  };
}
307
/**
 * Structural equality for JSON-like values.
 *
 * Primitives are compared with `===`. Arrays must have the same length and
 * pairwise-equal elements. Other objects must have the same number of own
 * enumerable keys, and every key of `a` must be an own property of `b` with
 * a deep-equal value. An array never equals a non-array object, and `null`
 * only equals `null`.
 *
 * @param a - First value.
 * @param b - Second value.
 * @returns `true` when both values are structurally equal.
 */
function deepEqual(a: any, b: any): boolean {
  if (a === b) return true;

  // Past this point only two non-null objects can still be equal.
  if (
    typeof a !== "object" ||
    typeof b !== "object" ||
    a === null ||
    b === null
  ) {
    return false;
  }

  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) return false;

  if (aIsArray) {
    if (a.length !== b.length) return false;
    for (let i = 0; i < a.length; i++) {
      if (!deepEqual(a[i], b[i])) return false;
    }
    return true;
  }

  const aKeys = Object.keys(a);
  if (aKeys.length !== Object.keys(b).length) return false;

  for (const key of aKeys) {
    // Call through Object.prototype: the compared data may itself contain a
    // key named "hasOwnProperty", or have no prototype at all.
    if (!Object.prototype.hasOwnProperty.call(b, key)) return false;
    if (!deepEqual(a[key], b[key])) return false;
  }
  return true;
}