openai/gpt-oss
Public · mirrored from https://github.com/openai/gpt-oss · Available
compatibility-test/runCase.ts
331 lines · mode: code
| 1 | import { |
| 2 | Agent, |
| 3 | Runner, |
| 4 | OpenAIResponsesModel, |
| 5 | OpenAIChatCompletionsModel, |
| 6 | RunResult, |
| 7 | StreamedRunResult, |
| 8 | FunctionTool, |
| 9 | setTracingDisabled, |
| 10 | } from "@openai/agents"; |
| 11 | import { Ajv } from "ajv"; |
| 12 | import { OpenAI } from "openai"; |
| 13 | import { PROVIDERS } from "./providers"; |
| 14 | import { TOOLS_MAP } from "./tools"; |
| 15 | |
// Turn off Agents SDK tracing for every run started from this module.
setTracingDisabled(true);

// Shared JSON-schema validator; used to check tool-call arguments against the
// tool's declared parameter schema.
const ajv = new Ajv();
| 19 | |
/**
 * A single compatibility test case.
 */
export type Case = {
  /** Key of the tool in `TOOLS_MAP`; also used as the agent's name. */
  tool_name: string;
  /** Input handed to the runner for this case. */
  input: string;
  /** JSON string holding the arguments the tool call is expected to carry. */
  expected_arguments: string;
  /** Optional instructions given to the agent. */
  instructions?: string;
};
| 26 | |
/**
 * Outcome of running one case against one API type of a provider.
 */
export type RunCaseSummary = {
  /** API type the case was run against (one entry of the provider's `apiType` list). */
  apiType: string;
  /** Overall verdict: tool call and response checks passed (and event checks when streaming). */
  success: boolean;
  /** Whether the tool-call check passed. */
  successToolCall: boolean;
  /** Per-check flags and diagnostics produced by the tool-call check. */
  toolCallingDetails: Record<string, any>;
  /** Whether the raw response payload check passed. */
  validResponse: boolean;
  /** Whether the streamed events check passed. */
  validEvents?: boolean;
  /** Merged per-check flags from the response and event checks. */
  details: Record<string, any>;
  /** Provider payload of every raw response collected during the run. */
  history: any[];
};
| 38 | |
/**
 * Runs a single test case against every API type configured for a provider.
 *
 * For each API type an agent with the case's tool is run (optionally in
 * streaming mode) and the result is checked for a correct tool call, a valid
 * raw response payload and, when streaming, the expected stream events.
 *
 * @param provider Key into `PROVIDERS`.
 * @param caseData The case to run.
 * @param options `maxTurns` is forwarded to the runner, `streaming` selects a
 *   streamed run, and `strict` makes wrong tool arguments a failure.
 * @returns One summary per API type of the provider.
 * @throws Error when the provider or the case's tool is not registered.
 */
export async function runCase(
  provider: string,
  caseData: Case,
  {
    maxTurns,
    streaming,
    strict,
  }: { maxTurns: number; streaming: boolean; strict: boolean }
): Promise<RunCaseSummary[]> {
  const config = PROVIDERS[provider];
  if (!config) {
    throw new Error(
      `Provider ${provider} not found. Valid providers are: ${Object.keys(
        PROVIDERS
      ).join(", ")}`
    );
  }

  // Fail early with a clear message instead of handing `undefined` to the agent.
  const tool = TOOLS_MAP[caseData.tool_name];
  if (!tool) {
    throw new Error(
      `Tool ${caseData.tool_name} not found. Valid tools are: ${Object.keys(
        TOOLS_MAP
      ).join(", ")}`
    );
  }

  const agent = new Agent({
    name: caseData.tool_name,
    instructions: caseData.instructions,
    tools: [tool],
  });

  const client = new OpenAI({
    apiKey: config.apiKey,
    baseURL: config.apiBaseUrl,
  });

  const summaries: RunCaseSummary[] = [];

  for (const apiType of config.apiType) {
    const model =
      apiType === "responses"
        ? new OpenAIResponsesModel(client, config.modelName)
        : new OpenAIChatCompletionsModel(client, config.modelName);
    const runner = new Runner({
      model,
      modelSettings: {
        providerData: config.providerDetails ?? {},
      },
    });

    let result: RunResult<any, any> | StreamedRunResult<any, any>;
    let streamedEvents: any[] | undefined = undefined;
    if (streaming) {
      result = await runner.run(agent, caseData.input, {
        stream: streaming,
        maxTurns: maxTurns,
      });
      if (result instanceof StreamedRunResult) {
        // Keep only the provider events carried by raw model stream events.
        streamedEvents = [];
        for await (const event of result) {
          if (
            event.type === "raw_model_stream_event" &&
            event.data.type === "model"
          ) {
            streamedEvents.push(event.data.event);
          }
        }
        await result.completed;
      }
    } else {
      result = await runner.run(agent, caseData.input, {
        maxTurns: maxTurns,
      });
    }

    const { success: successToolCall, details: toolCallingDetails } =
      testToolCall(apiType, caseData, result, strict);

    const { validResponse, details } = testOutputData(
      apiType,
      result.rawResponses,
      streaming
    );

    // An empty list (no events collected) is reported as invalid rather than crashing.
    const { validEvents, details: eventsDetails } = streaming
      ? testEvents(apiType, streamedEvents ?? [])
      : { validEvents: true, details: {} };

    // Event checks only count towards the verdict for streamed runs.
    const success =
      successToolCall && validResponse && (!streaming || validEvents);

    summaries.push({
      apiType,
      success,
      validResponse,
      validEvents,
      details: {
        ...details,
        ...eventsDetails,
      },
      history: result.rawResponses.map((entry) => entry.providerData),
      successToolCall,
      toolCallingDetails,
    });
  }

  return summaries;
}
| 142 | |
/**
 * Checks whether the run called the case's tool with acceptable arguments.
 *
 * Only the first function call to the expected tool is inspected. Its
 * arguments must be valid JSON and satisfy the tool's parameter schema; in
 * strict mode they must also deep-equal the case's expected arguments.
 * Arguments that are not valid JSON are reported as a failed check instead of
 * throwing.
 *
 * @param apiType API type of the run (currently unused by this check).
 * @param caseData Case providing the tool name and expected arguments.
 * @param result Run result whose `newItems` are searched for the tool call.
 * @param strict When true, arguments must match the expected ones exactly.
 * @returns `success` plus the individual check flags and diagnostics.
 */
function testToolCall(
  apiType: string,
  caseData: { tool_name: string; expected_arguments: string },
  result: { newItems: any[] },
  strict: boolean
) {
  const details: Record<string, any> = {};

  // for this test for now we only care if the tool is called at least once
  const call = result.newItems.find(
    (item) =>
      item.type === "tool_call_item" &&
      item.rawItem.type === "function_call" &&
      item.rawItem.name === caseData.tool_name
  );

  if (call) {
    const rawArguments: string = call.rawItem.arguments;
    let parsedArguments: unknown;
    let argumentsAreJson = true;
    try {
      parsedArguments = JSON.parse(rawArguments);
    } catch {
      argumentsAreJson = false;
    }

    if (!argumentsAreJson) {
      // The model produced unparsable arguments: a failed check, not a crash.
      details.calledToolWithRightSchema = false;
      details.calledToolAtLeastOnce = true;
      details.error = `Tool call arguments are not valid JSON: ${rawArguments}`;
    } else {
      const validate = ajv.compile(
        (TOOLS_MAP[caseData.tool_name] as FunctionTool).parameters
      );
      details.calledToolWithRightSchema = validate(parsedArguments);
      details.calledToolAtLeastOnce = true;

      if (details.calledToolWithRightSchema) {
        const expectedArguments = JSON.parse(caseData.expected_arguments);
        details.calledToolWithRightArguments = deepEqual(
          parsedArguments,
          expectedArguments
        );
        if (!details.calledToolWithRightArguments) {
          details.warning = `Tool call with wrong arguments but correct schema. Check logs for full details. Not failing this test. Parsed: ${JSON.stringify(
            parsedArguments
          )} Expected: ${JSON.stringify(expectedArguments)}`;
          details.actualArguments = parsedArguments;
          details.expectedArguments = expectedArguments;
        }
      }
    }
  }

  return {
    success:
      !!details.calledToolAtLeastOnce &&
      !!details.calledToolWithRightSchema &&
      (!strict || !!details.calledToolWithRightArguments),
    details,
  };
}
| 192 | |
/**
 * Checks that a streamed run emitted the reasoning events expected for the
 * given API type.
 *
 * - "chat": at least one chunk carries a non-empty `choices[0].delta.reasoning`
 *   string.
 * - "responses": both a `response.reasoning_text.delta` and a
 *   `response.reasoning_text.done` event were seen.
 *
 * Events may be given either as provider events or wrapped in
 * `raw_model_stream_event` objects; wrappers of data type "model" are
 * unwrapped before checking.
 *
 * @param apiType "chat" or "responses"; any other value yields an invalid result.
 * @param events Collected stream events; `undefined` is treated as no events.
 * @returns `validEvents` plus the individual check flags.
 */
function testEvents(
  apiType: string,
  events: any[] | undefined
): { validEvents: boolean; details: Record<string, boolean> } {
  // In an ideal world we would check all the events to follow and reconstruct the final response
  // and then compare it against the final response in the response.completed event
  // for now we just check that certain events are present

  const details: Record<string, boolean> = {};
  let validEvents = false;

  // Normalize to provider events so both wrapped and unwrapped input work.
  const providerEvents: any[] = (events ?? []).map((event) =>
    event?.type === "raw_model_stream_event" && event.data?.type === "model"
      ? event.data.event
      : event
  );

  if (apiType === "chat") {
    // Some chunks have an empty `choices` array, so every access is guarded.
    const hasReasoningDeltas = providerEvents.some((event) => {
      const reasoning = event?.choices?.[0]?.delta?.reasoning;
      return typeof reasoning === "string" && reasoning.length > 0;
    });
    details.hasReasoningDeltas = hasReasoningDeltas;
    validEvents = hasReasoningDeltas;
  }

  if (apiType === "responses") {
    const seenTypes = new Set(providerEvents.map((event) => event?.type));
    const hasReasoningDeltaEvents = seenTypes.has(
      "response.reasoning_text.delta"
    );
    const hasReasoningDoneEvents = seenTypes.has(
      "response.reasoning_text.done"
    );

    details.hasReasoningDeltaEvents = hasReasoningDeltaEvents;
    details.hasReasoningDoneEvents = hasReasoningDoneEvents;
    validEvents = hasReasoningDeltaEvents && hasReasoningDoneEvents;
  }

  return {
    validEvents,
    details,
  };
}
| 240 | |
/**
 * Checks that the raw provider responses expose reasoning in the expected shape.
 *
 * - "chat": valid when at least one assistant message (without refusal) has a
 *   non-empty string `reasoning` field. A streamed response without provider
 *   data skips the check and counts as valid.
 * - "responses": inspects the `output` of the first response; valid when the
 *   last reasoning item found has a non-empty `content` array whose entries
 *   are all `reasoning_text` items with non-empty text.
 *
 * Missing or malformed fields are reported as a failed check instead of
 * throwing.
 *
 * @param apiType "chat" or "responses"; any other value yields an invalid result.
 * @param rawResponses Raw responses of the run, each carrying `providerData`.
 * @param streaming Whether the run was streamed.
 * @returns `validResponse` plus the individual check flags.
 */
function testOutputData(
  apiType: string,
  rawResponses: any[],
  streaming: boolean
): { validResponse: boolean; details: Record<string, boolean> } {
  const details: Record<string, boolean> = {};
  let validResponse = false;

  if (apiType === "chat") {
    for (const response of rawResponses ?? []) {
      if (streaming && !response.providerData) {
        // with Chat Completions we don't have a final response object that's native so we skip this test
        return {
          validResponse: true,
          details: {
            skippedBecauseStreaming: true,
          },
        };
      }

      // this is the actual HTTP response from the provider
      // Since it's not guaranteed that every response has a reasoning field, we check if it's present
      // at least once across all responses
      const message = response.providerData?.choices?.[0]?.message;
      if (!message || message.role !== "assistant" || message.refusal) {
        continue;
      }

      // Evaluate this message on its own so an earlier hit can never lead to
      // reading `.length` of a missing `reasoning` field.
      const hasReasoning = typeof message.reasoning === "string";
      const hasReasoningContent = typeof message.reasoning_content === "string";

      details.hasReasoningField = !!details.hasReasoningField || hasReasoning;
      details.hasReasoningContentField =
        !!details.hasReasoningContentField || hasReasoningContent;

      validResponse =
        validResponse || (hasReasoning && message.reasoning.length > 0);
    }
  } else if (apiType === "responses") {
    // this is the actual HTTP response from the provider
    const output = rawResponses?.[0]?.providerData?.output;
    for (const item of Array.isArray(output) ? output : []) {
      if (item?.type !== "reasoning") {
        continue;
      }

      // A reasoning item without a `content` array fails the check.
      const hasContentArray = Array.isArray(item.content);
      const content: any[] = hasContentArray ? item.content : [];
      const hasEntries = content.length > 0;
      const allReasoningText = content.every(
        (part) => part?.type === "reasoning_text"
      );
      const allHaveText = content.every(
        (part) => typeof part?.text === "string" && part.text.length > 0
      );

      details.hasReasoningContentArray = hasContentArray;
      details.hasReasoningContentArrayLength = hasEntries;
      details.hasReasoningContentArrayItemType = allReasoningText;
      details.hasReasoningContentArrayItemText = allHaveText;

      validResponse =
        hasContentArray && hasEntries && allReasoningText && allHaveText;
    }
  }

  return {
    validResponse,
    details,
  };
}
| 307 | |
/**
 * Structural equality for JSON-like values (primitives, arrays, plain objects).
 *
 * Arrays are compared element by element in order; objects are compared by
 * their own enumerable keys regardless of key order.
 *
 * @param a First value.
 * @param b Second value.
 * @returns True when both values have the same structure and contents.
 */
function deepEqual(a: any, b: any): boolean {
  if (a === b) return true;
  if (typeof a !== typeof b) return false;
  // From here on only two non-null objects can still be equal.
  if (!a || !b || typeof a !== "object") return false;
  if (Array.isArray(a) !== Array.isArray(b)) return false;

  if (Array.isArray(a)) {
    if (a.length !== b.length) return false;
    for (let i = 0; i < a.length; i++) {
      if (!deepEqual(a[i], b[i])) return false;
    }
    return true;
  }

  const aKeys = Object.keys(a);
  if (aKeys.length !== Object.keys(b).length) return false;
  for (const key of aKeys) {
    // Call through the prototype: the compared data may itself contain a key
    // named "hasOwnProperty" or have no prototype at all.
    if (!Object.prototype.hasOwnProperty.call(b, key)) return false;
    if (!deepEqual(a[key], b[key])) return false;
  }
  return true;
}