microsoft/hve-core

Public

mirrored fromhttps://github.com/microsoft/hve-coreAvailable

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

feat/1873-devcontainer

Find a branch or tag

Branches

feat/1873-devcontainer

Clone

HTTPS

Download ZIP

hve-core/scripts/evals

scripts/evals/Invoke-BaselineEquivalence.ps1

541lines · modecode

Raw Download

Latest commit unavailable.

unknown

1	`#!/usr/bin/env pwsh`
2	`# Copyright (c) Microsoft Corporation.`
3	`# SPDX-License-Identifier: MIT`
4
5	`#Requires -Version 7.0`
6
7	`<#`
8	`.SYNOPSIS`
9	`Runs the Vally baseline-vs-customized equivalence suite for a target hve-core agent.`
10
11	`.DESCRIPTION`
12	Drives the `evals/baseline-equivalence/` Vally suite end-to-end. Resolves the target
13	agent's frontmatter `model:` hint, selects a model tier (PR or nightly), invokes
14	`vally eval` once per environment (`baseline` and `task-researcher-context`), invokes
15	`vally compare` to produce a pairwise verdict, and writes a machine-readable summary
16	to `logs/baseline-equivalence-summary.json`.
17
18	`Exit policy by tier:`
19	- PR tier always exits 0. Equivalence failures surface as `verdict: warn` in the
20	`summary JSON. Advisory only.`
21	- Nightly tier exits non-zero (1) when `verdict == fail`. Source of truth.
22
23	`-WhatIf` (dry-run) mode prints the planned `vally` command lines, emits a summary
24	JSON populated with zeros and `verdict: dry-run`, and exits 0 without invoking any
25	`SDK or external command.`
26
27	`.PARAMETER Agent`
28	The target agent slug, matching the basename of an `.agent.md` file under
29	`.github/agents/`. Defaults to `task-researcher`.
30
31	`.PARAMETER Tier`
32	The model tier to exercise. `pr` runs a single primary model; `nightly` runs a model
33	array for broader coverage. Defaults to `pr`.
34
35	`.PARAMETER StimulusFilter`
36	Optional regular expression filtering stimulus names. Defaults to `.*` (all stimuli).
37
38	`.PARAMETER RepoRoot`
39	Repository root. Defaults to the result of `git rev-parse --show-toplevel`, falling
40	back to the directory two levels above `$PSScriptRoot`.
41
42	`.PARAMETER OutputPath`
43	Path to the summary JSON. Defaults to `<RepoRoot>/logs/baseline-equivalence-summary.json`.
44
45	`.EXAMPLE`
46	`./Invoke-BaselineEquivalence.ps1 -Agent task-researcher -Tier pr -WhatIf`
47
48	`Prints the planned commands and writes a dry-run summary.`
49
50	`.EXAMPLE`
51	`npm run eval:equivalence -- -Agent task-researcher -Tier pr`
52
53	`Runs the PR-tier flow via the npm wrapper.`
54
55	`.NOTES`
56	`Runs via: npm run eval:equivalence`
57	`#>`
58
59	`[CmdletBinding(SupportsShouldProcess = $true)]`
60	`param(`
61	`[Parameter(Mandatory = $false)]`
62	`[ValidateNotNullOrEmpty()]`
63	`[string]$Agent = 'task-researcher',`
64
65	`[Parameter(Mandatory = $false)]`
66	`[ValidateSet('pr', 'nightly')]`
67	`[string]$Tier = 'pr',`
68
69	`[Parameter(Mandatory = $false)]`
70	`[string]$StimulusFilter = '.*',`
71
72	`[Parameter(Mandatory = $false)]`
73	`[string]$RepoRoot,`
74
75	`[Parameter(Mandatory = $false)]`
76	`[string]$OutputPath`
77	`)`
78
79	`$ErrorActionPreference = 'Stop'`
80
81	`Import-Module -Name (Join-Path $PSScriptRoot 'lib/EquivalenceParsing.psm1') -Force`
82
83	`#region Helper Functions`
84
function Resolve-RepoRoot {
    <#
    .SYNOPSIS
    Resolves the repository root directory.

    .DESCRIPTION
    Returns the resolved -Hint when one is supplied. Otherwise asks git for the
    top-level directory of the current working tree and, when git is not installed
    or the call does not succeed, falls back to the directory two levels above
    $PSScriptRoot.

    .PARAMETER Hint
    Optional explicit repository root. Resolve-Path fails if it does not exist.

    .OUTPUTS
    System.String. The resolved repository root path.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [string]$Hint
    )

    if ($Hint) { return (Resolve-Path -LiteralPath $Hint).Path }

    # Probe for git before invoking it: calling a missing native command raises a
    # CommandNotFoundException, which would abort before the fallback below runs.
    if (Get-Command -Name git -CommandType Application -ErrorAction SilentlyContinue) {
        $gitRoot = & git rev-parse --show-toplevel 2>$null
        if ($LASTEXITCODE -eq 0 -and -not [string]::IsNullOrWhiteSpace($gitRoot)) {
            return $gitRoot.Trim()
        }
    }

    return (Resolve-Path -LiteralPath (Join-Path $PSScriptRoot '../..')).Path
}
101
function Resolve-AgentSurfaceSignaturePath {
    <#
    .SYNOPSIS
    Returns the path of an agent's surface-signature YAML file.

    .DESCRIPTION
    Builds <RepoRoot>/evals/baseline-equivalence/surface-signatures/<Agent>.yml and
    returns it when the file exists. Throws with a regeneration hint otherwise.

    .PARAMETER RepoRoot
    Repository root the signature path is built from.

    .PARAMETER Agent
    Agent slug used as the signature file's base name.

    .OUTPUTS
    System.String. The path of the existing signature file.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)]
        [string]$RepoRoot,
        [Parameter(Mandatory)]
        [string]$Agent
    )

    $signatureFile = Join-Path $RepoRoot "evals/baseline-equivalence/surface-signatures/$Agent.yml"
    if (Test-Path -LiteralPath $signatureFile) {
        return $signatureFile
    }

    throw "Surface signature not found for agent '$Agent' at $signatureFile. Run scripts/evals/New-AgentSurfaceSignatures.ps1 -Agent $Agent to generate."
}
118
function New-RenderedCompareSpec {
    <#
    .SYNOPSIS
    Renders compare.eval.yml with the agent's surface signature inlined.

    .DESCRIPTION
    Reads <RepoRoot>/evals/baseline-equivalence/compare.eval.yml, replaces the first
    'surface_signatures: {}' marker line with a mapping that nests the contents of the
    agent's surface-signature file under the agent slug, and writes the result to
    -OutputPath (creating the parent directory when needed).

    .PARAMETER RepoRoot
    Repository root containing the evals/baseline-equivalence tree.

    .PARAMETER Agent
    Agent slug; used as the YAML key and to locate the signature file.

    .PARAMETER OutputPath
    Destination file for the rendered spec.

    .OUTPUTS
    System.String. The -OutputPath value that was written.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)]
        [string]$RepoRoot,
        [Parameter(Mandatory)]
        [string]$Agent,
        [Parameter(Mandatory)]
        [string]$OutputPath
    )

    $sourceSpec = Join-Path $RepoRoot 'evals/baseline-equivalence/compare.eval.yml'
    if (-not (Test-Path -LiteralPath $sourceSpec)) {
        throw "Compare spec not found at $sourceSpec."
    }
    $signaturePath = Resolve-AgentSurfaceSignaturePath -RepoRoot $RepoRoot -Agent $Agent

    $specText = [System.IO.File]::ReadAllText($sourceSpec)
    $signatureText = [System.IO.File]::ReadAllText($signaturePath)

    # Signature lines sit one level below the agent key (2 spaces), i.e. at 4 spaces,
    # so the YAML parser nests them under the agent instead of beside it.
    $indentedLines = $signatureText -split "`r?`n" | ForEach-Object {
        if ([string]::IsNullOrEmpty($_)) { '' } else { '    ' + $_ }
    }
    $indented = $indentedLines -join "`n"

    $replacement = "surface_signatures:`n  ${Agent}:`n$indented"

    # Tolerate any amount of horizontal whitespace around '{}'. The lookahead leaves
    # the line terminator (LF or CRLF) in place.
    $marker = [regex]::new('(?m)^surface_signatures:[ \t]*\{\}[ \t]*(?=\r?$)')

    if (-not $marker.IsMatch($specText)) {
        throw "compare.eval.yml does not contain the 'surface_signatures: {}' marker. Update the spec per Phase 2 Step 2.5 before running the equivalence driver."
    }

    # Use the instance overload: its trailing integer is a replacement count. The
    # static Regex.Replace overloads have no count parameter, so a trailing 1 there
    # is bound as RegexOptions.IgnoreCase. A MatchEvaluator keeps '$' sequences in
    # the signature text literal.
    $evaluator = [System.Text.RegularExpressions.MatchEvaluator] { param($m) $replacement }
    $renderedText = $marker.Replace($specText, $evaluator, 1)

    if ($renderedText -eq $specText) {
        throw "Render produced an unchanged compare spec for agent '$Agent'. Ensure the 'surface_signatures: {}' marker is present in compare.eval.yml."
    }

    $outDir = Split-Path -Parent $OutputPath
    if ($outDir -and -not (Test-Path -LiteralPath $outDir)) {
        # Created even under -WhatIf; the rendered spec is written in dry-run mode too.
        New-Item -ItemType Directory -Path $outDir -Force -WhatIf:$false -Confirm:$false | Out-Null
    }
    [System.IO.File]::WriteAllText($OutputPath, $renderedText)
    return $OutputPath
}
164
function Get-AgentModelHint {
    <#
    .SYNOPSIS
    Reads the 'model:' hint from an agent definition file.

    .DESCRIPTION
    Searches <RepoRoot>/.github/agents recursively for '<Agent>.agent.md', takes the
    first file found, and returns the value of the first line of the form
    'model: <value>' with surrounding whitespace and quotes removed.

    .PARAMETER RepoRoot
    Repository root containing the .github/agents directory.

    .PARAMETER Agent
    Agent slug, i.e. the file name without the '.agent.md' suffix.

    .OUTPUTS
    System.String. The model hint, or $null when the agents directory, the agent
    file, or a 'model:' line is missing.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)]
        [string]$RepoRoot,
        [Parameter(Mandatory)]
        [string]$Agent
    )

    $agentsRoot = Join-Path $RepoRoot '.github/agents'
    if (-not (Test-Path -LiteralPath $agentsRoot)) { return $null }

    $candidate = Get-ChildItem -LiteralPath $agentsRoot -Recurse -Filter "$Agent.agent.md" -File -ErrorAction SilentlyContinue |
        Select-Object -First 1
    if (-not $candidate) { return $null }

    # Whitespace around the key, the colon, and the value is optional. The lazy
    # capture keeps trailing whitespace out of the value. -LiteralPath stops
    # wildcard characters in the file path from being expanded.
    $match = Select-String -LiteralPath $candidate.FullName -Pattern '^\s*model\s*:\s*(.+?)\s*$' -List
    if (-not $match) { return $null }

    return $match.Matches[0].Groups[1].Value.Trim().Trim('"').Trim("'")
}
187
function Resolve-ModelList {
    <#
    .SYNOPSIS
    Chooses the model names to evaluate for a tier.

    .DESCRIPTION
    The 'nightly' tier always returns a fixed three-model list and ignores -Hint.
    Any other tier returns a single-element list: the hint when one is supplied,
    otherwise the built-in default model.

    .PARAMETER Tier
    Tier name; only 'nightly' selects the multi-model list.

    .PARAMETER Hint
    Optional model name taken from the agent definition.

    .OUTPUTS
    System.String[]. The selected model names.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory)]
        [string]$Tier,
        [string]$Hint
    )

    if ($Tier -ne 'nightly') {
        # Single-model tier: prefer the agent's own hint over the default.
        if ([string]::IsNullOrEmpty($Hint)) {
            return @('claude-opus-4.7')
        }
        return @($Hint)
    }

    return @('gpt-5.5', 'claude-opus-4.6', 'claude-sonnet-latest')
}
204
function New-DryRunSummary {
    <#
    .SYNOPSIS
    Builds the summary object written in dry-run mode.

    .DESCRIPTION
    Returns an ordered dictionary that echoes the run parameters, sets every counter
    (runs, ties, aWins, bWins, invariantFailures, divergenceFailures) to zero, fixes
    the verdict to 'dry-run', and attaches the variant metadata and planned commands.

    .PARAMETER Agent
    Agent slug stored under 'agent'.

    .PARAMETER Tier
    Tier name stored under 'tier'.

    .PARAMETER Model
    Model name stored under 'model'.

    .PARAMETER StimulusFilter
    Stimulus filter stored under 'stimulusFilter'.

    .PARAMETER PlannedCommands
    Command lines stored under 'plannedCommands'.

    .PARAMETER Variants
    Optional variant metadata stored under 'variants'.

    .OUTPUTS
    An ordered dictionary with the keys listed above, in insertion order.
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)]
        [string]$Agent,
        [Parameter(Mandatory)]
        [string]$Tier,
        [Parameter(Mandatory)]
        [string]$Model,
        [Parameter(Mandatory)]
        [string]$StimulusFilter,
        [Parameter(Mandatory)]
        [string[]]$PlannedCommands,
        [hashtable]$Variants
    )

    # Echo the request first so the JSON reads top-down like the live summary.
    $summary = [ordered]@{
        agent          = $Agent
        tier           = $Tier
        model          = $Model
        stimulusFilter = $StimulusFilter
    }

    # Nothing ran, so every tally is zero.
    foreach ($counter in 'runs', 'ties', 'aWins', 'bWins', 'invariantFailures', 'divergenceFailures') {
        $summary[$counter] = 0
    }

    $summary['verdict'] = 'dry-run'
    $summary['variants'] = $Variants
    $summary['plannedCommands'] = $PlannedCommands

    return $summary
}
238
function Invoke-VallyCommand {
    <#
    .SYNOPSIS
    Runs the vally CLI and returns its exit code.

    .DESCRIPTION
    Invokes 'vally' with the given arguments, sends its standard output to the host,
    and returns $LASTEXITCODE as the only pipeline output.

    .PARAMETER Arguments
    Argument list splatted onto the vally command line.

    .OUTPUTS
    System.Int32. The exit code reported by vally.
    #>
    [CmdletBinding()]
    [OutputType([int])]
    param(
        [Parameter(Mandatory)]
        [string[]]$Arguments
    )

    # Pipe stdout to the host so it is shown but does not become part of the
    # function's output. Without this a caller's '$code = Invoke-VallyCommand ...'
    # receives an array of output lines plus the exit code, and '$code -ne 0' is
    # then truthy whenever vally printed anything.
    & vally @Arguments | Out-Host
    return $LASTEXITCODE
}
249
function Invoke-VallyCommandWithCapture {
    <#
    .SYNOPSIS
    Runs the vally CLI, echoing and capturing its combined output.

    .DESCRIPTION
    Invokes 'vally' with stderr merged into stdout while the console output encoding
    is temporarily set to UTF-8. Each captured record is converted to a string,
    echoed with Write-Host, and optionally written to a log file.

    .PARAMETER Arguments
    Argument list splatted onto the vally command line.

    .PARAMETER LogPath
    Optional log file. Its parent directory is created when missing.

    .OUTPUTS
    System.Collections.Hashtable with 'ExitCode' (vally's exit code) and 'Lines'
    (the captured output as strings).
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)]
        [string[]]$Arguments,
        [string]$LogPath
    )

    # Switch the console to UTF-8 only for the duration of the call.
    $originalEncoding = [Console]::OutputEncoding
    try {
        [Console]::OutputEncoding = [System.Text.Encoding]::UTF8
        $captured = & vally @Arguments 2>&1
        $exitCode = $LASTEXITCODE
    }
    finally {
        [Console]::OutputEncoding = $originalEncoding
    }

    # Flatten every record (stdout strings and stderr records alike) to text.
    $textLines = @($captured | ForEach-Object { $_.ToString() })
    $textLines | ForEach-Object { Write-Host $_ }

    if ($LogPath) {
        $logDir = Split-Path -Parent $LogPath
        $mustCreateDir = $logDir -and -not (Test-Path -LiteralPath $logDir)
        if ($mustCreateDir) {
            New-Item -ItemType Directory -Path $logDir -Force | Out-Null
        }
        Set-Content -LiteralPath $LogPath -Value $textLines -Encoding utf8NoBOM
    }

    return @{ ExitCode = $exitCode; Lines = $textLines }
}
282
function Get-InvariantFailureCount {
    <#
    .SYNOPSIS
    Counts invariant failures recorded in a run directory's eval-results.md.

    .DESCRIPTION
    Reads <RunDir>/eval-results.md and passes its lines to Measure-InvariantFailures
    (not defined in this file). Returns the tally's Failed value as an integer.

    .PARAMETER RunDir
    Run directory to inspect. Null or empty is accepted and yields $null.

    .OUTPUTS
    System.Int32, or $null when the directory or results file is missing, the file
    cannot be read, or the tally reports a Total of zero or less.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [AllowNull()]
        [AllowEmptyString()]
        [string]$RunDir
    )

    if ([string]::IsNullOrEmpty($RunDir)) { return $null }
    if (-not (Test-Path -LiteralPath $RunDir)) { return $null }

    $resultsFile = Join-Path $RunDir 'eval-results.md'
    if (-not (Test-Path -LiteralPath $resultsFile)) { return $null }

    try {
        $content = Get-Content -LiteralPath $resultsFile -ErrorAction Stop
    }
    catch {
        # An unreadable results file is treated the same as a missing one.
        return $null
    }

    $counts = Measure-InvariantFailures -Lines $content
    if ($counts.Total -gt 0) {
        return [int]$counts.Failed
    }
    return $null
}
305
function Get-PlannedCommands {
    <#
    .SYNOPSIS
    Produces the human-readable vally command lines for a run.

    .DESCRIPTION
    For each model, emits three strings in order: the baseline 'vally eval' line, the
    customized 'vally eval' line, and a 'vally compare' line with placeholder run
    directories. A stimulus filter other than '.*' is appended to the eval lines as a
    trailing '# filter:' note.

    .PARAMETER Models
    Model names to plan for.

    .PARAMETER StimulusFilter
    Stimulus filter expression; '.*' adds no note.

    .PARAMETER OutputRoot
    Root directory under which '<model>/<RunId>/baseline|customized' is placed.

    .PARAMETER RunId
    Run identifier used in the output directory names.

    .PARAMETER CompareSpecPath
    Compare spec path shown on the 'vally compare' line.

    .OUTPUTS
    System.String[]. Three command lines per model.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory)]
        [string[]]$Models,
        [Parameter(Mandatory)]
        [string]$StimulusFilter,
        [Parameter(Mandatory)]
        [string]$OutputRoot,
        [Parameter(Mandatory)]
        [string]$RunId,
        [Parameter(Mandatory)]
        [string]$CompareSpecPath
    )

    $filterNote = ''
    if ($StimulusFilter -ne '.*') {
        $filterNote = " # filter: $StimulusFilter"
    }

    $commands = [System.Collections.Generic.List[string]]::new()
    foreach ($modelName in $Models) {
        $baselineDir = Join-Path $OutputRoot "$modelName/$RunId/baseline"
        $customizedDir = Join-Path $OutputRoot "$modelName/$RunId/customized"

        $commands.AddRange([string[]]@(
                "vally eval --eval-spec evals/baseline-equivalence/baseline/eval.yaml --model $modelName --output-dir $baselineDir$filterNote",
                "vally eval --eval-spec evals/baseline-equivalence/customized/eval.yaml --model $modelName --output-dir $customizedDir$filterNote",
                "vally compare --eval-spec $CompareSpecPath --run-a <resolved baseline run> --run-b <resolved customized run>"
            ))
    }

    return $commands.ToArray()
}
333
function Resolve-LatestRunDir {
    <#
    .SYNOPSIS
    Returns the most recently written subdirectory of a directory.

    .DESCRIPTION
    Lists the immediate child directories of -OutputDir, orders them by
    LastWriteTime descending, and returns the full path of the first one.

    .PARAMETER OutputDir
    Directory whose child directories are inspected.

    .OUTPUTS
    System.String, or $null when -OutputDir does not exist or has no child
    directories.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)]
        [string]$OutputDir
    )

    if (Test-Path -LiteralPath $OutputDir) {
        $newest = Get-ChildItem -LiteralPath $OutputDir -Directory -ErrorAction SilentlyContinue |
            Sort-Object -Property LastWriteTime -Descending |
            Select-Object -First 1

        if ($newest) {
            return $newest.FullName
        }
    }

    return $null
}
349
function Write-SummaryJson {
    <#
    .SYNOPSIS
    Serializes a summary object to a JSON file.

    .DESCRIPTION
    Creates the parent directory of -Path when it is missing, converts -Summary to
    JSON (depth 6), and writes it as UTF-8 without BOM. Both writes pass
    -WhatIf:$false so the file is produced even when -WhatIf is in effect.

    .PARAMETER Summary
    Object to serialize.

    .PARAMETER Path
    Destination file. May be a bare file name relative to the current location.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [object]$Summary,
        [Parameter(Mandatory)]
        [string]$Path
    )

    $dir = Split-Path -Parent $Path
    # A bare file name has an empty parent. Test-Path rejects an empty -LiteralPath,
    # so skip directory creation in that case.
    if ($dir -and -not (Test-Path -LiteralPath $dir)) {
        New-Item -ItemType Directory -Path $dir -Force -WhatIf:$false -Confirm:$false | Out-Null
    }

    $json = $Summary | ConvertTo-Json -Depth 6
    Set-Content -LiteralPath $Path -Value $json -Encoding utf8NoBOM -WhatIf:$false -Confirm:$false
}
367
368	`#endregion Helper Functions`
369
#region Main Execution
# Run the driver only when the script is executed. When it is dot-sourced, only the
# helper functions above are defined.
if ($MyInvocation.InvocationName -ne '.') {
    try {
        # --- Resolve inputs -----------------------------------------------------
        $resolvedRoot = Resolve-RepoRoot -Hint $RepoRoot
        if (-not $OutputPath) {
            $OutputPath = Join-Path $resolvedRoot 'logs/baseline-equivalence-summary.json'
        }

        $modelHint = Get-AgentModelHint -RepoRoot $resolvedRoot -Agent $Agent
        $models = @(Resolve-ModelList -Tier $Tier -Hint $modelHint)
        $primaryModel = $models[0]

        $outputRoot = Join-Path $resolvedRoot 'evals/results/baseline-equivalence'
        # Format with the invariant culture so the run id (used in directory and log
        # names) does not change under cultures whose default calendar is not Gregorian.
        $runId = (Get-Date -AsUTC).ToString('yyyyMMddTHHmmssfffZ', [System.Globalization.CultureInfo]::InvariantCulture)

        # --- Variant metadata ---------------------------------------------------
        # Get-VariantMetadata and Get-AppliedArtifacts are not defined in this file;
        # presumably they come from lib/EquivalenceParsing.psm1 (verify).
        $defaultVariantA = @{ kind = 'baseline'; name = 'baseline'; label = 'Baseline (A)'; description = ''; applied = @() }
        $defaultVariantB = @{ kind = 'agent'; name = $Agent; label = $Agent; description = ''; applied = @() }
        $variantA = Get-VariantMetadata -VariantYamlPath (Join-Path $resolvedRoot 'evals/baseline-equivalence/baseline/variant.yaml') -Default $defaultVariantA
        $variantB = Get-VariantMetadata -VariantYamlPath (Join-Path $resolvedRoot 'evals/baseline-equivalence/customized/variant.yaml') -Default $defaultVariantB
        $workspaceRoot = Join-Path $resolvedRoot 'evals/baseline-equivalence/customized/workspace'
        $variantB.applied = @(Get-AppliedArtifacts -WorkspaceRoot $workspaceRoot)
        $variants = @{ a = $variantA; b = $variantB; subject = [string]$variantB.name }

        Write-Host "Baseline equivalence: agent=$Agent tier=$Tier model(s)=$($models -join ',')" -ForegroundColor Cyan
        Write-Host " Stimulus filter: $StimulusFilter" -ForegroundColor DarkGray
        Write-Host " Summary output: $OutputPath" -ForegroundColor DarkGray
        Write-Host " Results root: $outputRoot" -ForegroundColor DarkGray
        Write-Host " Run id: $runId" -ForegroundColor DarkGray

        # --- Render the compare spec (also done in dry-run mode) ----------------
        $renderedCompareSpec = Join-Path $resolvedRoot "logs/baseline-equivalence-compare-$Agent.eval.yml"
        New-RenderedCompareSpec -RepoRoot $resolvedRoot -Agent $Agent -OutputPath $renderedCompareSpec | Out-Null
        $renderedSpecRelative = [System.IO.Path]::GetRelativePath($resolvedRoot, $renderedCompareSpec).Replace('\', '/')
        Write-Host " Compare spec: $renderedSpecRelative" -ForegroundColor DarkGray

        $plannedCommands = Get-PlannedCommands -Models $models -StimulusFilter $StimulusFilter -OutputRoot $outputRoot -RunId $runId -CompareSpecPath $renderedSpecRelative

        # --- Dry run: report the plan and stop ----------------------------------
        if ($WhatIfPreference) {
            Write-Host "Dry-run mode: skipping live SDK calls." -ForegroundColor Yellow
            foreach ($cmd in $plannedCommands) {
                Write-Host " $cmd" -ForegroundColor DarkGray
            }

            $dryRunArgs = @{
                Agent           = $Agent
                Tier            = $Tier
                Model           = $primaryModel
                StimulusFilter  = $StimulusFilter
                PlannedCommands = $plannedCommands
                Variants        = $variants
            }
            $dry = New-DryRunSummary @dryRunArgs
            Write-SummaryJson -Summary $dry -Path $OutputPath
            Write-Host "Dry-run summary written: $OutputPath" -ForegroundColor Green
            exit 0
        }

        # --- Live run: eval baseline, eval customized, compare, per model --------
        $totalRuns = 0
        $totalTies = 0
        $totalA = 0
        $totalB = 0
        $invariantFailures = 0
        $divergenceFailures = 0
        $compareLogs = [System.Collections.Generic.List[string]]::new()

        foreach ($model in $models) {
            $aDir = Join-Path $outputRoot "$model/$runId/baseline"
            $bDir = Join-Path $outputRoot "$model/$runId/customized"
            foreach ($dir in @($aDir, $bDir)) {
                if (-not (Test-Path -LiteralPath $dir)) {
                    New-Item -ItemType Directory -Path $dir -Force | Out-Null
                }
            }

            # Eval spec paths are relative, so vally is expected to run from the
            # repository root.
            $evalBaseline = @(
                'eval',
                '--eval-spec', 'evals/baseline-equivalence/baseline/eval.yaml',
                '--model', $model,
                '--output-dir', $aDir
            )
            $evalCustomized = @(
                'eval',
                '--eval-spec', 'evals/baseline-equivalence/customized/eval.yaml',
                '--model', $model,
                '--output-dir', $bDir
            )

            # Baseline: prefer the failure count parsed from eval-results.md; fall
            # back to counting one failure when only the exit code signals a problem.
            $codeA = Invoke-VallyCommand -Arguments $evalBaseline
            $baselineRunDir = Resolve-LatestRunDir -OutputDir $aDir
            $baselineFailures = Get-InvariantFailureCount -RunDir $baselineRunDir
            if ($null -ne $baselineFailures) {
                $invariantFailures += $baselineFailures
            }
            elseif ($codeA -ne 0) {
                $invariantFailures++
            }

            $codeB = Invoke-VallyCommand -Arguments $evalCustomized
            if ($codeB -ne 0) { $divergenceFailures++ }

            # The customized eval writes under $bDir only, so the baseline run
            # directory resolved above is still current and is reused here.
            $aRunDir = $baselineRunDir
            $bRunDir = Resolve-LatestRunDir -OutputDir $bDir
            if (-not $aRunDir -or -not $bRunDir) {
                Write-Host " Compare skipped: missing run dir (a=$aRunDir b=$bRunDir)" -ForegroundColor Yellow
                $divergenceFailures++
            }
            else {
                $compareArgs = @(
                    'compare',
                    '--eval-spec', $renderedSpecRelative,
                    '--run-a', $aRunDir,
                    '--run-b', $bRunDir
                )
                $compareLog = Join-Path $resolvedRoot "logs/vally-compare-$model-$runId.log"
                $resultC = Invoke-VallyCommandWithCapture -Arguments $compareArgs -LogPath $compareLog
                if ($resultC.ExitCode -ne 0) { $divergenceFailures++ }
                $compareLogs.Add($compareLog)

                # Measure-CompareTrials is not defined in this file; it is expected to
                # return Total/Ties/AWins/BWins counts parsed from the compare output.
                $tally = Measure-CompareTrials -Lines $resultC.Lines
                if ($tally.Total -le 0) {
                    Write-Host " Compare emitted no parseable trial lines: $compareLog" -ForegroundColor Yellow
                    $divergenceFailures++
                }
                $totalRuns += $tally.Total
                $totalTies += $tally.Ties
                $totalA += $tally.AWins
                $totalB += $tally.BWins
            }
        }

        # --- Aggregate, write the summary, choose the exit code ------------------
        $verdictArgs = @{
            Runs               = $totalRuns
            Ties               = $totalTies
            AWins              = $totalA
            BWins              = $totalB
            InvariantFailures  = $invariantFailures
            DivergenceFailures = $divergenceFailures
            Tier               = $Tier
        }
        $verdict = Get-VerdictFromAggregate @verdictArgs

        $summary = [ordered]@{
            agent              = $Agent
            tier               = $Tier
            model              = $primaryModel
            stimulusFilter     = $StimulusFilter
            runs               = $totalRuns
            ties               = $totalTies
            aWins              = $totalA
            bWins              = $totalB
            invariantFailures  = $invariantFailures
            divergenceFailures = $divergenceFailures
            verdict            = $verdict
            variants           = $variants
            compareLogs        = @($compareLogs)
        }

        Write-SummaryJson -Summary $summary -Path $OutputPath
        Write-Host "Summary written: $OutputPath ($verdict)" -ForegroundColor Cyan

        # PR tier is advisory: the verdict is reported in the summary only.
        if ($Tier -eq 'pr') {
            exit 0
        }

        if ($verdict -eq 'fail') {
            Write-Host "Nightly verdict: fail" -ForegroundColor Red
            exit 1
        }

        exit 0
    }
    catch {
        # Any terminating error in the driver exits with code 3, regardless of tier.
        Write-Error -ErrorAction Continue "Invoke-BaselineEquivalence failed: $($_.Exception.Message)"
        exit 3
    }
}
#endregion Main Execution
542

microsoft/hve-core

Branches

Tags

Clone