microsoft/hve-core

Public

mirrored from https://github.com/microsoft/hve-core Available

CodeCommitsIssuesPull requestsActionsInsightsSecurity
feat/1873-devcontainer

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

scripts/evals/Invoke-BaselineEquivalence.ps1

541lines · modecode

1#!/usr/bin/env pwsh
2# Copyright (c) Microsoft Corporation.
3# SPDX-License-Identifier: MIT
4
5#Requires -Version 7.0
6
7<#
8.SYNOPSIS
9 Runs the Vally baseline-vs-customized equivalence suite for a target hve-core agent.
10
11.DESCRIPTION
12 Drives the `evals/baseline-equivalence/` Vally suite end-to-end. Resolves the target
13 agent's frontmatter `model:` hint, selects a model tier (PR or nightly), invokes
14 `vally eval` once per environment (`baseline` and `task-researcher-context`), invokes
15 `vally compare` to produce a pairwise verdict, and writes a machine-readable summary
16 to `logs/baseline-equivalence-summary.json`.
17
18 Exit policy by tier:
19 - PR tier always exits 0. Equivalence failures surface as `verdict: warn` in the
20 summary JSON. Advisory only.
21 - Nightly tier exits non-zero (1) when `verdict == fail`. Source of truth.
22
23 `-WhatIf` (dry-run) mode prints the planned `vally` command lines, emits a summary
24 JSON populated with zeros and `verdict: dry-run`, and exits 0 without invoking any
25 SDK or external command.
26
27.PARAMETER Agent
28 The target agent slug, matching the basename of an `.agent.md` file under
29 `.github/agents/`. Defaults to `task-researcher`.
30
31.PARAMETER Tier
32 The model tier to exercise. `pr` runs a single primary model; `nightly` runs a model
33 array for broader coverage. Defaults to `pr`.
34
35.PARAMETER StimulusFilter
36 Optional regular expression filtering stimulus names. Defaults to `.*` (all stimuli).
37
38.PARAMETER RepoRoot
39 Repository root. Defaults to the result of `git rev-parse --show-toplevel`, falling
40 back to the directory two levels above `$PSScriptRoot`.
41
42.PARAMETER OutputPath
43 Path to the summary JSON. Defaults to `<RepoRoot>/logs/baseline-equivalence-summary.json`.
44
45.EXAMPLE
46 ./Invoke-BaselineEquivalence.ps1 -Agent task-researcher -Tier pr -WhatIf
47
48 Prints the planned commands and writes a dry-run summary.
49
50.EXAMPLE
51 npm run eval:equivalence -- -Agent task-researcher -Tier pr
52
53 Runs the PR-tier flow via the npm wrapper.
54
55.NOTES
56 Runs via: npm run eval:equivalence
57#>
58
[CmdletBinding(SupportsShouldProcess)]
param(
    # Agent slug; matched against `<slug>.agent.md` under `.github/agents/`.
    [Parameter()]
    [ValidateNotNullOrEmpty()]
    [string]$Agent = 'task-researcher',

    # Model tier: 'pr' (single model) or 'nightly' (model array).
    [Parameter()]
    [ValidateSet('pr', 'nightly')]
    [string]$Tier = 'pr',

    # Regular expression over stimulus names; '.*' means all stimuli.
    [Parameter()]
    [string]$StimulusFilter = '.*',

    # Repository root override; resolved automatically when omitted.
    [Parameter()]
    [string]$RepoRoot,

    # Summary JSON destination; defaults under <RepoRoot>/logs when omitted.
    [Parameter()]
    [string]$OutputPath
)
78
79$ErrorActionPreference = 'Stop'
80
81Import-Module -Name (Join-Path $PSScriptRoot 'lib/EquivalenceParsing.psm1') -Force
82
83#region Helper Functions
84
function Resolve-RepoRoot {
    <#
    .SYNOPSIS
    Resolves the absolute path of the repository root.

    .DESCRIPTION
    Resolution order:
    1. The explicit -Hint path, when supplied (throws if it does not exist).
    2. The output of `git rev-parse --show-toplevel`, when git is available and succeeds.
    3. The directory two levels above $PSScriptRoot.

    .PARAMETER Hint
    Optional caller-supplied repository root.

    .OUTPUTS
    System.String. The resolved repository root.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [string]$Hint
    )

    if ($Hint) { return (Resolve-Path -LiteralPath $Hint).Path }

    # The script runs with $ErrorActionPreference = 'Stop'. A missing git executable
    # (and, on PowerShell 7.0/7.1, redirected stderr output) surfaces as a terminating
    # error, which would otherwise skip the $PSScriptRoot fallback below.
    $gitRoot = $null
    $gitExit = 1
    try {
        $gitRoot = & git rev-parse --show-toplevel 2>$null
        $gitExit = $LASTEXITCODE
    }
    catch {
        $gitRoot = $null
        $gitExit = 1
    }

    if ($gitExit -eq 0 -and -not [string]::IsNullOrWhiteSpace($gitRoot)) {
        return ([string]$gitRoot).Trim()
    }

    return (Resolve-Path -LiteralPath (Join-Path $PSScriptRoot '../..')).Path
}
101
function Resolve-AgentSurfaceSignaturePath {
    <#
    .SYNOPSIS
    Returns the path of an agent's surface-signature YAML file.

    .DESCRIPTION
    Builds `<RepoRoot>/evals/baseline-equivalence/surface-signatures/<Agent>.yml`
    and throws when that file does not exist.

    .PARAMETER RepoRoot
    Repository root directory.

    .PARAMETER Agent
    Agent slug used as the file base name.

    .OUTPUTS
    System.String. The signature file path.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)]
        [string]$RepoRoot,
        [Parameter(Mandatory)]
        [string]$Agent
    )

    $signatureFile = Join-Path $RepoRoot "evals/baseline-equivalence/surface-signatures/$Agent.yml"

    if (Test-Path -LiteralPath $signatureFile) {
        return $signatureFile
    }

    throw "Surface signature not found for agent '$Agent' at $signatureFile. Run scripts/evals/New-AgentSurfaceSignatures.ps1 -Agent $Agent to generate."
}
118
function New-RenderedCompareSpec {
    <#
    .SYNOPSIS
    Renders compare.eval.yml with the agent's surface signature inlined.

    .DESCRIPTION
    Reads `<RepoRoot>/evals/baseline-equivalence/compare.eval.yml`, replaces the first
    `surface_signatures: {}` marker line with a mapping that nests the agent's
    surface-signature file under the agent slug, and writes the result to -OutputPath.

    .PARAMETER RepoRoot
    Repository root directory.

    .PARAMETER Agent
    Agent slug; used as the mapping key and to locate the signature file.

    .PARAMETER OutputPath
    Destination file for the rendered spec. Its parent directory is created if needed.

    .OUTPUTS
    System.String. The -OutputPath value.

    .NOTES
    Throws when the compare spec or signature file is missing, when the marker line is
    absent, or when the rendered text equals the source text.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)]
        [string]$RepoRoot,
        [Parameter(Mandatory)]
        [string]$Agent,
        [Parameter(Mandatory)]
        [string]$OutputPath
    )

    $sourceSpec = Join-Path $RepoRoot 'evals/baseline-equivalence/compare.eval.yml'
    if (-not (Test-Path -LiteralPath $sourceSpec)) {
        throw "Compare spec not found at $sourceSpec."
    }
    $signaturePath = Resolve-AgentSurfaceSignaturePath -RepoRoot $RepoRoot -Agent $Agent

    $specText = [System.IO.File]::ReadAllText($sourceSpec)
    $signatureText = [System.IO.File]::ReadAllText($signaturePath)

    # The signature body must be indented deeper than the agent key; otherwise YAML
    # parses its keys as siblings of the agent key instead of children.
    $agentIndent = '  '
    $bodyIndent = '    '

    $indentedLines = $signatureText -split "`r?`n" | ForEach-Object {
        # Blank lines stay empty so no trailing whitespace is introduced.
        if ([string]::IsNullOrEmpty($_)) { '' } else { $bodyIndent + $_ }
    }
    $indented = $indentedLines -join "`n"

    $replacement = "surface_signatures:`n$agentIndent${Agent}:`n$indented"

    $marker = [regex]::new('(?m)^surface_signatures:\s*\{\}\s*$')
    if (-not $marker.IsMatch($specText)) {
        throw "compare.eval.yml does not contain the 'surface_signatures: {}' marker. Update the spec per Phase 2 Step 2.5 before running the equivalence driver."
    }

    # Instance Replace(input, evaluator, count) limits the substitution to one match.
    # The static Replace overload has no count; a trailing integer there binds to
    # RegexOptions instead. The evaluator inserts $replacement verbatim, so '$' in the
    # signature text is never read as a substitution token.
    $evaluator = [System.Text.RegularExpressions.MatchEvaluator] { param($m) $replacement }
    $renderedText = $marker.Replace($specText, $evaluator, 1)

    if ($renderedText -eq $specText) {
        throw "Render produced an unchanged compare spec for agent '$Agent'. Ensure the 'surface_signatures: {}' marker is present in compare.eval.yml."
    }

    $outDir = Split-Path -Parent $OutputPath
    if ($outDir -and -not (Test-Path -LiteralPath $outDir)) {
        # -WhatIf:$false so the rendered spec is still produced in dry-run mode.
        New-Item -ItemType Directory -Path $outDir -Force -WhatIf:$false -Confirm:$false | Out-Null
    }
    [System.IO.File]::WriteAllText($OutputPath, $renderedText)
    return $OutputPath
}
164
function Get-AgentModelHint {
    <#
    .SYNOPSIS
    Reads the `model:` hint from an agent file's YAML frontmatter.

    .DESCRIPTION
    Finds the first `<Agent>.agent.md` under `<RepoRoot>/.github/agents` (recursive)
    and returns the value of the first `model:` line inside its leading `---`
    frontmatter block, with surrounding whitespace and quotes removed.

    .PARAMETER RepoRoot
    Repository root directory.

    .PARAMETER Agent
    Agent slug (file base name without `.agent.md`).

    .OUTPUTS
    System.String, or $null when the agents directory, the agent file, the
    frontmatter block, or a `model:` entry is absent.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)]
        [string]$RepoRoot,
        [Parameter(Mandatory)]
        [string]$Agent
    )

    $agentsRoot = Join-Path $RepoRoot '.github/agents'
    if (-not (Test-Path -LiteralPath $agentsRoot)) { return $null }

    $candidate = Get-ChildItem -LiteralPath $agentsRoot -Recurse -Filter "$Agent.agent.md" -File -ErrorAction SilentlyContinue |
        Select-Object -First 1
    if (-not $candidate) { return $null }

    # -LiteralPath: the file name must not be interpreted as a wildcard pattern.
    $lines = @(Get-Content -LiteralPath $candidate.FullName)
    if ($lines.Count -eq 0 -or $lines[0].Trim() -ne '---') { return $null }

    # Only the frontmatter is consulted; a `model:` line in the Markdown body
    # (for example inside a sample YAML snippet) is not a model hint.
    for ($i = 1; $i -lt $lines.Count; $i++) {
        $line = $lines[$i]
        if ($line.Trim() -in @('---', '...')) { break }
        if ($line -match '^\s*model\s*:\s*(.+)\s*$') {
            return $Matches[1].Trim().Trim('"').Trim("'")
        }
    }

    return $null
}
187
function Resolve-ModelList {
    <#
    .SYNOPSIS
    Selects the model names to evaluate for a tier.

    .DESCRIPTION
    The nightly tier always yields the fixed nightly model set. Any other tier
    yields a single model: the hint when one is given, otherwise the built-in default.

    .PARAMETER Tier
    Tier name; 'nightly' selects the model array.

    .PARAMETER Hint
    Optional model name used for the non-nightly tier.

    .OUTPUTS
    System.String[]. One or more model names.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory)]
        [string]$Tier,
        [string]$Hint
    )

    if ($Tier -ne 'nightly') {
        $singleModel = if ($Hint) { $Hint } else { 'claude-opus-4.7' }
        return @($singleModel)
    }

    $nightlyModels = @('gpt-5.5', 'claude-opus-4.6', 'claude-sonnet-latest')
    return $nightlyModels
}
204
function New-DryRunSummary {
    <#
    .SYNOPSIS
    Builds the summary object emitted in dry-run mode.

    .DESCRIPTION
    Returns an ordered dictionary with the run identity fields, all counters set
    to zero, `verdict` set to 'dry-run', and the planned command lines attached.

    .PARAMETER Agent
    Agent slug recorded in the summary.

    .PARAMETER Tier
    Tier name recorded in the summary.

    .PARAMETER Model
    Model name recorded in the summary.

    .PARAMETER StimulusFilter
    Stimulus filter expression recorded in the summary.

    .PARAMETER PlannedCommands
    Command lines that a live run would execute.

    .PARAMETER Variants
    Optional variant metadata stored under `variants`.
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)]
        [string]$Agent,
        [Parameter(Mandatory)]
        [string]$Tier,
        [Parameter(Mandatory)]
        [string]$Model,
        [Parameter(Mandatory)]
        [string]$StimulusFilter,
        [Parameter(Mandatory)]
        [string[]]$PlannedCommands,
        [hashtable]$Variants
    )

    $summary = [ordered]@{}

    # Identity of the (skipped) run.
    $summary['agent'] = $Agent
    $summary['tier'] = $Tier
    $summary['model'] = $Model
    $summary['stimulusFilter'] = $StimulusFilter

    # Nothing executed, so every counter is zero.
    foreach ($counter in 'runs', 'ties', 'aWins', 'bWins', 'invariantFailures', 'divergenceFailures') {
        $summary[$counter] = 0
    }

    $summary['verdict'] = 'dry-run'
    $summary['variants'] = $Variants
    $summary['plannedCommands'] = $PlannedCommands

    return $summary
}
238
function Invoke-VallyCommand {
    <#
    .SYNOPSIS
    Runs `vally` with the given arguments and returns its exit code.

    .DESCRIPTION
    The command's standard output is sent to the host rather than the pipeline, so
    the function returns a single integer. Without this, every stdout line becomes
    part of the return value, the caller receives an array, and a comparison such as
    `$code -ne 0` is truthy whenever the command printed anything.

    .PARAMETER Arguments
    Argument list splatted to `vally`.

    .OUTPUTS
    System.Int32. The value of $LASTEXITCODE after the command finishes.
    #>
    [CmdletBinding()]
    [OutputType([int])]
    param(
        [Parameter(Mandatory)]
        [string[]]$Arguments
    )

    # Out-Host keeps the output visible while keeping it out of the return value.
    & vally @Arguments | Out-Host
    return [int]$LASTEXITCODE
}
249
function Invoke-VallyCommandWithCapture {
    <#
    .SYNOPSIS
    Runs `vally`, echoes its output, optionally logs it, and returns output plus exit code.

    .DESCRIPTION
    Sets the console output encoding to UTF-8 for the duration of the call and
    restores the previous encoding afterwards. Stdout and stderr are merged,
    converted to strings, written to the host, and saved to -LogPath when given.

    .PARAMETER Arguments
    Argument list splatted to `vally`.

    .PARAMETER LogPath
    Optional file that receives the captured lines (UTF-8 without BOM). The parent
    directory is created when missing.

    .OUTPUTS
    System.Collections.Hashtable with keys `ExitCode` and `Lines`.
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)]
        [string[]]$Arguments,
        [string]$LogPath
    )

    $savedEncoding = [Console]::OutputEncoding
    try {
        [Console]::OutputEncoding = [System.Text.Encoding]::UTF8
        $captured = & vally @Arguments 2>&1
        $exitCode = $LASTEXITCODE
    }
    finally {
        [Console]::OutputEncoding = $savedEncoding
    }

    # Stderr records and stdout strings are flattened to plain text.
    $textLines = @(foreach ($record in $captured) { $record.ToString() })
    $textLines | ForEach-Object { Write-Host $_ }

    if ($LogPath) {
        $logDir = Split-Path -Parent $LogPath
        $needsDir = $logDir -and -not (Test-Path -LiteralPath $logDir)
        if ($needsDir) {
            New-Item -ItemType Directory -Path $logDir -Force | Out-Null
        }
        Set-Content -LiteralPath $LogPath -Value $textLines -Encoding utf8NoBOM
    }

    return @{ ExitCode = $exitCode; Lines = $textLines }
}
282
function Get-InvariantFailureCount {
    <#
    .SYNOPSIS
    Counts invariant failures recorded in a run directory's eval-results.md.

    .DESCRIPTION
    Reads `<RunDir>/eval-results.md` and passes its lines to Measure-InvariantFailures.
    Returns the tally's `Failed` value as an integer, or $null when the directory or
    file is missing, the file cannot be read, or the tally's `Total` is not positive.

    .PARAMETER RunDir
    Run directory to inspect; may be null or empty.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [AllowNull()]
        [AllowEmptyString()]
        [string]$RunDir
    )

    if ([string]::IsNullOrEmpty($RunDir)) { return $null }
    if (-not (Test-Path -LiteralPath $RunDir)) { return $null }

    $reportPath = Join-Path $RunDir 'eval-results.md'
    if (-not (Test-Path -LiteralPath $reportPath)) { return $null }

    $reportLines = $null
    try {
        $reportLines = Get-Content -LiteralPath $reportPath -ErrorAction Stop
    }
    catch {
        # An unreadable report is treated the same as a missing one.
        return $null
    }

    $counts = Measure-InvariantFailures -Lines $reportLines
    if ($counts.Total -gt 0) {
        return [int]$counts.Failed
    }
    return $null
}
305
function Get-PlannedCommands {
    <#
    .SYNOPSIS
    Produces the human-readable `vally` command lines for a run.

    .DESCRIPTION
    Emits three lines per model: the baseline eval, the customized eval, and the
    compare step. A stimulus filter other than '.*' is appended to the two eval
    lines as a trailing comment.

    .PARAMETER Models
    Model names to plan for.

    .PARAMETER StimulusFilter
    Stimulus filter expression.

    .PARAMETER OutputRoot
    Root directory for per-model results.

    .PARAMETER RunId
    Run identifier used as a path segment.

    .PARAMETER CompareSpecPath
    Compare spec path shown in the compare line.

    .OUTPUTS
    System.String[]. The planned command lines, in execution order.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory)]
        [string[]]$Models,
        [Parameter(Mandatory)]
        [string]$StimulusFilter,
        [Parameter(Mandatory)]
        [string]$OutputRoot,
        [Parameter(Mandatory)]
        [string]$RunId,
        [Parameter(Mandatory)]
        [string]$CompareSpecPath
    )

    $suffix = ''
    if ($StimulusFilter -ne '.*') {
        $suffix = " # filter: $StimulusFilter"
    }

    $commands = foreach ($name in $Models) {
        $baselineOut = Join-Path $OutputRoot "$name/$RunId/baseline"
        $customizedOut = Join-Path $OutputRoot "$name/$RunId/customized"

        "vally eval --eval-spec evals/baseline-equivalence/baseline/eval.yaml --model $name --output-dir $baselineOut$suffix"
        "vally eval --eval-spec evals/baseline-equivalence/customized/eval.yaml --model $name --output-dir $customizedOut$suffix"
        "vally compare --eval-spec $CompareSpecPath --run-a <resolved baseline run> --run-b <resolved customized run>"
    }

    return [string[]]@($commands)
}
333
function Resolve-LatestRunDir {
    <#
    .SYNOPSIS
    Returns the most recently written immediate subdirectory of a directory.

    .DESCRIPTION
    Lists the child directories of -OutputDir, orders them by LastWriteTime
    (newest first), and returns the full path of the first one.

    .PARAMETER OutputDir
    Directory whose child directories are inspected.

    .OUTPUTS
    System.String, or $null when -OutputDir does not exist or has no subdirectories.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)]
        [string]$OutputDir
    )

    if (Test-Path -LiteralPath $OutputDir) {
        $children = Get-ChildItem -LiteralPath $OutputDir -Directory -ErrorAction SilentlyContinue
        $newest = $children | Sort-Object -Property LastWriteTime -Descending | Select-Object -First 1
        if ($newest) {
            return $newest.FullName
        }
    }

    return $null
}
349
function Write-SummaryJson {
    <#
    .SYNOPSIS
    Serializes a summary object to JSON and writes it to disk.

    .DESCRIPTION
    Converts -Summary with `ConvertTo-Json -Depth 6` and writes it as UTF-8 without
    BOM. The parent directory is created when missing. Writes happen even under
    -WhatIf, because the dry-run mode relies on this file being produced.

    .PARAMETER Summary
    Object (typically an ordered dictionary) to serialize.

    .PARAMETER Path
    Destination file. May be a bare file name, in which case it is written to the
    current location.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)]
        [object]$Summary,
        [Parameter(Mandatory)]
        [string]$Path
    )

    $dir = Split-Path -Parent $Path
    # A bare file name has an empty parent; Test-Path -LiteralPath '' would throw a
    # parameter-binding error, so the directory step is skipped in that case.
    if ($dir -and -not (Test-Path -LiteralPath $dir)) {
        New-Item -ItemType Directory -Path $dir -Force -WhatIf:$false -Confirm:$false | Out-Null
    }

    $json = $Summary | ConvertTo-Json -Depth 6
    Set-Content -LiteralPath $Path -Value $json -Encoding utf8NoBOM -WhatIf:$false -Confirm:$false
}
367
368#endregion Helper Functions
369
370#region Main Execution
# Entry point. Skipped when the script is dot-sourced so the helper functions can
# be loaded without triggering a run.
#
# Exit codes: 0 for dry-run, the PR tier, and a non-failing nightly verdict;
# 1 when the nightly verdict is 'fail'; 3 when an unexpected error is caught.
if ($MyInvocation.InvocationName -ne '.') {
    try {
        $resolvedRoot = Resolve-RepoRoot -Hint $RepoRoot
        if (-not $OutputPath) {
            $OutputPath = Join-Path $resolvedRoot 'logs/baseline-equivalence-summary.json'
        }

        $modelHint = Get-AgentModelHint -RepoRoot $resolvedRoot -Agent $Agent
        $models = @(Resolve-ModelList -Tier $Tier -Hint $modelHint)
        $primaryModel = $models[0]

        $outputRoot = Join-Path $resolvedRoot 'evals/results/baseline-equivalence'
        # [DateTime]::UtcNow rather than `Get-Date -AsUTC`: -AsUTC needs PowerShell 7.1,
        # while this script declares `#Requires -Version 7.0`.
        $runId = [DateTime]::UtcNow.ToString('yyyyMMddTHHmmssfffZ', [System.Globalization.CultureInfo]::InvariantCulture)

        $defaultVariantA = @{ kind = 'baseline'; name = 'baseline'; label = 'Baseline (A)'; description = ''; applied = @() }
        $defaultVariantB = @{ kind = 'agent'; name = $Agent; label = $Agent; description = ''; applied = @() }
        $variantA = Get-VariantMetadata -VariantYamlPath (Join-Path $resolvedRoot 'evals/baseline-equivalence/baseline/variant.yaml') -Default $defaultVariantA
        $variantB = Get-VariantMetadata -VariantYamlPath (Join-Path $resolvedRoot 'evals/baseline-equivalence/customized/variant.yaml') -Default $defaultVariantB
        $workspaceRoot = Join-Path $resolvedRoot 'evals/baseline-equivalence/customized/workspace'
        $variantB.applied = @(Get-AppliedArtifacts -WorkspaceRoot $workspaceRoot)
        $variants = @{ a = $variantA; b = $variantB; subject = [string]$variantB.name }

        Write-Host "Baseline equivalence: agent=$Agent tier=$Tier model(s)=$($models -join ',')" -ForegroundColor Cyan
        Write-Host " Stimulus filter: $StimulusFilter" -ForegroundColor DarkGray
        Write-Host " Summary output: $OutputPath" -ForegroundColor DarkGray
        Write-Host " Results root: $outputRoot" -ForegroundColor DarkGray
        Write-Host " Run id: $runId" -ForegroundColor DarkGray

        # The rendered compare spec is produced in dry-run mode as well.
        $renderedCompareSpec = Join-Path $resolvedRoot "logs/baseline-equivalence-compare-$Agent.eval.yml"
        New-RenderedCompareSpec -RepoRoot $resolvedRoot -Agent $Agent -OutputPath $renderedCompareSpec | Out-Null
        $renderedSpecRelative = [System.IO.Path]::GetRelativePath($resolvedRoot, $renderedCompareSpec).Replace('\', '/')
        Write-Host " Compare spec: $renderedSpecRelative" -ForegroundColor DarkGray

        $plannedCommands = Get-PlannedCommands -Models $models -StimulusFilter $StimulusFilter -OutputRoot $outputRoot -RunId $runId -CompareSpecPath $renderedSpecRelative

        if ($WhatIfPreference) {
            Write-Host "Dry-run mode: skipping live SDK calls." -ForegroundColor Yellow
            foreach ($cmd in $plannedCommands) {
                Write-Host " $cmd" -ForegroundColor DarkGray
            }

            $dry = New-DryRunSummary `
                -Agent $Agent `
                -Tier $Tier `
                -Model $primaryModel `
                -StimulusFilter $StimulusFilter `
                -PlannedCommands $plannedCommands `
                -Variants $variants
            Write-SummaryJson -Summary $dry -Path $OutputPath
            Write-Host "Dry-run summary written: $OutputPath" -ForegroundColor Green
            exit 0
        }

        $totalRuns = 0
        $totalTies = 0
        $totalA = 0
        $totalB = 0
        $invariantFailures = 0
        $divergenceFailures = 0
        $compareLogs = [System.Collections.Generic.List[string]]::new()

        # The eval and compare spec paths handed to vally are relative to the repository
        # root, so vally runs from there regardless of the caller's directory. The location
        # is restored before the summary is written, so a relative -OutputPath still
        # resolves against the caller's directory.
        Push-Location -LiteralPath $resolvedRoot
        try {
            foreach ($model in $models) {
                $aDir = Join-Path $outputRoot "$model/$runId/baseline"
                $bDir = Join-Path $outputRoot "$model/$runId/customized"
                foreach ($dir in @($aDir, $bDir)) {
                    if (-not (Test-Path -LiteralPath $dir)) {
                        New-Item -ItemType Directory -Path $dir -Force | Out-Null
                    }
                }

                # NOTE(review): $StimulusFilter appears only in the planned-command text and
                # the summary; it is not forwarded to the live `vally eval` calls below.
                $evalBaseline = @(
                    'eval',
                    '--eval-spec', 'evals/baseline-equivalence/baseline/eval.yaml',
                    '--model', $model,
                    '--output-dir', $aDir
                )
                $evalCustomized = @(
                    'eval',
                    '--eval-spec', 'evals/baseline-equivalence/customized/eval.yaml',
                    '--model', $model,
                    '--output-dir', $bDir
                )

                # @(...)[-1] keeps only the trailing exit code, so the numeric comparison
                # stays valid even if the helper also emits the command's stdout.
                $codeA = @(Invoke-VallyCommand -Arguments $evalBaseline)[-1]
                $baselineRunDir = Resolve-LatestRunDir -OutputDir $aDir
                $baselineFailures = Get-InvariantFailureCount -RunDir $baselineRunDir
                if ($null -ne $baselineFailures) {
                    # A parsed results file is preferred over the bare exit code.
                    $invariantFailures += $baselineFailures
                }
                elseif ($codeA -ne 0) {
                    $invariantFailures++
                }

                $codeB = @(Invoke-VallyCommand -Arguments $evalCustomized)[-1]
                if ($codeB -ne 0) { $divergenceFailures++ }

                # The customized run writes under $bDir only, so the baseline lookup
                # made above is still current.
                $aRunDir = $baselineRunDir
                $bRunDir = Resolve-LatestRunDir -OutputDir $bDir
                if (-not $aRunDir -or -not $bRunDir) {
                    Write-Host " Compare skipped: missing run dir (a=$aRunDir b=$bRunDir)" -ForegroundColor Yellow
                    $divergenceFailures++
                }
                else {
                    $compareArgs = @(
                        'compare',
                        '--eval-spec', $renderedSpecRelative,
                        '--run-a', $aRunDir,
                        '--run-b', $bRunDir
                    )
                    $compareLog = Join-Path $resolvedRoot "logs/vally-compare-$model-$runId.log"
                    $resultC = Invoke-VallyCommandWithCapture -Arguments $compareArgs -LogPath $compareLog
                    if ($resultC.ExitCode -ne 0) { $divergenceFailures++ }
                    $compareLogs.Add($compareLog)

                    $tally = Measure-CompareTrials -Lines $resultC.Lines
                    if ($tally.Total -le 0) {
                        Write-Host " Compare emitted no parseable trial lines: $compareLog" -ForegroundColor Yellow
                        $divergenceFailures++
                    }
                    $totalRuns += $tally.Total
                    $totalTies += $tally.Ties
                    $totalA += $tally.AWins
                    $totalB += $tally.BWins
                }
            }
        }
        finally {
            Pop-Location
        }

        $verdict = Get-VerdictFromAggregate `
            -Runs $totalRuns `
            -Ties $totalTies `
            -AWins $totalA `
            -BWins $totalB `
            -InvariantFailures $invariantFailures `
            -DivergenceFailures $divergenceFailures `
            -Tier $Tier

        $summary = [ordered]@{
            agent              = $Agent
            tier               = $Tier
            model              = $primaryModel
            stimulusFilter     = $StimulusFilter
            runs               = $totalRuns
            ties               = $totalTies
            aWins              = $totalA
            bWins              = $totalB
            invariantFailures  = $invariantFailures
            divergenceFailures = $divergenceFailures
            verdict            = $verdict
            variants           = $variants
            compareLogs        = @($compareLogs)
        }

        Write-SummaryJson -Summary $summary -Path $OutputPath
        Write-Host "Summary written: $OutputPath ($verdict)" -ForegroundColor Cyan

        # PR tier is advisory: the verdict is reported in the summary only.
        if ($Tier -eq 'pr') {
            exit 0
        }

        if ($verdict -eq 'fail') {
            Write-Host "Nightly verdict: fail" -ForegroundColor Red
            exit 1
        }

        exit 0
    }
    catch {
        Write-Error -ErrorAction Continue "Invoke-BaselineEquivalence failed: $($_.Exception.Message)"
        exit 3
    }
}
541#endregion Main Execution
542