microsoft/hve-core
Public · mirrored from https://github.com/microsoft/hve-core · Available
scripts/evals/Invoke-AgentMatrix.ps1
673 lines · mode: code
| 1 | #!/usr/bin/env pwsh |
| 2 | # Copyright (c) Microsoft Corporation. |
| 3 | # SPDX-License-Identifier: MIT |
| 4 | |
| 5 | #Requires -Version 7.0 |
| 6 | |
| 7 | <# |
| 8 | .SYNOPSIS |
| 9 | Runs the Vally `agent-behavior` suite per parent-agent slug and aggregates |
| 10 | a matrix-style summary. |
| 11 | |
| 12 | .DESCRIPTION |
| 13 | Drives `npx vally eval --eval-spec evals/agent-behavior/stimuli/<slug>.yml` for either |
| 14 | a curated set of slugs (`-Changed`) or the full inventory (`-All`). |
| 15 | Emits one per-agent summary plus an aggregate `agent-matrix-summary.json` |
| 16 | and applies a tier exit policy: |
| 17 | |
| 18 | - `pr` : exit 0 regardless of agent verdicts (advisory); an unhandled script error still exits 3. |
| 19 | - `nightly` : exit 1 when any agent's `overall` is `fail`; otherwise exit 0. |
| 20 | |
| 21 | `-WhatIf` (dry-run) enumerates the slugs that would be exercised, reports the |
| 22 | planned `vally` command lines plus the per-slug `cost_tier` from AGENTS.yml, |
| 23 | writes a dry-run summary to the output directory, and exits 0 without |
| 24 | invoking any external command. |
| 25 | |
| 26 | .PARAMETER All |
| 27 | Run the full agent-behavior matrix using slugs from |
| 28 | `evals/agent-behavior/AGENTS.yml`. |
| 29 | |
| 30 | .PARAMETER Changed |
| 31 | Explicit set of changed agent slugs (or paths) to evaluate. Paths are |
| 32 | resolved to parent-agent slugs via `Get-AffectedAgentSlugs`. Mutually |
| 33 | exclusive with `-All`. |
| 34 | |
| 35 | .PARAMETER Tier |
| 36 | Exit policy. `pr` (default) always exits 0; `nightly` exits 1 on any |
| 37 | `overall: fail`. |
| 38 | |
| 39 | .PARAMETER OutputDir |
| 40 | Directory for per-agent summary JSON files and the aggregate |
| 41 | `agent-matrix-summary.json`. Defaults to |
| 42 | `<RepoRoot>/evals/results/agent-matrix/<yyyy-MM-dd>/`. |
| 43 | |
| 44 | .PARAMETER Concurrency |
| 45 | Reserved for parallel execution (WI-04). Currently runs sequentially; |
| 46 | values greater than 1 produce a warning and fall back to 1. |
| 47 | |
| 48 | .PARAMETER RepoRoot |
| 49 | Repository root. Defaults to `git rev-parse --show-toplevel`. |
| 50 | |
| 51 | .PARAMETER Model |
| 52 | SDK model id passed to `vally eval --model`. Defaults to |
| 53 | `claude-haiku-4.5`. |
| 54 | |
| 55 | .EXAMPLE |
| 56 | ./Invoke-AgentMatrix.ps1 -All -Tier nightly -WhatIf |
| 57 | |
| 58 | Lists every agent slug, prints planned `vally` commands and per-slug cost |
| 59 | tiers, writes a dry-run summary, and exits 0. |
| 60 | |
| 61 | .EXAMPLE |
| 62 | npm run eval:agent:changed -- -WhatIf |
| 63 | |
| 64 | PR-tier advisory run filtered by git-changed agents. |
| 65 | |
| 66 | .NOTES |
| 67 | Runs via: npm run eval:agent / npm run eval:agent:matrix / npm run eval:agent:changed |
| 68 | #> |
| 69 | |
# Script parameters. Two parameter sets select the slug source: 'All' (default,
# full inventory) and 'Changed' (explicit slugs or paths; may be empty).
# The remaining parameters apply to both sets and are all optional.
[CmdletBinding(DefaultParameterSetName = 'All', SupportsShouldProcess = $true)]
param(
    [Parameter(ParameterSetName = 'All')]
    [switch]$All,

    [Parameter(Mandatory = $true, ParameterSetName = 'Changed')]
    [AllowEmptyCollection()]
    [string[]]$Changed,

    [Parameter()]
    [ValidateSet('pr', 'nightly')]
    [string]$Tier = 'pr',

    [Parameter()]
    [string]$OutputDir,

    [Parameter()]
    [ValidateRange(1, 32)]
    [int]$Concurrency = 1,

    [Parameter()]
    [string]$RepoRoot,

    [Parameter()]
    [string]$Model = 'claude-haiku-4.5'
)

# Fail fast: strict variable/property checks and terminating errors by default.
Set-StrictMode -Version Latest
$ErrorActionPreference = 'Stop'
| 99 | |
| 100 | #region Helper Functions |
| 101 | |
function Import-YamlModule {
    <#
    .SYNOPSIS
    Ensures the 'powershell-yaml' module is loaded in the current session.

    .DESCRIPTION
    Does nothing when the module is already loaded. Throws when the module is
    not installed; otherwise imports it and discards the import output.
    #>
    [CmdletBinding()]
    param()

    $moduleName = 'powershell-yaml'

    $loaded = Get-Module -Name $moduleName
    if (-not $loaded) {
        $installed = Get-Module -ListAvailable -Name $moduleName
        if (-not $installed) {
            throw "Required module 'powershell-yaml' is not installed. Run 'Install-Module powershell-yaml -Scope CurrentUser' before invoking this script."
        }
        $null = Import-Module $moduleName -ErrorAction Stop
    }
}
| 112 | |
function Resolve-RepoRoot {
    <#
    .SYNOPSIS
    Resolves the repository root directory.

    .DESCRIPTION
    Resolution order:
      1. -Hint, when non-empty (must exist; resolved via Resolve-Path).
      2. `git rev-parse --show-toplevel`, when it exits 0 with output.
      3. Two directories above this script's folder.
    Any failure in step 2 is reported on the verbose stream only.

    .PARAMETER Hint
    Optional explicit repository root.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param([string]$Hint)

    if (-not [string]::IsNullOrEmpty($Hint)) {
        $hinted = Resolve-Path -LiteralPath $Hint
        return $hinted.Path
    }

    try {
        # .Trim() throws when git prints nothing; the catch below absorbs that
        # and lets the script-relative fallback take over.
        $gitTop = (& git rev-parse --show-toplevel 2>$null).Trim()
        $gitSucceeded = ($LASTEXITCODE -eq 0) -and $gitTop
        if ($gitSucceeded) { return $gitTop }
    } catch {
        Write-Verbose "git rev-parse failed: $($_.Exception.Message)"
    }

    $twoLevelsUp = Join-Path $PSScriptRoot '../..'
    return (Resolve-Path -LiteralPath $twoLevelsUp).Path
}
| 127 | |
function Read-AgentInventory {
    <#
    .SYNOPSIS
    Loads agent entries from evals/agent-behavior/AGENTS.yml.

    .DESCRIPTION
    Parses the YAML inventory under -RepoRoot and emits one hashtable per entry
    that has a 'slug' key, with keys slug, path, class and cost_tier. Missing
    path/class default to '' and missing cost_tier defaults to 'unknown'.
    Throws when the file is absent or has no 'agents' collection.

    .PARAMETER RepoRoot
    Repository root containing evals/agent-behavior/AGENTS.yml.
    #>
    [CmdletBinding()]
    [OutputType([System.Collections.Generic.List[hashtable]])]
    param([Parameter(Mandatory)] [string]$RepoRoot)

    $inventoryFile = Join-Path $RepoRoot 'evals/agent-behavior/AGENTS.yml'
    if (-not (Test-Path -LiteralPath $inventoryFile)) {
        throw "Agent inventory not found at $inventoryFile. Run scripts/evals/Build-AgentInventory.ps1 to generate."
    }

    Import-YamlModule
    $document = ConvertFrom-Yaml -Yaml ([System.IO.File]::ReadAllText($inventoryFile))
    $hasAgents = $document -and $document.ContainsKey('agents')
    if (-not $hasAgents) {
        throw "Agent inventory at $inventoryFile is missing the 'agents:' collection."
    }

    $agents = [System.Collections.Generic.List[hashtable]]::new()
    foreach ($agent in $document['agents']) {
        if ($agent -and $agent.ContainsKey('slug')) {
            # Start from the defaults, then overlay whichever optional fields exist.
            $record = @{
                slug      = [string]$agent['slug']
                path      = ''
                class     = ''
                cost_tier = 'unknown'
            }
            foreach ($field in 'path', 'class', 'cost_tier') {
                if ($agent.ContainsKey($field)) { $record[$field] = [string]$agent[$field] }
            }
            $agents.Add($record)
        }
    }
    return $agents
}
| 157 | |
function Resolve-SlugSet {
    <#
    .SYNOPSIS
    Computes the sorted set of agent slugs to evaluate.

    .DESCRIPTION
    For the 'All' parameter set, returns every inventory slug (sorted, unique).
    Otherwise each -Changed item is trimmed and classified: an item that names
    a known slug (case-insensitive) and contains no path separator is taken as
    a slug; everything else is treated as a path and resolved through
    Get-AffectedAgentSlugs from Modules/AffectedAgents.psm1. Only slugs present
    in the inventory are returned.

    Slugs are always returned in the inventory's own spelling, regardless of
    the casing supplied by the caller, because the slug is later used to build
    a file path (stimuli/<slug>.yml) that is case-sensitive on some platforms.

    .PARAMETER RepoRoot
    Repository root, forwarded to Get-AffectedAgentSlugs.

    .PARAMETER Inventory
    Agent entries; each hashtable must carry a 'slug' key.

    .PARAMETER ParameterSet
    Name of the active parameter set ('All' selects the full inventory).

    .PARAMETER Changed
    Slugs and/or file paths; blank items are ignored.

    .OUTPUTS
    string[] (never enumerated; an empty array when nothing resolves).
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory)] [string]$RepoRoot,
        [Parameter(Mandatory)] [System.Collections.Generic.List[hashtable]]$Inventory,
        [Parameter(Mandatory)] [string]$ParameterSet,
        [string[]]$Changed
    )

    # Case-insensitive lookup from any spelling to the inventory's spelling.
    $canonical = [System.Collections.Generic.Dictionary[string, string]]::new([System.StringComparer]::OrdinalIgnoreCase)
    foreach ($entry in $Inventory) {
        $inventorySlug = [string]$entry['slug']
        if (-not $canonical.ContainsKey($inventorySlug)) { $canonical[$inventorySlug] = $inventorySlug }
    }

    if ($ParameterSet -eq 'All') {
        return ,[string[]](@($Inventory | ForEach-Object { $_['slug'] } | Sort-Object -Unique))
    }

    if (-not $Changed -or $Changed.Count -eq 0) {
        return ,[string[]]@()
    }

    $resolved = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
    $pathLike = [System.Collections.Generic.List[string]]::new()

    foreach ($item in $Changed) {
        if ([string]::IsNullOrWhiteSpace($item)) { continue }
        $trimmed = $item.Trim()
        if ($canonical.ContainsKey($trimmed) -and ($trimmed -notmatch '[\\/]')) {
            [void]$resolved.Add($canonical[$trimmed])
        } else {
            $pathLike.Add($trimmed)
        }
    }

    if ($pathLike.Count -gt 0) {
        $modulePath = Join-Path $PSScriptRoot 'Modules/AffectedAgents.psm1'
        if (-not (Test-Path -LiteralPath $modulePath)) {
            throw "Required module not found: $modulePath"
        }
        Import-Module $modulePath -Force | Out-Null
        $derived = Get-AffectedAgentSlugs -ChangedFiles $pathLike.ToArray() -RepoRoot $RepoRoot
        foreach ($slug in $derived) {
            # Dictionary lookups reject null keys, so skip empty results explicitly.
            if ($slug -and $canonical.ContainsKey([string]$slug)) {
                [void]$resolved.Add($canonical[[string]$slug])
            }
        }
    }

    return ,[string[]](@($resolved | Sort-Object))
}
| 206 | |
function Get-PlannedCommand {
    <#
    .SYNOPSIS
    Renders the human-readable vally command line for one agent slug.

    .PARAMETER Slug
    Agent slug; becomes the stimulus file name (<slug>.yml).

    .PARAMETER Model
    Model id placed after --model.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)] [string]$Slug,
        [Parameter(Mandatory)] [string]$Model
    )

    $template = 'npx vally eval --eval-spec evals/agent-behavior/stimuli/{0}.yml --model {1}'
    return ($template -f $Slug, $Model)
}
| 216 | |
function Resolve-NpxExecutable {
    <#
    .SYNOPSIS
    Locates the npx launcher to use with the call operator.

    .DESCRIPTION
    Returns the Source of the first command found. On Windows 'npx.cmd' is
    tried before 'npx'; elsewhere only 'npx' is tried. Throws when none is
    found on PATH.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param()

    # On Windows a bare 'npx' lookup may land on npx.ps1, which mishandles
    # dashed arguments when started through '&' (symptom: 'could not determine
    # executable to run'), so the .cmd shim is preferred there.
    $candidates = @('npx')
    if ($IsWindows) { $candidates = @('npx.cmd', 'npx') }

    foreach ($candidate in $candidates) {
        $found = Get-Command $candidate -ErrorAction SilentlyContinue
        if ($found) { return $found.Source }
    }

    throw "Could not locate the 'npx' executable on PATH."
}
| 234 | |
function Invoke-VallyAgentRun {
    <#
    .SYNOPSIS
    Runs `npx vally eval` for one agent slug and captures its output.

    .DESCRIPTION
    Invokes vally with the stimulus spec
    evals/agent-behavior/stimuli/<Slug>.yml (relative to the current location)
    and the given model, merging stderr into stdout. Every captured line is
    echoed to the host and written to -LogPath (UTF-8 without BOM; the parent
    directory is created when missing). Console output encoding is switched to
    UTF-8 for the duration of the call and restored afterwards.

    .PARAMETER Slug
    Agent slug selecting the stimulus file.

    .PARAMETER LogPath
    File that receives the captured output.

    .PARAMETER Model
    Model id passed to `--model`.

    .OUTPUTS
    Hashtable with ExitCode (the native exit code) and Lines (string[]).
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)] [string]$Slug,
        [Parameter(Mandatory)] [string]$LogPath,
        [Parameter(Mandatory)] [string]$Model
    )

    $npx = Resolve-NpxExecutable
    $vallyArgs = @('vally', 'eval', '--eval-spec', "evals/agent-behavior/stimuli/$Slug.yml", '--model', $Model)
    $prev = [Console]::OutputEncoding
    $prevErrorAction = $ErrorActionPreference
    try {
        [Console]::OutputEncoding = [System.Text.Encoding]::UTF8
        # With 'Stop' in effect, PowerShell 7.0/7.1 turns the first stderr line
        # of a native command redirected via 2>&1 into a terminating error,
        # which would abort the run before the exit code is read. Relax the
        # preference for this call only; the exit code stays the failure signal.
        $ErrorActionPreference = 'Continue'
        $raw = & $npx @vallyArgs 2>&1
        $code = $LASTEXITCODE
    }
    finally {
        $ErrorActionPreference = $prevErrorAction
        [Console]::OutputEncoding = $prev
    }

    # stderr items arrive as ErrorRecord objects; ToString() flattens both kinds.
    $lines = @($raw | ForEach-Object { $_.ToString() })
    foreach ($line in $lines) { Write-Host $line }

    if ($LogPath) {
        $dir = Split-Path -Parent $LogPath
        if ($dir -and -not (Test-Path -LiteralPath $dir)) {
            New-Item -ItemType Directory -Path $dir -Force -WhatIf:$false -Confirm:$false | Out-Null
        }
        Set-Content -LiteralPath $LogPath -Value $lines -Encoding utf8NoBOM -WhatIf:$false -Confirm:$false
    }

    return @{ ExitCode = $code; Lines = $lines }
}
| 269 | |
function Get-GraderStatusesFromLog {
    <#
    .SYNOPSIS
    Extracts per-grader results from captured vally console output.

    .DESCRIPTION
    Recognizes two line shapes after stripping ANSI escape sequences:

      * Rows inside a "Graders (n/m)" block, e.g.
            Graders (2/3)
            ─────────────────────────────────────────
            ✔ field-vocab-present   Output matches pattern /(?i)(title|...)/
            ✘ tracking-file-write   Output does not match pattern /.../
            <blank line>
            1 grader(s) failed.
        The block ends at a blank line or at the "N grader(s) failed" line.
        ✔ maps to 'pass', ✘ to 'fail'; the pattern is pulled from the message
        when it has the "match(es) pattern /.../" form.
      * The textual form "grader <name>: pass|fail|warn|skip", accepted on any
        line that was not consumed as a block row.

    Only the first occurrence of each grader name (case-insensitive) is kept.

    .PARAMETER Lines
    Console lines to scan; null entries are skipped.

    .OUTPUTS
    Hashtables with keys name, status, message, pattern.
    #>
    [CmdletBinding()]
    [OutputType([System.Collections.Generic.List[hashtable]])]
    param([Parameter(Mandatory)] [AllowEmptyCollection()] [AllowEmptyString()] [string[]]$Lines)

    $found = [System.Collections.Generic.List[hashtable]]::new()
    $namesTaken = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)

    $glyphRow = [regex]'^\s*(?<glyph>[\u2714\u2718])\s+(?<name>[\w\.\-:]+)\s+(?<message>.+?)\s*$'
    $legacyRow = [regex]'(?i)grader\s+["'']?(?<name>[\w\.\-:]+)["'']?\s*[:=\-]\s*(?<status>pass|fail|warn|skip)'
    $patternInMessage = [regex]'(?<negation>does not )?match(?:es)? pattern\s+(?<pattern>/.+/)'
    # Vally colorizes its output; escape sequences must go before matching.
    $ansi = [regex]"\x1B\[[0-9;?]*[ -/]*[@-~]"
    $insideBlock = $false

    foreach ($candidate in $Lines) {
        if ($null -eq $candidate) { continue }
        $text = $ansi.Replace([string]$candidate, '')

        if ($text -match '^\s*Graders\s*\(') {
            $insideBlock = $true
            continue
        }

        if ($insideBlock) {
            $closesBlock = [string]::IsNullOrWhiteSpace($text) -or ($text -match '^\s*\d+\s+grader\(s\)\s+failed')
            if ($closesBlock) {
                $insideBlock = $false
                continue
            }

            $row = $glyphRow.Match($text)
            if ($row.Success) {
                $graderName = $row.Groups['name'].Value
                if ($namesTaken.Add($graderName)) {
                    $verdict = 'fail'
                    if ($row.Groups['glyph'].Value -eq [char]0x2714) { $verdict = 'pass' }

                    $note = $row.Groups['message'].Value.Trim()
                    $extracted = ''
                    $patternHit = $patternInMessage.Match($note)
                    if ($patternHit.Success) { $extracted = $patternHit.Groups['pattern'].Value }

                    $found.Add(@{
                        name    = $graderName
                        status  = $verdict
                        message = $note
                        pattern = $extracted
                    })
                }
                # A glyph row is fully handled here, duplicate or not.
                continue
            }
        }

        # Block lines without a glyph row, and all lines outside a block, get
        # a chance at the textual form.
        $legacy = $legacyRow.Match($text)
        if ($legacy.Success) {
            $graderName = $legacy.Groups['name'].Value
            if ($namesTaken.Add($graderName)) {
                $found.Add(@{
                    name    = $graderName
                    status  = $legacy.Groups['status'].Value.ToLowerInvariant()
                    message = ''
                    pattern = ''
                })
            }
        }
    }
    return $found
}
| 339 | |
function Get-VallyOutputDirFromLog {
    <#
    .SYNOPSIS
    Finds the directory reported on vally's "Output directory:" line.

    .DESCRIPTION
    Scans the lines in order and returns the trimmed value of the first
    "Output directory: <dir>" line (case-insensitive). ANSI escape sequences
    are removed before matching, so a colorized line still matches and the
    returned path carries no escape codes.

    .PARAMETER Lines
    Console lines to scan; null entries are skipped.

    .OUTPUTS
    The directory text, or '' when no such line exists.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param([Parameter(Mandatory)] [AllowEmptyCollection()] [AllowEmptyString()] [string[]]$Lines)

    # Same escape-sequence pattern the grader parser in this file strips.
    $ansiRegex = [regex]'\x1B\[[0-9;?]*[ -/]*[@-~]'
    $regex = [regex]'(?im)^\s*Output\s+directory:\s*(?<dir>.+?)\s*$'
    foreach ($line in $Lines) {
        if ($null -eq $line) { continue }
        $clean = $ansiRegex.Replace([string]$line, '')
        $m = $regex.Match($clean)
        if ($m.Success) { return $m.Groups['dir'].Value.Trim() }
    }
    return ''
}
| 353 | |
function Read-VallyTrajectoryDetails {
    <#
    .SYNOPSIS
    Reads prompt, output and grader details from a vally results.jsonl file.

    .DESCRIPTION
    Parses only the FIRST line of <OutputDir>/results.jsonl and extracts:
      * trajectory.stimulus.prompt  -> stimulusPrompt
      * trajectory.output           -> output (non-strings are re-serialized as JSON)
      * gradeResult.details[]       -> richGraders (name, status from 'passed',
                                       evidence, pattern parsed from evidence,
                                       label, kind)
    This is best-effort: a missing directory or file, an unreadable or
    unparsable first line, or a first line that parses to null all yield the
    empty result instead of an error.

    .PARAMETER OutputDir
    Directory reported by vally; may be empty.

    .OUTPUTS
    Hashtable with keys stimulusPrompt, output, richGraders.
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param([Parameter(Mandatory)] [AllowEmptyString()] [string]$OutputDir)

    $empty = @{ stimulusPrompt = ''; output = ''; richGraders = @() }
    if (-not $OutputDir) { return $empty }
    $jsonlPath = Join-Path $OutputDir 'results.jsonl'
    if (-not (Test-Path -LiteralPath $jsonlPath -PathType Leaf)) { return $empty }

    try {
        $first = Get-Content -LiteralPath $jsonlPath -TotalCount 1 -ErrorAction Stop
        if (-not $first) { return $empty }
        $obj = $first | ConvertFrom-Json -Depth 60 -ErrorAction Stop
    } catch {
        Write-Verbose "Failed to parse vally JSONL at $jsonlPath`: $($_.Exception.Message)"
        return $empty
    }

    # A literal JSON null (or whitespace-only line) parses to $null; property
    # probing below would throw on it, so treat it like any other unusable line.
    if ($null -eq $obj) { return $empty }

    $stimPrompt = ''
    if ($obj.PSObject.Properties['trajectory'] -and $obj.trajectory `
            -and $obj.trajectory.PSObject.Properties['stimulus'] -and $obj.trajectory.stimulus `
            -and $obj.trajectory.stimulus.PSObject.Properties['prompt']) {
        $stimPrompt = [string]$obj.trajectory.stimulus.prompt
    }

    $output = ''
    if ($obj.PSObject.Properties['trajectory'] -and $obj.trajectory `
            -and $obj.trajectory.PSObject.Properties['output']) {
        $rawOutput = $obj.trajectory.output
        $output = if ($rawOutput -is [string]) { $rawOutput } else { ($rawOutput | ConvertTo-Json -Depth 12) }
    }

    $rich = [System.Collections.Generic.List[hashtable]]::new()
    $richPatternRegex = [regex]'(?<negation>does not )?match(?:es)? pattern\s+(?<pattern>/.+/)'
    if ($obj.PSObject.Properties['gradeResult'] -and $obj.gradeResult `
            -and $obj.gradeResult.PSObject.Properties['details'] -and $obj.gradeResult.details) {
        foreach ($d in @($obj.gradeResult.details)) {
            if (-not $d) { continue }
            $evidence = if ($d.PSObject.Properties['evidence']) { [string]$d.evidence } else { '' }
            $pattern = ''
            if ($evidence) {
                $pm = $richPatternRegex.Match($evidence)
                if ($pm.Success) { $pattern = $pm.Groups['pattern'].Value }
            }
            $rich.Add(@{
                name     = if ($d.PSObject.Properties['name']) { [string]$d.name } else { '' }
                # No 'passed' property at all is reported as 'unknown', not 'fail'.
                status   = if ($d.PSObject.Properties['passed']) { if ($d.passed) { 'pass' } else { 'fail' } } else { 'unknown' }
                evidence = $evidence
                pattern  = $pattern
                label    = if ($d.PSObject.Properties['label']) { [string]$d.label } else { '' }
                kind     = if ($d.PSObject.Properties['kind']) { [string]$d.kind } else { '' }
            })
        }
    }

    return @{
        stimulusPrompt = $stimPrompt
        output         = $output
        richGraders    = $rich.ToArray()
    }
}
| 416 | |
function Merge-GraderDetails {
    <#
    .SYNOPSIS
    Combines log-parsed grader rows with the richer JSONL grader details.

    .DESCRIPTION
    Every log grader is emitted first, in log order, enriched by the rich entry
    of the same name (case-insensitive): evidence, label and kind are copied;
    status and pattern are taken from the rich entry only when the log row has
    none. Rich graders that never appeared in the log are appended afterwards
    in the order they first occur in -RichGraders, so the result is stable
    from run to run. When several rich entries share a name, the last one
    supplies the values.

    .PARAMETER LogGraders
    Hashtables from the console-log parser (name, status, message, pattern).

    .PARAMETER RichGraders
    Hashtables from results.jsonl (name, status, evidence, pattern, label, kind).

    .OUTPUTS
    Hashtables with keys name, status, message, pattern, evidence, label, kind.
    #>
    [CmdletBinding()]
    [OutputType([System.Collections.Generic.List[hashtable]])]
    param(
        [Parameter(Mandatory)] [AllowEmptyCollection()] [System.Collections.Generic.List[hashtable]]$LogGraders,
        [Parameter(Mandatory)] [AllowEmptyCollection()] [object[]]$RichGraders
    )

    $merged = [System.Collections.Generic.List[hashtable]]::new()
    $richByName = @{}
    foreach ($r in $RichGraders) {
        if (-not $r) { continue }
        $rn = [string]$r['name']
        if ($rn) { $richByName[$rn] = $r }
    }

    foreach ($g in $LogGraders) {
        $name = [string]$g['name']
        $entry = @{
            name     = $name
            status   = [string]$g['status']
            message  = if ($g.ContainsKey('message')) { [string]$g['message'] } else { '' }
            pattern  = if ($g.ContainsKey('pattern')) { [string]$g['pattern'] } else { '' }
            evidence = ''
            label    = ''
            kind     = ''
        }
        if ($richByName.ContainsKey($name)) {
            $r = $richByName[$name]
            $entry['evidence'] = [string]$r['evidence']
            $entry['label'] = [string]$r['label']
            $entry['kind'] = [string]$r['kind']
            if (-not $entry['status']) { $entry['status'] = [string]$r['status'] }
            if (-not $entry['pattern'] -and $r.ContainsKey('pattern')) {
                $entry['pattern'] = [string]$r['pattern']
            }
        }
        $merged.Add($entry)
    }

    # Include rich-only graders that the log parser missed (defensive fallback).
    # Walk the input array rather than the lookup table's keys: hashtable key
    # order is not stable, and it would leak into the written summaries.
    $seen = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
    foreach ($e in $merged) { [void]$seen.Add($e['name']) }
    foreach ($candidate in $RichGraders) {
        if (-not $candidate) { continue }
        $richName = [string]$candidate['name']
        if (-not $richName) { continue }
        if (-not $seen.Add($richName)) { continue }
        $r = $richByName[$richName]
        $evidence = [string]$r['evidence']
        $merged.Add(@{
            name     = $richName
            status   = [string]$r['status']
            message  = $evidence
            pattern  = if ($r.ContainsKey('pattern')) { [string]$r['pattern'] } else { '' }
            evidence = $evidence
            label    = [string]$r['label']
            kind     = [string]$r['kind']
        })
    }
    return $merged
}
| 476 | |
function New-AgentSummary {
    <#
    .SYNOPSIS
    Builds the ordered per-agent summary record.

    .DESCRIPTION
    'overall' is 'fail' when -ExitCode is non-zero or any grader has status
    'fail'; otherwise it is 'pass'. Each grader is normalized to an ordered map
    with name, status, message, pattern, evidence, label and kind, using '' for
    keys the grader does not carry.

    .PARAMETER AgentEntry
    Inventory entry supplying slug, class and cost_tier.

    .PARAMETER ExitCode
    Exit code of the vally run.

    .PARAMETER Graders
    Grader hashtables (may be empty).

    .PARAMETER LogPath
    Path of the captured console log.

    .PARAMETER OutputDir
    Directory reported by vally, stored as vallyOutputDir.

    .PARAMETER StimulusPrompt
    Prompt text sent to the agent.

    .PARAMETER Output
    Agent output text.
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)] [hashtable]$AgentEntry,
        [Parameter(Mandatory)] [int]$ExitCode,
        [Parameter(Mandatory)] [AllowEmptyCollection()] [System.Collections.Generic.List[hashtable]]$Graders,
        [Parameter(Mandatory)] [string]$LogPath,
        [string]$OutputDir = '',
        [string]$StimulusPrompt = '',
        [string]$Output = ''
    )

    $graderFailed = $false
    foreach ($grader in $Graders) {
        if ($grader['status'] -eq 'fail') {
            $graderFailed = $true
            break
        }
    }
    $verdict = 'pass'
    if ($ExitCode -ne 0 -or $graderFailed) { $verdict = 'fail' }

    $optionalFields = 'message', 'pattern', 'evidence', 'label', 'kind'
    $normalized = @(
        foreach ($grader in $Graders) {
            $row = [ordered]@{
                name   = [string]$grader['name']
                status = [string]$grader['status']
            }
            foreach ($field in $optionalFields) {
                $value = ''
                if ($grader.ContainsKey($field)) { $value = [string]$grader[$field] }
                $row[$field] = $value
            }
            $row
        }
    )

    $summary = [ordered]@{}
    $summary['slug'] = [string]$AgentEntry['slug']
    $summary['class'] = [string]$AgentEntry['class']
    $summary['cost_tier'] = [string]$AgentEntry['cost_tier']
    $summary['graders'] = $normalized
    $summary['overall'] = $verdict
    $summary['exitCode'] = $ExitCode
    $summary['logPath'] = $LogPath
    $summary['vallyOutputDir'] = $OutputDir
    $summary['stimulusPrompt'] = $StimulusPrompt
    $summary['output'] = $Output
    return $summary
}
| 522 | |
function New-MatrixSummary {
    <#
    .SYNOPSIS
    Builds the ordered aggregate summary for a matrix run.

    .DESCRIPTION
    'failures' lists the slug of every result whose 'overall' is 'fail'.
    'overall' is -Verdict when one is supplied; otherwise 'fail' when there is
    at least one failure and 'pass' when there is none. 'generatedAt' is the
    current UTC time. 'plannedCommands' is always an array; it is empty when
    -PlannedCommands is omitted.

    .PARAMETER Tier
    Exit-policy tier recorded in the summary.

    .PARAMETER Mode
    Run mode recorded in the summary.

    .PARAMETER Results
    Per-agent result records (may be empty).

    .PARAMETER PlannedCommands
    Command lines planned for the run.

    .PARAMETER Verdict
    Optional override for 'overall'.
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)] [string]$Tier,
        [Parameter(Mandatory)] [string]$Mode,
        [Parameter(Mandatory)] [AllowEmptyCollection()] [System.Collections.Generic.List[hashtable]]$Results,
        [string[]]$PlannedCommands,
        [string]$Verdict
    )

    $failures = @($Results | Where-Object { $_['overall'] -eq 'fail' } | ForEach-Object { [string]$_['slug'] })
    $overall = if ($Verdict) { $Verdict } elseif ($failures.Count -gt 0) { 'fail' } else { 'pass' }

    # An omitted -PlannedCommands is $null, and @($null) is a one-element array
    # that serializes as [null]. Assign explicitly rather than through an
    # if-expression, which would collapse an empty array back to $null.
    $planned = @()
    if ($null -ne $PlannedCommands) { $planned = @($PlannedCommands) }

    return [ordered]@{
        generatedAt     = (Get-Date -AsUTC).ToString('yyyy-MM-ddTHH:mm:ssZ')
        tier            = $Tier
        mode            = $Mode
        agentCount      = $Results.Count
        overall         = $overall
        failures        = $failures
        results         = @($Results)
        plannedCommands = $planned
    }
}
| 548 | |
function Write-SummaryJson {
    <#
    .SYNOPSIS
    Serializes a summary object to a JSON file.

    .DESCRIPTION
    Creates the parent directory of -Path when it does not exist, converts
    -Summary to JSON (depth 12) and writes it as UTF-8 without BOM. The write
    is performed even under -WhatIf / -Confirm.

    .PARAMETER Summary
    Object to serialize.

    .PARAMETER Path
    Destination file path.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)] [object]$Summary,
        [Parameter(Mandatory)] [string]$Path
    )

    $parentDir = Split-Path -Parent $Path
    $mustCreateParent = $parentDir -and -not (Test-Path -LiteralPath $parentDir)
    if ($mustCreateParent) {
        $null = New-Item -ItemType Directory -Path $parentDir -Force -WhatIf:$false -Confirm:$false
    }

    $Summary |
        ConvertTo-Json -Depth 12 |
        Set-Content -LiteralPath $Path -Encoding utf8NoBOM -WhatIf:$false -Confirm:$false
}
| 563 | |
| 564 | #endregion Helper Functions |
| 565 | |
| 566 | #region Main Execution |
# Entry point. Skipped when the file is dot-sourced, so the helper functions can
# be loaded without running anything.
#
# Flow: resolve repo root and output directory -> load the inventory -> resolve
# the slug set -> (nothing to do | dry-run | live runs) -> write the aggregate
# summary -> apply the tier exit policy.
# Exit codes: 0 success/advisory, 1 nightly tier with failures, 3 script error.
if ($MyInvocation.InvocationName -ne '.') {
    try {
        $resolvedRoot = Resolve-RepoRoot -Hint $RepoRoot
        if ($Concurrency -gt 1) {
            Write-Warning "Concurrency > 1 reserved for WI-04; running sequentially."
            $Concurrency = 1
        }

        if (-not $OutputDir) {
            $dateStamp = (Get-Date -AsUTC).ToString('yyyy-MM-dd')
            $OutputDir = Join-Path $resolvedRoot "evals/results/agent-matrix/$dateStamp"
        }
        # Pin a caller-supplied relative path to the invocation directory now;
        # the live-run section below changes location to the repository root.
        $OutputDir = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($OutputDir)
        if (-not (Test-Path -LiteralPath $OutputDir)) {
            New-Item -ItemType Directory -Path $OutputDir -Force -WhatIf:$false -Confirm:$false | Out-Null
        }

        $inventory = Read-AgentInventory -RepoRoot $resolvedRoot
        $inventoryBySlug = @{}
        foreach ($entry in $inventory) { $inventoryBySlug[$entry['slug']] = $entry }

        $slugs = Resolve-SlugSet -RepoRoot $resolvedRoot -Inventory $inventory -ParameterSet $PSCmdlet.ParameterSetName -Changed $Changed

        $mode = $PSCmdlet.ParameterSetName.ToLowerInvariant()
        Write-Host "Agent matrix: mode=$mode tier=$Tier slug_count=$($slugs.Count)" -ForegroundColor Cyan
        Write-Host " Output dir: $OutputDir" -ForegroundColor DarkGray

        $plannedCommands = @($slugs | ForEach-Object { Get-PlannedCommand -Slug $_ -Model $Model })

        $summaryPath = Join-Path $OutputDir 'agent-matrix-summary.json'

        # Nothing resolved: still emit a summary so downstream steps find a file.
        if ($slugs.Count -eq 0) {
            Write-Host "No agent slugs resolved; nothing to evaluate." -ForegroundColor Yellow
            $emptyResults = [System.Collections.Generic.List[hashtable]]::new()
            $verdict = if ($WhatIfPreference) { 'dry-run' } else { 'pass' }
            $summary = New-MatrixSummary -Tier $Tier -Mode $mode -Results $emptyResults -PlannedCommands $plannedCommands -Verdict $verdict
            Write-SummaryJson -Summary $summary -Path $summaryPath
            Write-Host "Summary written: $summaryPath ($verdict)" -ForegroundColor Green
            exit 0
        }

        # -WhatIf: report the plan and cost tiers without running vally.
        if ($WhatIfPreference) {
            Write-Host "Dry-run mode: skipping live vally invocations." -ForegroundColor Yellow
            $dryResults = [System.Collections.Generic.List[hashtable]]::new()
            foreach ($slug in $slugs) {
                $entry = $inventoryBySlug[$slug]
                $cmd = Get-PlannedCommand -Slug $slug -Model $Model
                Write-Host " [$($entry['cost_tier'])] $cmd" -ForegroundColor DarkGray
                $dryResults.Add([ordered]@{
                    slug      = $slug
                    class     = [string]$entry['class']
                    cost_tier = [string]$entry['cost_tier']
                    graders   = @()
                    overall   = 'dry-run'
                    exitCode  = 0
                    logPath   = ''
                })
            }
            $summary = New-MatrixSummary -Tier $Tier -Mode $mode -Results $dryResults -PlannedCommands $plannedCommands -Verdict 'dry-run'
            Write-SummaryJson -Summary $summary -Path $summaryPath
            Write-Host "Dry-run summary written: $summaryPath" -ForegroundColor Green
            exit 0
        }

        $logsRoot = Join-Path $resolvedRoot 'logs/agent-matrix'
        $runId = (Get-Date -AsUTC).ToString('yyyyMMddTHHmmssfffZ')

        $results = [System.Collections.Generic.List[hashtable]]::new()

        # The vally spec path (evals/agent-behavior/stimuli/<slug>.yml) is
        # relative, so run from the repository root; otherwise launching the
        # script from another directory (or with -RepoRoot) cannot find specs.
        Push-Location -LiteralPath $resolvedRoot
        try {
            foreach ($slug in $slugs) {
                $entry = $inventoryBySlug[$slug]
                $logPath = Join-Path $logsRoot "$slug-$runId.log"
                Write-Host "[$slug] running agent-behavior eval" -ForegroundColor Cyan
                $run = Invoke-VallyAgentRun -Slug $slug -LogPath $logPath -Model $Model
                $graders = Get-GraderStatusesFromLog -Lines $run['Lines']
                if ($null -eq $graders) { $graders = [System.Collections.Generic.List[hashtable]]::new() }

                $vallyOutDir = Get-VallyOutputDirFromLog -Lines $run['Lines']
                $details = Read-VallyTrajectoryDetails -OutputDir $vallyOutDir
                if ($details['richGraders'] -and $details['richGraders'].Count -gt 0) {
                    $graders = Merge-GraderDetails -LogGraders $graders -RichGraders $details['richGraders']
                    # The merge emits nothing when no log grader exists and every
                    # rich entry is unnamed; a null here would fail the mandatory
                    # -Graders binding below and abort the whole matrix.
                    if ($null -eq $graders) { $graders = [System.Collections.Generic.List[hashtable]]::new() }
                }

                $summary = New-AgentSummary -AgentEntry $entry -ExitCode $run['ExitCode'] -Graders $graders `
                    -LogPath $logPath -OutputDir $vallyOutDir `
                    -StimulusPrompt $details['stimulusPrompt'] -Output $details['output']

                $perAgentPath = Join-Path $OutputDir "$slug.json"
                Write-SummaryJson -Summary $summary -Path $perAgentPath
                $results.Add($summary)
            }
        }
        finally {
            Pop-Location
        }

        $matrixSummary = New-MatrixSummary -Tier $Tier -Mode $mode -Results $results -PlannedCommands $plannedCommands
        Write-SummaryJson -Summary $matrixSummary -Path $summaryPath
        Write-Host "Summary written: $summaryPath ($($matrixSummary['overall']))" -ForegroundColor Cyan

        # Tier exit policy: 'pr' is advisory; 'nightly' fails the job on any failure.
        if ($Tier -eq 'pr') { exit 0 }
        if ($matrixSummary['overall'] -eq 'fail') {
            Write-Host "Nightly verdict: fail (failures: $($matrixSummary['failures'] -join ', '))" -ForegroundColor Red
            exit 1
        }
        exit 0
    }
    catch {
        Write-Error -ErrorAction Continue "Invoke-AgentMatrix failed: $($_.Exception.Message)"
        exit 3
    }
}
| 673 | #endregion Main Execution |
| 674 | |