microsoft/hve-core

Public

mirrored from https://github.com/microsoft/hve-core (Available)

Code · Commits · Issues · Pull requests · Actions · Insights · Security
feat/1637-d-skill-paths

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

scripts/evals/Invoke-AgentMatrix.ps1

673lines · modecode

1#!/usr/bin/env pwsh
2# Copyright (c) Microsoft Corporation.
3# SPDX-License-Identifier: MIT
4
5#Requires -Version 7.0
6
7<#
8.SYNOPSIS
9 Runs the Vally `agent-behavior` suite per parent-agent slug and aggregates
10 a matrix-style summary.
11
12.DESCRIPTION
13 Drives `npx vally eval --eval-spec evals/agent-behavior/stimuli/<slug>.yml` for either
14 a curated set of slugs (`-Changed`) or the full inventory (`-All`).
15 Emits one per-agent summary plus an aggregate `agent-matrix-summary.json`
16 and applies a tier exit policy:
17
18 - `pr` : exit 0 always (advisory).
19 - `nightly` : exit 1 when any agent's `overall` is `fail`; otherwise exit 0.
20
21 `-WhatIf` (dry-run) enumerates the slugs that would be exercised, reports the
22 planned `vally` command lines plus the per-slug `cost_tier` from AGENTS.yml,
23 writes a dry-run summary to the output directory, and exits 0 without
24 invoking any external command.
25
26.PARAMETER All
27 Run the full agent-behavior matrix using slugs from
28 `evals/agent-behavior/AGENTS.yml`.
29
30.PARAMETER Changed
31 Explicit set of changed agent slugs (or paths) to evaluate. Paths are
32 resolved to parent-agent slugs via `Get-AffectedAgentSlugs`. Mutually
33 exclusive with `-All`.
34
35.PARAMETER Tier
36 Exit policy. `pr` (default) always exits 0; `nightly` exits 1 on any
37 `overall: fail`.
38
39.PARAMETER OutputDir
40 Directory for per-agent summary JSON files and the aggregate
41 `agent-matrix-summary.json`. Defaults to
42 `<RepoRoot>/evals/results/agent-matrix/<yyyy-MM-dd>/`.
43
44.PARAMETER Concurrency
45 Reserved for parallel execution (WI-04). Currently runs sequentially;
46 values greater than 1 produce a warning and fall back to 1.
47
48.PARAMETER RepoRoot
49 Repository root. Defaults to `git rev-parse --show-toplevel`.
50
51.PARAMETER Model
52 SDK model id passed to `vally eval --model`. Defaults to
53 `claude-haiku-4.5`.
54
55.EXAMPLE
56 ./Invoke-AgentMatrix.ps1 -All -Tier nightly -WhatIf
57
58 Lists every agent slug, prints planned `vally` commands and per-slug cost
59 tiers, writes a dry-run summary, and exits 0.
60
61.EXAMPLE
62 npm run eval:agent:changed -- -WhatIf
63
64 PR-tier advisory run filtered by git-changed agents.
65
66.NOTES
67 Runs via: npm run eval:agent / npm run eval:agent:matrix / npm run eval:agent:changed
68#>
69
[CmdletBinding(DefaultParameterSetName = 'All', SupportsShouldProcess)]
param(
    # Full-inventory mode; this is the default parameter set.
    [Parameter(ParameterSetName = 'All')]
    [switch] $All,

    # Changed-only mode; an empty array is accepted and resolves to no slugs.
    [Parameter(Mandatory, ParameterSetName = 'Changed')]
    [AllowEmptyCollection()]
    [string[]] $Changed,

    # Exit policy selector.
    [Parameter()]
    [ValidateSet('pr', 'nightly')]
    [string] $Tier = 'pr',

    # Destination for summary JSON; a dated default is computed when omitted.
    [Parameter()]
    [string] $OutputDir,

    # Accepted for forward compatibility; values above 1 are reset to 1 at runtime.
    [Parameter()]
    [ValidateRange(1, 32)]
    [int] $Concurrency = 1,

    # Repository root override; discovered automatically when omitted.
    [Parameter()]
    [string] $RepoRoot,

    # Model id forwarded to `vally eval --model`.
    [Parameter()]
    [string] $Model = 'claude-haiku-4.5'
)

# Fail fast: strict variable/property semantics and terminating errors by default.
Set-StrictMode -Version Latest
$ErrorActionPreference = 'Stop'
99
100#region Helper Functions
101
function Import-YamlModule {
    <#
    .SYNOPSIS
    Loads the 'powershell-yaml' module into the session if it is not already loaded.

    .DESCRIPTION
    Returns immediately when the module is already imported. Throws with an
    installation hint when the module is not available on this machine.
    #>
    [CmdletBinding()]
    param()

    $loaded = Get-Module -Name 'powershell-yaml'
    if ($loaded) {
        return
    }

    $installed = Get-Module -ListAvailable -Name 'powershell-yaml'
    if (-not $installed) {
        throw "Required module 'powershell-yaml' is not installed. Run 'Install-Module powershell-yaml -Scope CurrentUser' before invoking this script."
    }

    $null = Import-Module powershell-yaml -ErrorAction Stop
}
112
function Resolve-RepoRoot {
    <#
    .SYNOPSIS
    Resolves the repository root directory.

    .DESCRIPTION
    Resolution order:
      1. -Hint, when supplied (resolved via Resolve-Path; throws if it does not exist).
      2. `git rev-parse --show-toplevel`, when git succeeds and prints a path.
      3. Two directories above this script ($PSScriptRoot/../..).

    .PARAMETER Hint
    Optional explicit repository root.

    .OUTPUTS
    System.String. The resolved repository root path.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param([string]$Hint)

    if ($Hint) { return (Resolve-Path -LiteralPath $Hint).Path }

    try {
        $gitOutput = & git rev-parse --show-toplevel 2>$null
        if ($LASTEXITCODE -eq 0) {
            # git prints nothing on stdout when it fails or is outside a work tree;
            # coerce to string first so a missing value is handled explicitly
            # instead of via a null-method-call exception.
            $root = ([string]($gitOutput | Select-Object -First 1)).Trim()
            if ($root) { return $root }
        }
    } catch {
        # git missing from PATH (or any other invocation failure): fall through.
        Write-Verbose "git rev-parse failed: $($_.Exception.Message)"
    }

    return (Resolve-Path -LiteralPath (Join-Path $PSScriptRoot '../..')).Path
}
127
function Read-AgentInventory {
    <#
    .SYNOPSIS
    Reads evals/agent-behavior/AGENTS.yml and returns its agent entries.

    .DESCRIPTION
    Each returned entry is a hashtable with the keys slug, path, class and
    cost_tier. Entries without a 'slug' key are skipped. Missing 'path' and
    'class' default to an empty string; a missing 'cost_tier' defaults to 'unknown'.

    .PARAMETER RepoRoot
    Repository root that contains evals/agent-behavior/AGENTS.yml.
    #>
    [CmdletBinding()]
    [OutputType([System.Collections.Generic.List[hashtable]])]
    param([Parameter(Mandatory)] [string]$RepoRoot)

    $inventoryFile = Join-Path $RepoRoot 'evals/agent-behavior/AGENTS.yml'
    if (-not (Test-Path -LiteralPath $inventoryFile)) {
        throw "Agent inventory not found at $inventoryFile. Run scripts/evals/Build-AgentInventory.ps1 to generate."
    }

    Import-YamlModule
    $yamlText = [System.IO.File]::ReadAllText($inventoryFile)
    $document = ConvertFrom-Yaml -Yaml $yamlText

    $hasAgents = $document -and $document.ContainsKey('agents')
    if (-not $hasAgents) {
        throw "Agent inventory at $inventoryFile is missing the 'agents:' collection."
    }

    $agents = [System.Collections.Generic.List[hashtable]]::new()
    foreach ($item in $document['agents']) {
        $usable = $item -and $item.ContainsKey('slug')
        if (-not $usable) { continue }

        # Start from the defaults, then overlay whichever optional fields exist.
        $record = @{
            slug      = [string]$item['slug']
            path      = ''
            class     = ''
            cost_tier = 'unknown'
        }
        foreach ($field in 'path', 'class', 'cost_tier') {
            if ($item.ContainsKey($field)) {
                $record[$field] = [string]$item[$field]
            }
        }
        $agents.Add($record)
    }
    return $agents
}
157
function Resolve-SlugSet {
    <#
    .SYNOPSIS
    Computes the sorted set of agent slugs to evaluate.

    .DESCRIPTION
    In the 'All' parameter set every inventory slug is returned (sorted, unique).
    Otherwise each -Changed item is trimmed and classified: a value that equals a
    known slug and contains no path separator is taken as-is; anything else is
    passed to Get-AffectedAgentSlugs (from Modules/AffectedAgents.psm1) and the
    slugs it yields are kept only when present in the inventory.

    .PARAMETER RepoRoot
    Repository root forwarded to Get-AffectedAgentSlugs.

    .PARAMETER Inventory
    Agent inventory entries; each must expose a 'slug' key.

    .PARAMETER ParameterSet
    Name of the active script parameter set ('All' selects the full inventory).

    .PARAMETER Changed
    Slugs and/or paths to resolve.

    .OUTPUTS
    System.String[]. Always returned as an array, possibly empty.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory)] [string]$RepoRoot,
        [Parameter(Mandatory)] [System.Collections.Generic.List[hashtable]]$Inventory,
        [Parameter(Mandatory)] [string]$ParameterSet,
        [string[]]$Changed
    )

    $ignoreCase = [System.StringComparer]::OrdinalIgnoreCase
    $inventorySlugs = [System.Collections.Generic.HashSet[string]]::new($ignoreCase)
    foreach ($agent in $Inventory) {
        [void]$inventorySlugs.Add($agent['slug'])
    }

    if ($ParameterSet -eq 'All') {
        $everySlug = @($Inventory | ForEach-Object { $_['slug'] } | Sort-Object -Unique)
        return ,[string[]]$everySlug
    }

    $nothingRequested = (-not $Changed) -or ($Changed.Count -eq 0)
    if ($nothingRequested) {
        return ,[string[]]@()
    }

    $selected = [System.Collections.Generic.HashSet[string]]::new($ignoreCase)
    $pathCandidates = [System.Collections.Generic.List[string]]::new()

    foreach ($candidate in $Changed) {
        if ([string]::IsNullOrWhiteSpace($candidate)) { continue }

        $token = $candidate.Trim()
        $isBareSlug = $inventorySlugs.Contains($token) -and ($token -notmatch '[\\/]')
        if ($isBareSlug) {
            [void]$selected.Add($token)
        }
        else {
            $pathCandidates.Add($token)
        }
    }

    if ($pathCandidates.Count -gt 0) {
        $affectedModule = Join-Path $PSScriptRoot 'Modules/AffectedAgents.psm1'
        if (-not (Test-Path -LiteralPath $affectedModule)) {
            throw "Required module not found: $affectedModule"
        }
        Import-Module $affectedModule -Force | Out-Null

        $fromPaths = Get-AffectedAgentSlugs -ChangedFiles $pathCandidates.ToArray() -RepoRoot $RepoRoot
        foreach ($derivedSlug in $fromPaths) {
            # Only slugs that exist in the inventory are eligible.
            if ($inventorySlugs.Contains($derivedSlug)) {
                [void]$selected.Add($derivedSlug)
            }
        }
    }

    $ordered = @($selected | Sort-Object)
    return ,[string[]]$ordered
}
206
function Get-PlannedCommand {
    <#
    .SYNOPSIS
    Builds the display form of the `npx vally eval` command line for one agent slug.

    .PARAMETER Slug
    Agent slug; selects evals/agent-behavior/stimuli/<slug>.yml.

    .PARAMETER Model
    Model id placed after --model.

    .OUTPUTS
    System.String. The command line as text (not executed here).
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)] [string]$Slug,
        [Parameter(Mandatory)] [string]$Model
    )

    $specPath = 'evals/agent-behavior/stimuli/' + $Slug + '.yml'
    return ('npx vally eval --eval-spec {0} --model {1}' -f $specPath, $Model)
}
216
function Resolve-NpxExecutable {
    <#
    .SYNOPSIS
    Locates the npx executable to invoke.

    .OUTPUTS
    System.String. The Source of the resolved command. Throws when npx is not on PATH.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param()

    # On Windows, `Get-Command npx` may resolve to `npx.ps1`, whose argument
    # forwarding is broken when invoked via the `&` call operator (it drops or
    # mangles dashed args and yields 'could not determine executable to run').
    # So `npx.cmd` is tried first on Windows; plain `npx` is the fallback
    # there and the only candidate elsewhere.
    $candidates = [System.Collections.Generic.List[string]]::new()
    if ($IsWindows) { $candidates.Add('npx.cmd') }
    $candidates.Add('npx')

    foreach ($commandName in $candidates) {
        $resolved = Get-Command $commandName -ErrorAction SilentlyContinue
        if ($resolved) { return $resolved.Source }
    }

    throw "Could not locate the 'npx' executable on PATH."
}
234
function Invoke-VallyAgentRun {
    <#
    .SYNOPSIS
    Runs `npx vally eval` for one agent slug, echoes its output, and writes it to a log file.

    .DESCRIPTION
    stdout and stderr are merged, converted to strings, written to the host, and
    saved to -LogPath (the parent directory is created when missing). The spec
    path passed to vally is relative, so it is resolved against the current
    working directory of the process.

    .PARAMETER Slug
    Agent slug; selects evals/agent-behavior/stimuli/<slug>.yml.

    .PARAMETER LogPath
    File that receives the captured output (UTF-8, no BOM).

    .PARAMETER Model
    Model id passed to `vally eval --model`.

    .OUTPUTS
    Hashtable with keys ExitCode (native exit code) and Lines (string[] of output).
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)] [string]$Slug,
        [Parameter(Mandatory)] [string]$LogPath,
        [Parameter(Mandatory)] [string]$Model
    )

    $npx = Resolve-NpxExecutable
    $vallyArgs = @('vally', 'eval', '--eval-spec', "evals/agent-behavior/stimuli/$Slug.yml", '--model', $Model)
    $prev = [Console]::OutputEncoding
    $prevErrorAction = $ErrorActionPreference
    try {
        [Console]::OutputEncoding = [System.Text.Encoding]::UTF8
        # The script runs with $ErrorActionPreference = 'Stop'. On PowerShell
        # 7.0/7.1 (allowed by '#Requires -Version 7.0') stderr lines redirected
        # with 2>&1 honor that preference and abort the call on the first
        # warning npx prints. Relax it for the native call only; the exit code
        # remains the failure signal.
        $ErrorActionPreference = 'Continue'
        $raw = & $npx @vallyArgs 2>&1
        $code = $LASTEXITCODE
    }
    finally {
        $ErrorActionPreference = $prevErrorAction
        [Console]::OutputEncoding = $prev
    }

    # Redirected stderr arrives as ErrorRecord objects; flatten everything to text.
    $lines = @($raw | ForEach-Object { $_.ToString() })
    foreach ($line in $lines) { Write-Host $line }

    if ($LogPath) {
        $dir = Split-Path -Parent $LogPath
        if ($dir -and -not (Test-Path -LiteralPath $dir)) {
            New-Item -ItemType Directory -Path $dir -Force -WhatIf:$false -Confirm:$false | Out-Null
        }
        Set-Content -LiteralPath $LogPath -Value $lines -Encoding utf8NoBOM -WhatIf:$false -Confirm:$false
    }

    return @{ ExitCode = $code; Lines = $lines }
}
269
function Get-GraderStatusesFromLog {
    <#
    .SYNOPSIS
    Extracts per-grader results from captured vally console output.

    .DESCRIPTION
    Returns one hashtable per distinct grader name (first occurrence wins,
    case-insensitive) with the keys name, status, message and pattern.

    .PARAMETER Lines
    Console output lines; null entries are ignored.
    #>
    [CmdletBinding()]
    [OutputType([System.Collections.Generic.List[hashtable]])]
    param([Parameter(Mandatory)] [AllowEmptyCollection()] [AllowEmptyString()] [string[]]$Lines)

    # Vally prints a Graders block per eval, shaped like:
    #   Graders (2/3)
    #   ─────────────────────────────────────────
    #   ✔ field-vocab-present   Output matches pattern /(?i)(title|...)/
    #   ✘ tracking-file-write   Output does not match pattern /(?i)\.copilot-tracking/workitems/
    #   ✔ no-source-edit        Output does not match pattern /(?i)(\.cs|...)/
    #   <blank line>
    #   1 grader(s) failed.
    #
    # The older textual form "grader X: pass" is accepted on any line as well.
    $found = [System.Collections.Generic.List[hashtable]]::new()
    $namesSeen = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)

    $glyphLine = [regex]'^\s*(?<glyph>[\u2714\u2718])\s+(?<name>[\w\.\-:]+)\s+(?<message>.+?)\s*$'
    $legacyLine = [regex]'(?i)grader\s+["'']?(?<name>[\w\.\-:]+)["'']?\s*[:=\-]\s*(?<status>pass|fail|warn|skip)'
    $patternInMessage = [regex]'(?<negation>does not )?match(?:es)? pattern\s+(?<pattern>/.+/)'
    # Console output is colorized with ANSI SGR sequences; remove them before matching.
    $ansiSequence = [regex]"\x1B\[[0-9;?]*[ -/]*[@-~]"

    $insideBlock = $false
    foreach ($entry in $Lines) {
        if ($null -eq $entry) { continue }
        $text = $ansiSequence.Replace([string]$entry, '')

        # Block header.
        if ($text -match '^\s*Graders\s*\(') {
            $insideBlock = $true
            continue
        }

        if ($insideBlock) {
            # A blank line or the "N grader(s) failed" trailer closes the block.
            $closesBlock = ($text -match '^\s*\d+\s+grader\(s\)\s+failed') -or [string]::IsNullOrWhiteSpace($text)
            if ($closesBlock) {
                $insideBlock = $false
                continue
            }

            $glyphHit = $glyphLine.Match($text)
            if ($glyphHit.Success) {
                $graderName = $glyphHit.Groups['name'].Value
                if ($namesSeen.Add($graderName)) {
                    $graderStatus = 'fail'
                    if ($glyphHit.Groups['glyph'].Value -eq [char]0x2714) { $graderStatus = 'pass' }

                    $graderMessage = $glyphHit.Groups['message'].Value.Trim()
                    $graderPattern = ''
                    $patternHit = $patternInMessage.Match($graderMessage)
                    if ($patternHit.Success) { $graderPattern = $patternHit.Groups['pattern'].Value }

                    $found.Add(@{
                            name    = $graderName
                            status  = $graderStatus
                            message = $graderMessage
                            pattern = $graderPattern
                        })
                }
                continue
            }
        }

        # Legacy "grader <name>: <status>" form (inside or outside a block).
        $legacyHit = $legacyLine.Match($text)
        if (-not $legacyHit.Success) { continue }

        $legacyName = $legacyHit.Groups['name'].Value
        if (-not $namesSeen.Add($legacyName)) { continue }

        $found.Add(@{
                name    = $legacyName
                status  = $legacyHit.Groups['status'].Value.ToLowerInvariant()
                message = ''
                pattern = ''
            })
    }
    return $found
}
339
function Get-VallyOutputDirFromLog {
    <#
    .SYNOPSIS
    Finds the directory reported on the first "Output directory: <path>" line.

    .DESCRIPTION
    ANSI SGR color sequences are removed from each line before matching, the
    same way Get-GraderStatusesFromLog treats vally output. Without that step a
    colorized line either fails to match or yields a path that still contains
    escape codes.

    .PARAMETER Lines
    Console output lines; null entries are ignored.

    .OUTPUTS
    System.String. The trimmed directory text, or '' when no line matches.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param([Parameter(Mandatory)] [AllowEmptyCollection()] [AllowEmptyString()] [string[]]$Lines)

    $regex = [regex]'(?im)^\s*Output\s+directory:\s*(?<dir>.+?)\s*$'
    $ansiRegex = [regex]"\x1B\[[0-9;?]*[ -/]*[@-~]"

    foreach ($line in $Lines) {
        if ($null -eq $line) { continue }
        $plain = $ansiRegex.Replace([string]$line, '')
        $m = $regex.Match($plain)
        if ($m.Success) { return $m.Groups['dir'].Value.Trim() }
    }
    return ''
}
353
function Read-VallyTrajectoryDetails {
    <#
    .SYNOPSIS
    Reads the first record of <OutputDir>/results.jsonl and extracts prompt, output and grader details.

    .DESCRIPTION
    Only the first line of results.jsonl is parsed. When the directory is empty,
    the file is missing, the line is blank, or parsing fails, a result with
    empty values is returned instead of throwing.

    .PARAMETER OutputDir
    Directory that vally reported as its output directory (may be empty).

    .OUTPUTS
    Hashtable with keys:
      stimulusPrompt - trajectory.stimulus.prompt as string, or ''
      output         - trajectory.output (non-string values serialized as JSON), or ''
      richGraders    - array of hashtables (name, status, evidence, pattern, label, kind)
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param([Parameter(Mandatory)] [AllowEmptyString()] [string]$OutputDir)

    $nothing = @{ stimulusPrompt = ''; output = ''; richGraders = @() }
    if (-not $OutputDir) { return $nothing }

    $resultsFile = Join-Path $OutputDir 'results.jsonl'
    $fileExists = Test-Path -LiteralPath $resultsFile -PathType Leaf
    if (-not $fileExists) { return $nothing }

    try {
        $firstRecord = Get-Content -LiteralPath $resultsFile -TotalCount 1 -ErrorAction Stop
        if (-not $firstRecord) { return $nothing }
        $record = $firstRecord | ConvertFrom-Json -Depth 60 -ErrorAction Stop
    }
    catch {
        Write-Verbose "Failed to parse vally JSONL at ${resultsFile}: $($_.Exception.Message)"
        return $nothing
    }

    # Property existence is probed through PSObject.Properties so that strict
    # mode does not throw on records that lack a field.
    $trajectory = $null
    if ($record.PSObject.Properties['trajectory']) { $trajectory = $record.trajectory }

    $promptText = ''
    $outputText = ''
    if ($trajectory) {
        $hasPrompt = $trajectory.PSObject.Properties['stimulus'] -and $trajectory.stimulus `
            -and $trajectory.stimulus.PSObject.Properties['prompt']
        if ($hasPrompt) {
            $promptText = [string]$trajectory.stimulus.prompt
        }

        if ($trajectory.PSObject.Properties['output']) {
            $outputValue = $trajectory.output
            if ($outputValue -is [string]) {
                $outputText = $outputValue
            }
            else {
                $outputText = ($outputValue | ConvertTo-Json -Depth 12)
            }
        }
    }

    $graderDetails = [System.Collections.Generic.List[hashtable]]::new()
    $evidencePattern = [regex]'(?<negation>does not )?match(?:es)? pattern\s+(?<pattern>/.+/)'

    $hasDetails = $record.PSObject.Properties['gradeResult'] -and $record.gradeResult `
        -and $record.gradeResult.PSObject.Properties['details'] -and $record.gradeResult.details
    if ($hasDetails) {
        foreach ($detail in @($record.gradeResult.details)) {
            if (-not $detail) { continue }

            $evidenceText = ''
            if ($detail.PSObject.Properties['evidence']) { $evidenceText = [string]$detail.evidence }

            $patternText = ''
            if ($evidenceText) {
                $hit = $evidencePattern.Match($evidenceText)
                if ($hit.Success) { $patternText = $hit.Groups['pattern'].Value }
            }

            $statusText = 'unknown'
            if ($detail.PSObject.Properties['passed']) {
                if ($detail.passed) { $statusText = 'pass' } else { $statusText = 'fail' }
            }

            $nameText = ''
            if ($detail.PSObject.Properties['name']) { $nameText = [string]$detail.name }
            $labelText = ''
            if ($detail.PSObject.Properties['label']) { $labelText = [string]$detail.label }
            $kindText = ''
            if ($detail.PSObject.Properties['kind']) { $kindText = [string]$detail.kind }

            $graderDetails.Add(@{
                    name     = $nameText
                    status   = $statusText
                    evidence = $evidenceText
                    pattern  = $patternText
                    label    = $labelText
                    kind     = $kindText
                })
        }
    }

    return @{
        stimulusPrompt = $promptText
        output         = $outputText
        richGraders    = $graderDetails.ToArray()
    }
}
416
function Merge-GraderDetails {
    <#
    .SYNOPSIS
    Combines graders parsed from the console log with the richer records from results.jsonl.

    .DESCRIPTION
    Every log grader is emitted first, in log order, enriched with evidence,
    label and kind from the rich record of the same name (case-insensitive).
    The log's status and pattern win; the rich values only fill blanks.
    Rich graders that the log did not contain are appended afterwards in the
    order they appear in -RichGraders. An insertion-ordered dictionary is used
    for the lookup so that this order is stable from run to run.

    .PARAMETER LogGraders
    Graders parsed from console output (keys: name, status, message, pattern).

    .PARAMETER RichGraders
    Grader records from results.jsonl (keys: name, status, evidence, pattern, label, kind).

    .OUTPUTS
    Hashtables with keys name, status, message, pattern, evidence, label, kind.
    #>
    [CmdletBinding()]
    [OutputType([System.Collections.Generic.List[hashtable]])]
    param(
        [Parameter(Mandatory)] [AllowEmptyCollection()] [System.Collections.Generic.List[hashtable]]$LogGraders,
        [Parameter(Mandatory)] [AllowEmptyCollection()] [object[]]$RichGraders
    )

    $merged = [System.Collections.Generic.List[hashtable]]::new()

    # [ordered] keeps first-insertion order (and, like @{}, compares keys
    # case-insensitively); a plain hashtable enumerates in unspecified order.
    $richByName = [ordered]@{}
    foreach ($r in $RichGraders) {
        if (-not $r) { continue }
        $rn = [string]$r['name']
        if ($rn) { $richByName[$rn] = $r }
    }

    foreach ($g in $LogGraders) {
        $name = [string]$g['name']
        $entry = @{
            name     = $name
            status   = [string]$g['status']
            message  = if ($g.ContainsKey('message')) { [string]$g['message'] } else { '' }
            pattern  = if ($g.ContainsKey('pattern')) { [string]$g['pattern'] } else { '' }
            evidence = ''
            label    = ''
            kind     = ''
        }
        if ($richByName.Contains($name)) {
            $r = $richByName[$name]
            $entry['evidence'] = [string]$r['evidence']
            $entry['label'] = [string]$r['label']
            $entry['kind'] = [string]$r['kind']
            if (-not $entry['status']) { $entry['status'] = [string]$r['status'] }
            if (-not $entry['pattern'] -and $r.ContainsKey('pattern')) {
                $entry['pattern'] = [string]$r['pattern']
            }
        }
        $merged.Add($entry)
    }

    # Include rich-only graders that the log parser missed (defensive fallback).
    $seen = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
    foreach ($e in $merged) { [void]$seen.Add($e['name']) }
    foreach ($name in $richByName.Keys) {
        if ($seen.Contains($name)) { continue }
        $r = $richByName[$name]
        $evidence = [string]$r['evidence']
        $merged.Add(@{
                name     = $name
                status   = [string]$r['status']
                message  = $evidence
                pattern  = if ($r.ContainsKey('pattern')) { [string]$r['pattern'] } else { '' }
                evidence = $evidence
                label    = [string]$r['label']
                kind     = [string]$r['kind']
            })
    }
    return $merged
}
476
function New-AgentSummary {
    <#
    .SYNOPSIS
    Builds the per-agent summary record that is serialized to <slug>.json.

    .DESCRIPTION
    'overall' is 'fail' when the exit code is non-zero or any grader has status
    'fail'; otherwise it is 'pass'. Grader entries are normalized to a fixed,
    ordered set of string fields, with '' for any field a grader lacks.

    .PARAMETER AgentEntry
    Inventory entry supplying slug, class and cost_tier.

    .PARAMETER ExitCode
    Exit code of the vally run.

    .PARAMETER Graders
    Grader hashtables (name/status required; other fields optional).

    .PARAMETER LogPath
    Path of the captured console log.

    .PARAMETER OutputDir
    Directory vally reported as its output directory.

    .PARAMETER StimulusPrompt
    Prompt text taken from the trajectory.

    .PARAMETER Output
    Agent output taken from the trajectory.
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)] [hashtable]$AgentEntry,
        [Parameter(Mandatory)] [int]$ExitCode,
        [Parameter(Mandatory)] [AllowEmptyCollection()] [System.Collections.Generic.List[hashtable]]$Graders,
        [Parameter(Mandatory)] [string]$LogPath,
        [string]$OutputDir = '',
        [string]$StimulusPrompt = '',
        [string]$Output = ''
    )

    # Verdict: a clean exit can still be downgraded by a failing grader.
    $anyGraderFailed = $false
    foreach ($grader in $Graders) {
        if ($grader['status'] -eq 'fail') {
            $anyGraderFailed = $true
            break
        }
    }
    $verdict = 'pass'
    if ($ExitCode -ne 0 -or $anyGraderFailed) { $verdict = 'fail' }

    $optionalFields = 'message', 'pattern', 'evidence', 'label', 'kind'
    $normalizedGraders = @(
        foreach ($grader in $Graders) {
            $row = [ordered]@{
                name   = [string]$grader['name']
                status = [string]$grader['status']
            }
            foreach ($field in $optionalFields) {
                if ($grader.ContainsKey($field)) {
                    $row[$field] = [string]$grader[$field]
                }
                else {
                    $row[$field] = ''
                }
            }
            $row
        }
    )

    $summary = [ordered]@{}
    $summary['slug'] = [string]$AgentEntry['slug']
    $summary['class'] = [string]$AgentEntry['class']
    $summary['cost_tier'] = [string]$AgentEntry['cost_tier']
    $summary['graders'] = $normalizedGraders
    $summary['overall'] = $verdict
    $summary['exitCode'] = $ExitCode
    $summary['logPath'] = $LogPath
    $summary['vallyOutputDir'] = $OutputDir
    $summary['stimulusPrompt'] = $StimulusPrompt
    $summary['output'] = $Output
    return $summary
}
522
function New-MatrixSummary {
    <#
    .SYNOPSIS
    Builds the aggregate record that is serialized to agent-matrix-summary.json.

    .DESCRIPTION
    'failures' lists the slug of every result whose 'overall' is 'fail'.
    'overall' is -Verdict when supplied; otherwise 'fail' if there are failures,
    else 'pass'. 'generatedAt' is the current UTC time in ISO-8601 form.

    .PARAMETER Tier
    Exit-policy tier recorded in the summary.

    .PARAMETER Mode
    Run mode recorded in the summary.

    .PARAMETER Results
    Per-agent summaries (each exposing 'slug' and 'overall').

    .PARAMETER PlannedCommands
    Command lines that were (or would be) executed. Omitting it yields an empty list.

    .PARAMETER Verdict
    Optional override for 'overall' (for example 'dry-run').
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)] [string]$Tier,
        [Parameter(Mandatory)] [string]$Mode,
        [Parameter(Mandatory)] [AllowEmptyCollection()] [System.Collections.Generic.List[hashtable]]$Results,
        [string[]]$PlannedCommands,
        [string]$Verdict
    )

    $failures = @($Results | Where-Object { $_['overall'] -eq 'fail' } | ForEach-Object { [string]$_['slug'] })
    $overall = if ($Verdict) { $Verdict } elseif ($failures.Count -gt 0) { 'fail' } else { 'pass' }

    # @($null) is a one-element array, which would serialize as [null] when the
    # parameter is omitted; map "not supplied" to an empty list instead.
    $planned = @()
    if ($null -ne $PlannedCommands) { $planned = @($PlannedCommands) }

    # [DateTime]::UtcNow instead of `Get-Date -AsUTC`: -AsUTC only exists from
    # PowerShell 7.1, while this script declares '#Requires -Version 7.0'.
    # InvariantCulture keeps the timestamp independent of the host's calendar
    # and time-separator settings.
    $generatedAt = [DateTime]::UtcNow.ToString(
        "yyyy-MM-dd'T'HH:mm:ss'Z'",
        [System.Globalization.CultureInfo]::InvariantCulture)

    return [ordered]@{
        generatedAt     = $generatedAt
        tier            = $Tier
        mode            = $Mode
        agentCount      = $Results.Count
        overall         = $overall
        failures        = $failures
        results         = @($Results)
        plannedCommands = $planned
    }
}
548
function Write-SummaryJson {
    <#
    .SYNOPSIS
    Serializes a summary object to JSON (depth 12) and writes it to a file as UTF-8 without BOM.

    .DESCRIPTION
    The parent directory is created when it does not exist. Directory creation
    and the file write both pass -WhatIf:$false, so they also happen when the
    caller runs with -WhatIf.

    .PARAMETER Summary
    Object to serialize.

    .PARAMETER Path
    Destination file path.
    #>
    [CmdletBinding()]
    param(
        [Parameter(Mandatory)] [object]$Summary,
        [Parameter(Mandatory)] [string]$Path
    )

    $parentDir = Split-Path -Parent $Path
    $mustCreateDir = $parentDir -and -not (Test-Path -LiteralPath $parentDir)
    if ($mustCreateDir) {
        $null = New-Item -ItemType Directory -Path $parentDir -Force -WhatIf:$false -Confirm:$false
    }

    $payload = $Summary | ConvertTo-Json -Depth 12
    Set-Content -LiteralPath $Path -Value $payload -Encoding utf8NoBOM -WhatIf:$false -Confirm:$false
}
563
564#endregion Helper Functions
565
566#region Main Execution
# Skip execution when the file is dot-sourced, so the helper functions can be
# loaded without running the matrix.
if ($MyInvocation.InvocationName -ne '.') {
    try {
        # Timestamps come from [DateTime]::UtcNow rather than `Get-Date -AsUTC`:
        # -AsUTC only exists from PowerShell 7.1, while this script declares
        # '#Requires -Version 7.0'. InvariantCulture keeps directory and log
        # names independent of the host's calendar settings.
        $invariantCulture = [System.Globalization.CultureInfo]::InvariantCulture

        $resolvedRoot = Resolve-RepoRoot -Hint $RepoRoot
        if ($Concurrency -gt 1) {
            Write-Warning "Concurrency > 1 reserved for WI-04; running sequentially."
            $Concurrency = 1
        }

        if (-not $OutputDir) {
            $dateStamp = [DateTime]::UtcNow.ToString('yyyy-MM-dd', $invariantCulture)
            $OutputDir = Join-Path $resolvedRoot "evals/results/agent-matrix/$dateStamp"
        }
        if (-not (Test-Path -LiteralPath $OutputDir)) {
            # -WhatIf:$false: the output directory is needed even for a dry run,
            # because the dry-run summary is written there.
            New-Item -ItemType Directory -Path $OutputDir -Force -WhatIf:$false -Confirm:$false | Out-Null
        }

        $inventory = Read-AgentInventory -RepoRoot $resolvedRoot
        $inventoryBySlug = @{}
        foreach ($entry in $inventory) { $inventoryBySlug[$entry['slug']] = $entry }

        $slugs = Resolve-SlugSet -RepoRoot $resolvedRoot -Inventory $inventory -ParameterSet $PSCmdlet.ParameterSetName -Changed $Changed

        $mode = $PSCmdlet.ParameterSetName.ToLowerInvariant()
        Write-Host "Agent matrix: mode=$mode tier=$Tier slug_count=$($slugs.Count)" -ForegroundColor Cyan
        Write-Host "  Output dir: $OutputDir" -ForegroundColor DarkGray

        $plannedCommands = @($slugs | ForEach-Object { Get-PlannedCommand -Slug $_ -Model $Model })

        $summaryPath = Join-Path $OutputDir 'agent-matrix-summary.json'

        # Nothing to do: still emit a summary so downstream steps find a file.
        if ($slugs.Count -eq 0) {
            Write-Host "No agent slugs resolved; nothing to evaluate." -ForegroundColor Yellow
            $emptyResults = [System.Collections.Generic.List[hashtable]]::new()
            $verdict = if ($WhatIfPreference) { 'dry-run' } else { 'pass' }
            $summary = New-MatrixSummary -Tier $Tier -Mode $mode -Results $emptyResults -PlannedCommands $plannedCommands -Verdict $verdict
            Write-SummaryJson -Summary $summary -Path $summaryPath
            Write-Host "Summary written: $summaryPath ($verdict)" -ForegroundColor Green
            exit 0
        }

        # Dry run: list the planned commands and cost tiers, write the summary, stop.
        if ($WhatIfPreference) {
            Write-Host "Dry-run mode: skipping live vally invocations." -ForegroundColor Yellow
            $dryResults = [System.Collections.Generic.List[hashtable]]::new()
            foreach ($slug in $slugs) {
                $entry = $inventoryBySlug[$slug]
                $cmd = Get-PlannedCommand -Slug $slug -Model $Model
                Write-Host "  [$($entry['cost_tier'])] $cmd" -ForegroundColor DarkGray
                $dryResults.Add([ordered]@{
                        slug      = $slug
                        class     = [string]$entry['class']
                        cost_tier = [string]$entry['cost_tier']
                        graders   = @()
                        overall   = 'dry-run'
                        exitCode  = 0
                        logPath   = ''
                    })
            }
            $summary = New-MatrixSummary -Tier $Tier -Mode $mode -Results $dryResults -PlannedCommands $plannedCommands -Verdict 'dry-run'
            Write-SummaryJson -Summary $summary -Path $summaryPath
            Write-Host "Dry-run summary written: $summaryPath" -ForegroundColor Green
            exit 0
        }

        # Live run: one sequential vally invocation per slug.
        $logsRoot = Join-Path $resolvedRoot 'logs/agent-matrix'
        $runId = [DateTime]::UtcNow.ToString("yyyyMMdd'T'HHmmssfff'Z'", $invariantCulture)

        $results = [System.Collections.Generic.List[hashtable]]::new()
        foreach ($slug in $slugs) {
            $entry = $inventoryBySlug[$slug]
            $logPath = Join-Path $logsRoot "$slug-$runId.log"
            Write-Host "[$slug] running agent-behavior eval" -ForegroundColor Cyan
            $run = Invoke-VallyAgentRun -Slug $slug -LogPath $logPath -Model $Model
            $graders = Get-GraderStatusesFromLog -Lines $run['Lines']
            # An empty list is unrolled to $null by the pipeline; restore a real list.
            if ($null -eq $graders) { $graders = [System.Collections.Generic.List[hashtable]]::new() }

            $vallyOutDir = Get-VallyOutputDirFromLog -Lines $run['Lines']
            $details = Read-VallyTrajectoryDetails -OutputDir $vallyOutDir
            if ($details['richGraders'] -and $details['richGraders'].Count -gt 0) {
                $graders = Merge-GraderDetails -LogGraders $graders -RichGraders $details['richGraders']
            }

            $summary = New-AgentSummary -AgentEntry $entry -ExitCode $run['ExitCode'] -Graders $graders `
                -LogPath $logPath -OutputDir $vallyOutDir `
                -StimulusPrompt $details['stimulusPrompt'] -Output $details['output']

            $perAgentPath = Join-Path $OutputDir "$slug.json"
            Write-SummaryJson -Summary $summary -Path $perAgentPath
            $results.Add($summary)
        }

        $matrixSummary = New-MatrixSummary -Tier $Tier -Mode $mode -Results $results -PlannedCommands $plannedCommands
        Write-SummaryJson -Summary $matrixSummary -Path $summaryPath
        Write-Host "Summary written: $summaryPath ($($matrixSummary['overall']))" -ForegroundColor Cyan

        # Tier exit policy: 'pr' is advisory; 'nightly' fails on any agent failure.
        if ($Tier -eq 'pr') { exit 0 }
        if ($matrixSummary['overall'] -eq 'fail') {
            Write-Host "Nightly verdict: fail (failures: $($matrixSummary['failures'] -join ', '))" -ForegroundColor Red
            exit 1
        }
        exit 0
    }
    catch {
        # Any unexpected failure (in either tier) is reported and mapped to exit code 3.
        Write-Error -ErrorAction Continue "Invoke-AgentMatrix failed: $($_.Exception.Message)"
        exit 3
    }
}
673#endregion Main Execution
674