microsoft/hve-core
Public · mirrored from https://github.com/microsoft/hve-core · Available
scripts/tests/evals/Invoke-AgentMatrix.Tests.ps1
351lines · modecode
#Requires -Modules Pester
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: MIT

# File-wide setup: resolves the paths that the Describe blocks below read
# through the $script: scope.
BeforeAll {
    # Absolute path three directory levels above this test file's folder.
    $repoRootRelative = Join-Path $PSScriptRoot '../../..'
    $script:RepoRoot = (Resolve-Path $repoRootRelative).Path

    # Script under test, addressed relative to this test file's folder.
    $script:ScriptPath = Join-Path $PSScriptRoot '../../evals/Invoke-AgentMatrix.ps1'

    # Location of AGENTS.yml beneath the resolved repository root.
    $script:InventoryPath = Join-Path $script:RepoRoot 'evals/agent-behavior/AGENTS.yml'
}

# Black-box tests: each Context runs Invoke-AgentMatrix.ps1 with -WhatIf, then
# asserts on the exit code and on the agent-matrix-summary.json read back from
# the per-test output directory.
#
# Arguments are passed by splatting rather than backtick line continuations
# (a stray trailing space after a backtick silently breaks the call), and the
# exit code is captured immediately after the invocation so later commands
# cannot overwrite $LASTEXITCODE before the assertion reads it.
Describe 'Invoke-AgentMatrix.ps1 (dry-run)' -Tag 'Unit' {

    BeforeEach {
        # A uniquely named directory under $TestDrive for every test, so one
        # test never reads a summary file left behind by another.
        $script:OutputDir = Join-Path $TestDrive ("am-" + [Guid]::NewGuid().ToString('N'))
        New-Item -ItemType Directory -Path $script:OutputDir -Force | Out-Null
        $script:SummaryPath = Join-Path $script:OutputDir 'agent-matrix-summary.json'
    }

    Context 'All mode' {
        BeforeEach {
            $matrixArgs = @{
                All       = $true
                Tier      = 'pr'
                RepoRoot  = $script:RepoRoot
                OutputDir = $script:OutputDir
                WhatIf    = $true
            }
            # All output streams are discarded; only the exit code and the
            # summary file are inspected.
            & $script:ScriptPath @matrixArgs *> $null
            $script:ExitCode = $LASTEXITCODE
            $script:Summary = Get-Content -LiteralPath $script:SummaryPath -Raw | ConvertFrom-Json
        }

        It 'Exits with code 0' {
            $script:ExitCode | Should -Be 0
        }

        It 'Writes the aggregate summary JSON' {
            Test-Path -LiteralPath $script:SummaryPath | Should -BeTrue
        }

        It 'Records tier=pr and mode=all' {
            $script:Summary.tier | Should -Be 'pr'
            $script:Summary.mode | Should -Be 'all'
        }

        It 'Reports verdict=dry-run' {
            $script:Summary.overall | Should -Be 'dry-run'
        }

        It 'Enumerates exactly 46 parent agents (DD-09)' {
            # The same count must appear in the counter, the result rows and
            # the planned commands.
            $expectedAgentCount = 46
            $script:Summary.agentCount | Should -Be $expectedAgentCount
            $script:Summary.results.Count | Should -Be $expectedAgentCount
            $script:Summary.plannedCommands.Count | Should -Be $expectedAgentCount
        }

        It 'Records a class and cost_tier for every result row' {
            foreach ($row in $script:Summary.results) {
                $row.slug | Should -Not -BeNullOrEmpty
                $row.class | Should -Not -BeNullOrEmpty
                $row.cost_tier | Should -Not -BeNullOrEmpty
                $row.overall | Should -Be 'dry-run'
            }
        }

        It 'Plans a vally command per slug using --eval-spec for the slug stimulus file' {
            # Only the first planned command is checked against the pattern.
            $first = $script:Summary.plannedCommands[0]
            $first | Should -Match '^npx vally eval --eval-spec evals/agent-behavior/stimuli/[^/]+\.yml$'
        }
    }

    Context 'Changed mode with explicit slugs' {
        BeforeEach {
            $matrixArgs = @{
                Changed   = @('task-researcher', 'task-planner')
                Tier      = 'pr'
                RepoRoot  = $script:RepoRoot
                OutputDir = $script:OutputDir
                WhatIf    = $true
            }
            & $script:ScriptPath @matrixArgs *> $null
            $script:ExitCode = $LASTEXITCODE
            $script:Summary = Get-Content -LiteralPath $script:SummaryPath -Raw | ConvertFrom-Json
        }

        It 'Exits with code 0' {
            $script:ExitCode | Should -Be 0
        }

        It 'Records mode=changed' {
            $script:Summary.mode | Should -Be 'changed'
        }

        It 'Enumerates only the requested known slugs' {
            $script:Summary.agentCount | Should -Be 2
            $slugs = @($script:Summary.results | ForEach-Object { $_.slug })
            $slugs | Should -Contain 'task-researcher'
            $slugs | Should -Contain 'task-planner'
        }
    }

    Context 'Changed mode with no slugs' {
        BeforeEach {
            $matrixArgs = @{
                Changed   = @()
                Tier      = 'pr'
                RepoRoot  = $script:RepoRoot
                OutputDir = $script:OutputDir
                WhatIf    = $true
            }
            & $script:ScriptPath @matrixArgs *> $null
            $script:ExitCode = $LASTEXITCODE
            $script:Summary = Get-Content -LiteralPath $script:SummaryPath -Raw | ConvertFrom-Json
        }

        It 'Exits with code 0' {
            $script:ExitCode | Should -Be 0
        }

        It 'Writes an empty summary' {
            $script:Summary.agentCount | Should -Be 0
            $script:Summary.results.Count | Should -Be 0
        }
    }

    Context 'Nightly tier metadata' {
        BeforeEach {
            $matrixArgs = @{
                All       = $true
                Tier      = 'nightly'
                RepoRoot  = $script:RepoRoot
                OutputDir = $script:OutputDir
                WhatIf    = $true
            }
            & $script:ScriptPath @matrixArgs *> $null
            $script:ExitCode = $LASTEXITCODE
            $script:Summary = Get-Content -LiteralPath $script:SummaryPath -Raw | ConvertFrom-Json
        }

        It 'Records tier=nightly' {
            $script:Summary.tier | Should -Be 'nightly'
        }

        It 'Exits 0 in dry-run even at nightly tier' {
            $script:ExitCode | Should -Be 0
        }
    }

    Context 'Parameter validation' {
        It 'Rejects an unknown tier' {
            # 'weekly' is not one of the tiers used elsewhere in this file (pr, nightly).
            { & $script:ScriptPath -All -Tier 'weekly' -RepoRoot $script:RepoRoot -OutputDir $script:OutputDir -WhatIf } |
                Should -Throw
        }

        It 'Rejects combining -All and -Changed' {
            { & $script:ScriptPath -All -Changed @('task-researcher') -RepoRoot $script:RepoRoot -OutputDir $script:OutputDir -WhatIf } |
                Should -Throw
        }
    }
}

# Unit tests that call the helper functions of Invoke-AgentMatrix.ps1 directly,
# without running the script end to end.
Describe 'Invoke-AgentMatrix helper functions' -Tag 'Unit' {

    BeforeAll {
        # Dot-source the script so the functions called below are defined.
        . $script:ScriptPath
    }

    Context 'Get-GraderStatusesFromLog' {
        It 'Parses pass/fail grader lines' {
            $logLines = @(
                'grader "header-present": pass',
                'grader "scope-adherence": fail',
                'grader "no-source-edit": pass'
            )

            $parsed = @(Get-GraderStatusesFromLog -Lines $logLines)

            $parsed.Count | Should -Be 3
            $headerRow = $parsed | Where-Object { $_['name'] -eq 'header-present' }
            $headerRow.status | Should -Be 'pass'
            $scopeRow = $parsed | Where-Object { $_['name'] -eq 'scope-adherence' }
            $scopeRow.status | Should -Be 'fail'
        }

        It 'Deduplicates repeated grader names' {
            # Same grader reported twice; the test expects a single entry that
            # keeps the status of the first occurrence.
            $logLines = @(
                'grader "header-present": pass',
                'grader "header-present": fail'
            )

            $parsed = @(Get-GraderStatusesFromLog -Lines $logLines)

            $parsed.Count | Should -Be 1
            $parsed[0]['status'] | Should -Be 'pass'
        }

        It 'Returns an empty collection on empty input' {
            $noLines = @()
            $parsed = @(Get-GraderStatusesFromLog -Lines $noLines)
            $parsed.Count | Should -Be 0
        }

        It 'Ignores lines that do not match the grader pattern' {
            $unrelatedLines = @('random log line', 'no grader here')
            $parsed = @(Get-GraderStatusesFromLog -Lines $unrelatedLines)
            $parsed.Count | Should -Be 0
        }
    }

    Context 'New-AgentSummary' {
        BeforeEach {
            # Rebuilt before every test, so a test that appends a grader does
            # not affect the next one.
            $script:Entry = @{
                slug      = 'task-researcher'
                class     = 'research-writer'
                cost_tier = 'light'
            }
            $script:Graders = [System.Collections.Generic.List[hashtable]]::new()
            $script:Graders.Add(@{ name = 'header-present'; status = 'pass' })
        }

        It 'Reports overall=pass when ExitCode=0 and no failing graders' {
            $agentSummary = New-AgentSummary -AgentEntry $script:Entry -ExitCode 0 -Graders $script:Graders -LogPath 'x.log'

            $agentSummary.overall | Should -Be 'pass'
            $agentSummary.slug | Should -Be 'task-researcher'
            $agentSummary.class | Should -Be 'research-writer'
            $agentSummary.cost_tier | Should -Be 'light'
            $agentSummary.logPath | Should -Be 'x.log'
            $agentSummary.exitCode | Should -Be 0
        }

        It 'Reports overall=fail when ExitCode is non-zero' {
            $agentSummary = New-AgentSummary -AgentEntry $script:Entry -ExitCode 2 -Graders $script:Graders -LogPath 'x.log'

            $agentSummary.overall | Should -Be 'fail'
            $agentSummary.exitCode | Should -Be 2
        }

        It 'Reports overall=fail when a grader status is fail even with exit 0' {
            # Add a failing grader on top of the passing one from BeforeEach.
            $script:Graders.Add(@{ name = 'scope'; status = 'fail' })

            $agentSummary = New-AgentSummary -AgentEntry $script:Entry -ExitCode 0 -Graders $script:Graders -LogPath 'x.log'

            $agentSummary.overall | Should -Be 'fail'
        }
    }

    Context 'New-MatrixSummary' {
        It 'Collects failure slugs and sets overall=fail' {
            $rows = [System.Collections.Generic.List[hashtable]]::new()
            $rows.Add(@{ slug = 'a'; overall = 'pass' })
            $rows.Add(@{ slug = 'b'; overall = 'fail' })

            $matrixSummary = New-MatrixSummary -Tier 'nightly' -Mode 'all' -Results $rows -PlannedCommands @('cmd-a','cmd-b')

            $matrixSummary.overall | Should -Be 'fail'
            $matrixSummary.failures | Should -Contain 'b'
            $matrixSummary.agentCount | Should -Be 2
            $matrixSummary.tier | Should -Be 'nightly'
            $matrixSummary.mode | Should -Be 'all'
            $matrixSummary.plannedCommands.Count | Should -Be 2
        }

        It 'Sets overall=pass when all results pass' {
            $rows = [System.Collections.Generic.List[hashtable]]::new()
            $rows.Add(@{ slug = 'a'; overall = 'pass' })
            $rows.Add(@{ slug = 'b'; overall = 'pass' })

            $matrixSummary = New-MatrixSummary -Tier 'pr' -Mode 'changed' -Results $rows -PlannedCommands @()

            $matrixSummary.overall | Should -Be 'pass'
            $matrixSummary.failures.Count | Should -Be 0
        }

        It 'Honors an explicit verdict override' {
            # No result rows at all; the verdict comes from -Verdict.
            $rows = [System.Collections.Generic.List[hashtable]]::new()

            $matrixSummary = New-MatrixSummary -Tier 'pr' -Mode 'all' -Results $rows -PlannedCommands @() -Verdict 'dry-run'

            $matrixSummary.overall | Should -Be 'dry-run'
        }
    }

    Context 'Resolve-SlugSet' {
        BeforeAll {
            # Load the inventory once for the Resolve-SlugSet tests below.
            $script:Inventory = Read-AgentInventory -RepoRoot $script:RepoRoot
        }

        It 'Returns every inventory slug in All mode' {
            $resolved = Resolve-SlugSet -RepoRoot $script:RepoRoot -Inventory $script:Inventory -ParameterSet 'All'

            $resolved.Count | Should -Be $script:Inventory.Count
            $resolved.Count | Should -Be 46
        }

        It 'Filters Changed inputs to known slugs' {
            $requested = @('task-researcher', 'definitely-not-an-agent')

            $resolved = Resolve-SlugSet -RepoRoot $script:RepoRoot -Inventory $script:Inventory -ParameterSet 'Changed' -Changed $requested

            $resolved | Should -Contain 'task-researcher'
            $resolved | Should -Not -Contain 'definitely-not-an-agent'
        }

        It 'Returns an empty array when Changed is empty' {
            $resolved = Resolve-SlugSet -RepoRoot $script:RepoRoot -Inventory $script:Inventory -ParameterSet 'Changed' -Changed @()

            $resolved.Count | Should -Be 0
        }
    }

    Context 'Get-GraderStatusesFromLog pattern extraction' {
        It 'Extracts pattern from positive-match glyph line ("matches pattern ...")' {
            # U+2714 is the glyph that prefixes the grader line in this fixture.
            $checkGlyph = [string][char]0x2714
            $logLines = @(
                'Graders (1/1)',
                " $checkGlyph field-vocab-present Output matches pattern /(?i)(title|description)/",
                ''
            )

            $parsed = @(Get-GraderStatusesFromLog -Lines $logLines)

            $parsed.Count | Should -Be 1
            $entry = $parsed[0]
            $entry['name'] | Should -Be 'field-vocab-present'
            $entry['status'] | Should -Be 'pass'
            $entry['pattern'] | Should -Be '/(?i)(title|description)/'
        }

        It 'Extracts pattern from negative-match glyph line ("does not match pattern ...")' {
            # U+2718 is the glyph that prefixes the grader line in this fixture.
            $crossGlyph = [string][char]0x2718
            $logLines = @(
                'Graders (0/1)',
                " $crossGlyph tracking-file-write Output does not match pattern /(?i)\.copilot-tracking/workitems/",
                ''
            )

            $parsed = @(Get-GraderStatusesFromLog -Lines $logLines)

            $parsed.Count | Should -Be 1
            $entry = $parsed[0]
            $entry['name'] | Should -Be 'tracking-file-write'
            $entry['status'] | Should -Be 'fail'
            $entry['pattern'] | Should -Be '/(?i)\.copilot-tracking/workitems/'
        }
    }

    Context 'Merge-GraderDetails' {
        It 'Preserves log message when rich grader provides only evidence' {
            # Log-derived entry: carries 'message' but no evidence/label/kind.
            $fromLog = [System.Collections.Generic.List[hashtable]]::new()
            $fromLog.Add(@{
                name    = 'field-vocab-present'
                status  = 'pass'
                message = 'Output matches pattern /(?i)(title)/'
                pattern = '/(?i)(title)/'
            })
            # Rich entry for the same grader: carries evidence/label/kind but no 'message'.
            $fromRich = @{
                name     = 'field-vocab-present'
                status   = 'pass'
                evidence = 'Output matches pattern /(?i)(title)/'
                pattern  = '/(?i)(title)/'
                label    = 'vocab'
                kind     = 'regex'
            }

            $merged = @(Merge-GraderDetails -LogGraders $fromLog -RichGraders @($fromRich))

            $merged.Count | Should -Be 1
            $combined = $merged[0]
            $combined['message'] | Should -Be 'Output matches pattern /(?i)(title)/'
            $combined['pattern'] | Should -Be '/(?i)(title)/'
            $combined['evidence'] | Should -Be 'Output matches pattern /(?i)(title)/'
            $combined['label'] | Should -Be 'vocab'
            $combined['kind'] | Should -Be 'regex'
        }

        It 'Backfills pattern from rich grader when log pattern is empty' {
            # The log entry has an empty pattern; the rich entry supplies one.
            $fromLog = [System.Collections.Generic.List[hashtable]]::new()
            $fromLog.Add(@{
                name    = 'no-source-edit'
                status  = 'pass'
                message = 'Output does not match pattern /\.cs/'
                pattern = ''
            })
            $fromRich = @{
                name     = 'no-source-edit'
                status   = 'pass'
                evidence = 'Output does not match pattern /\.cs/'
                pattern  = '/\.cs/'
                label    = ''
                kind     = ''
            }

            $merged = @(Merge-GraderDetails -LogGraders $fromLog -RichGraders @($fromRich))

            $combined = $merged[0]
            $combined['pattern'] | Should -Be '/\.cs/'
            $combined['message'] | Should -Be 'Output does not match pattern /\.cs/'
        }
    }
}
