microsoft/hve-core

Public

mirrored from https://github.com/microsoft/hve-core · Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
feat/1637-b-tracking-paths

Branches

Tags

  • No tags available.
0 Branches · 0 Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

scripts/tests/evals/Invoke-AgentMatrix.Tests.ps1

351 lines · mode: code

1#Requires -Modules Pester
2# Copyright (c) Microsoft Corporation.
3# SPDX-License-Identifier: MIT
4
# File-level fixtures: resolve the script under test, the repository root, and
# the agent inventory path once, relative to this test file's directory.
BeforeAll {
    $repoRootRelative = Join-Path $PSScriptRoot '../../..'

    $script:ScriptPath = Join-Path $PSScriptRoot '../../evals/Invoke-AgentMatrix.ps1'
    # Resolve-Path normalizes the '..' segments into an absolute path string.
    $script:RepoRoot = (Resolve-Path -Path $repoRootRelative).Path
    $script:InventoryPath = Join-Path $script:RepoRoot 'evals/agent-behavior/AGENTS.yml'
}
10
# End-to-end checks of Invoke-AgentMatrix.ps1 run with -WhatIf. Each test gets
# its own output directory and asserts on the summary JSON the script writes.
Describe 'Invoke-AgentMatrix.ps1 (dry-run)' -Tag 'Unit' {

    BeforeAll {
        # Runs the script under test with -WhatIf against the per-test output
        # directory, then stores the exit code in $script:ExitCode and the
        # parsed summary JSON in $script:Summary for the assertions.
        #   -Selection : agent-selection parameters to splat onto the script,
        #                e.g. @{ All = $true } or @{ Changed = @('slug') }.
        #   -Tier      : value passed to the script's -Tier parameter.
        function Invoke-MatrixDryRun {
            param(
                [Parameter(Mandatory)]
                [hashtable] $Selection,

                [string] $Tier = 'pr'
            )

            # Splatting instead of backtick continuations: a stray trailing
            # space after a backtick silently breaks the invocation.
            $scriptArgs = @{
                Tier      = $Tier
                RepoRoot  = $script:RepoRoot
                OutputDir = $script:OutputDir
                WhatIf    = $true
            }
            foreach ($name in $Selection.Keys) {
                $scriptArgs[$name] = $Selection[$name]
            }

            & $script:ScriptPath @scriptArgs *> $null

            # Capture right away so the assertion does not depend on nothing
            # else touching the global $LASTEXITCODE before the It block runs.
            $script:ExitCode = $LASTEXITCODE

            # Leave Summary as $null when the file is missing; the dedicated
            # Test-Path assertion reports that case instead of a setup error.
            $script:Summary = $null
            if (Test-Path -LiteralPath $script:SummaryPath) {
                $script:Summary = Get-Content -LiteralPath $script:SummaryPath -Raw | ConvertFrom-Json
            }
        }
    }

    BeforeEach {
        # Unique directory per test so one case never reads another's summary.
        $script:OutputDir = Join-Path $TestDrive ("am-" + [Guid]::NewGuid().ToString('N'))
        New-Item -ItemType Directory -Path $script:OutputDir -Force | Out-Null
        $script:SummaryPath = Join-Path $script:OutputDir 'agent-matrix-summary.json'
    }

    Context 'All mode' {
        BeforeEach {
            Invoke-MatrixDryRun -Selection @{ All = $true }
        }

        It 'Exits with code 0' {
            $script:ExitCode | Should -Be 0
        }

        It 'Writes the aggregate summary JSON' {
            Test-Path -LiteralPath $script:SummaryPath | Should -BeTrue
        }

        It 'Records tier=pr and mode=all' {
            $script:Summary.tier | Should -Be 'pr'
            $script:Summary.mode | Should -Be 'all'
        }

        It 'Reports verdict=dry-run' {
            $script:Summary.overall | Should -Be 'dry-run'
        }

        It 'Enumerates exactly 46 parent agents (DD-09)' {
            $script:Summary.agentCount | Should -Be 46
            $script:Summary.results.Count | Should -Be 46
            $script:Summary.plannedCommands.Count | Should -Be 46
        }

        It 'Records a class and cost_tier for every result row' {
            foreach ($row in $script:Summary.results) {
                $row.slug | Should -Not -BeNullOrEmpty
                $row.class | Should -Not -BeNullOrEmpty
                $row.cost_tier | Should -Not -BeNullOrEmpty
                $row.overall | Should -Be 'dry-run'
            }
        }

        It 'Plans a vally command per slug using --eval-spec for the slug stimulus file' {
            $first = $script:Summary.plannedCommands[0]
            $first | Should -Match '^npx vally eval --eval-spec evals/agent-behavior/stimuli/[^/]+\.yml$'
        }
    }

    Context 'Changed mode with explicit slugs' {
        BeforeEach {
            Invoke-MatrixDryRun -Selection @{ Changed = @('task-researcher', 'task-planner') }
        }

        It 'Exits with code 0' {
            $script:ExitCode | Should -Be 0
        }

        It 'Records mode=changed' {
            $script:Summary.mode | Should -Be 'changed'
        }

        It 'Enumerates only the requested known slugs' {
            $script:Summary.agentCount | Should -Be 2
            $slugs = @($script:Summary.results | ForEach-Object { $_.slug })
            $slugs | Should -Contain 'task-researcher'
            $slugs | Should -Contain 'task-planner'
        }
    }

    Context 'Changed mode with no slugs' {
        BeforeEach {
            Invoke-MatrixDryRun -Selection @{ Changed = @() }
        }

        It 'Exits with code 0' {
            $script:ExitCode | Should -Be 0
        }

        It 'Writes an empty summary' {
            $script:Summary.agentCount | Should -Be 0
            $script:Summary.results.Count | Should -Be 0
        }
    }

    Context 'Nightly tier metadata' {
        BeforeEach {
            Invoke-MatrixDryRun -Selection @{ All = $true } -Tier 'nightly'
        }

        It 'Records tier=nightly' {
            $script:Summary.tier | Should -Be 'nightly'
        }

        It 'Exits 0 in dry-run even at nightly tier' {
            $script:ExitCode | Should -Be 0
        }
    }

    Context 'Parameter validation' {
        # These cases call the script directly (not through the helper) so the
        # exact parameter combination under test is visible, and output is not
        # redirected.
        It 'Rejects an unknown tier' {
            {
                $badTier = @{
                    All       = $true
                    Tier      = 'weekly'
                    RepoRoot  = $script:RepoRoot
                    OutputDir = $script:OutputDir
                    WhatIf    = $true
                }
                & $script:ScriptPath @badTier
            } | Should -Throw
        }

        It 'Rejects combining -All and -Changed' {
            {
                $conflicting = @{
                    All       = $true
                    Changed   = @('task-researcher')
                    RepoRoot  = $script:RepoRoot
                    OutputDir = $script:OutputDir
                    WhatIf    = $true
                }
                & $script:ScriptPath @conflicting
            } | Should -Throw
        }
    }
}
148
# Unit tests that call the helper functions of Invoke-AgentMatrix.ps1 directly.
Describe 'Invoke-AgentMatrix helper functions' -Tag 'Unit' {

    BeforeAll {
        # Dot-source the script so the functions exercised below are in scope.
        . $script:ScriptPath
    }

    # Log parsing for the `grader "<name>": <status>` line format.
    Context 'Get-GraderStatusesFromLog' {
        It 'Parses pass/fail grader lines' {
            $logLines = @(
                'grader "header-present": pass',
                'grader "scope-adherence": fail',
                'grader "no-source-edit": pass'
            )

            $parsed = @(Get-GraderStatusesFromLog -Lines $logLines)

            $parsed.Count | Should -Be 3
            $headerPresent = $parsed | Where-Object { $_['name'] -eq 'header-present' }
            $headerPresent.status | Should -Be 'pass'
            $scopeAdherence = $parsed | Where-Object { $_['name'] -eq 'scope-adherence' }
            $scopeAdherence.status | Should -Be 'fail'
        }

        It 'Deduplicates repeated grader names' {
            $logLines = @(
                'grader "header-present": pass',
                'grader "header-present": fail'
            )

            $parsed = @(Get-GraderStatusesFromLog -Lines $logLines)

            $parsed.Count | Should -Be 1
            # The status of the first occurrence is the one expected to survive.
            $parsed[0]['status'] | Should -Be 'pass'
        }

        It 'Returns an empty collection on empty input' {
            $noLines = @()
            $parsed = @(Get-GraderStatusesFromLog -Lines $noLines)
            $parsed.Count | Should -Be 0
        }

        It 'Ignores lines that do not match the grader pattern' {
            $unrelated = @('random log line', 'no grader here')
            $parsed = @(Get-GraderStatusesFromLog -Lines $unrelated)
            $parsed.Count | Should -Be 0
        }
    }

    # Per-agent verdict derived from the exit code and grader statuses.
    Context 'New-AgentSummary' {
        BeforeEach {
            $script:Entry = @{ slug = 'task-researcher'; class = 'research-writer'; cost_tier = 'light' }
            $script:Graders = [System.Collections.Generic.List[hashtable]]::new()
            $script:Graders.Add(@{ name = 'header-present'; status = 'pass' })
        }

        It 'Reports overall=pass when ExitCode=0 and no failing graders' {
            $summaryArgs = @{
                AgentEntry = $script:Entry
                ExitCode   = 0
                Graders    = $script:Graders
                LogPath    = 'x.log'
            }

            $agentSummary = New-AgentSummary @summaryArgs

            $agentSummary.overall | Should -Be 'pass'
            $agentSummary.slug | Should -Be 'task-researcher'
            $agentSummary.class | Should -Be 'research-writer'
            $agentSummary.cost_tier | Should -Be 'light'
            $agentSummary.logPath | Should -Be 'x.log'
            $agentSummary.exitCode | Should -Be 0
        }

        It 'Reports overall=fail when ExitCode is non-zero' {
            $summaryArgs = @{
                AgentEntry = $script:Entry
                ExitCode   = 2
                Graders    = $script:Graders
                LogPath    = 'x.log'
            }

            $agentSummary = New-AgentSummary @summaryArgs

            $agentSummary.overall | Should -Be 'fail'
            $agentSummary.exitCode | Should -Be 2
        }

        It 'Reports overall=fail when a grader status is fail even with exit 0' {
            $script:Graders.Add(@{ name = 'scope'; status = 'fail' })
            $summaryArgs = @{
                AgentEntry = $script:Entry
                ExitCode   = 0
                Graders    = $script:Graders
                LogPath    = 'x.log'
            }

            $agentSummary = New-AgentSummary @summaryArgs

            $agentSummary.overall | Should -Be 'fail'
        }
    }

    # Aggregate verdict across all agent results.
    Context 'New-MatrixSummary' {
        It 'Collects failure slugs and sets overall=fail' {
            $rows = [System.Collections.Generic.List[hashtable]]::new()
            $rows.Add(@{ slug = 'a'; overall = 'pass' })
            $rows.Add(@{ slug = 'b'; overall = 'fail' })
            $matrixArgs = @{
                Tier            = 'nightly'
                Mode            = 'all'
                Results         = $rows
                PlannedCommands = @('cmd-a', 'cmd-b')
            }

            $matrix = New-MatrixSummary @matrixArgs

            $matrix.overall | Should -Be 'fail'
            $matrix.failures | Should -Contain 'b'
            $matrix.agentCount | Should -Be 2
            $matrix.tier | Should -Be 'nightly'
            $matrix.mode | Should -Be 'all'
            $matrix.plannedCommands.Count | Should -Be 2
        }

        It 'Sets overall=pass when all results pass' {
            $rows = [System.Collections.Generic.List[hashtable]]::new()
            $rows.Add(@{ slug = 'a'; overall = 'pass' })
            $rows.Add(@{ slug = 'b'; overall = 'pass' })
            $matrixArgs = @{
                Tier            = 'pr'
                Mode            = 'changed'
                Results         = $rows
                PlannedCommands = @()
            }

            $matrix = New-MatrixSummary @matrixArgs

            $matrix.overall | Should -Be 'pass'
            $matrix.failures.Count | Should -Be 0
        }

        It 'Honors an explicit verdict override' {
            $rows = [System.Collections.Generic.List[hashtable]]::new()
            $matrixArgs = @{
                Tier            = 'pr'
                Mode            = 'all'
                Results         = $rows
                PlannedCommands = @()
                Verdict         = 'dry-run'
            }

            $matrix = New-MatrixSummary @matrixArgs

            $matrix.overall | Should -Be 'dry-run'
        }
    }

    # Slug selection against the inventory read from the repository.
    Context 'Resolve-SlugSet' {
        BeforeAll {
            $script:Inventory = Read-AgentInventory -RepoRoot $script:RepoRoot
        }

        It 'Returns every inventory slug in All mode' {
            $resolveArgs = @{
                RepoRoot     = $script:RepoRoot
                Inventory    = $script:Inventory
                ParameterSet = 'All'
            }

            $slugs = Resolve-SlugSet @resolveArgs

            $slugs.Count | Should -Be $script:Inventory.Count
            $slugs.Count | Should -Be 46
        }

        It 'Filters Changed inputs to known slugs' {
            $resolveArgs = @{
                RepoRoot     = $script:RepoRoot
                Inventory    = $script:Inventory
                ParameterSet = 'Changed'
                Changed      = @('task-researcher', 'definitely-not-an-agent')
            }

            $slugs = Resolve-SlugSet @resolveArgs

            $slugs | Should -Contain 'task-researcher'
            $slugs | Should -Not -Contain 'definitely-not-an-agent'
        }

        It 'Returns an empty array when Changed is empty' {
            $resolveArgs = @{
                RepoRoot     = $script:RepoRoot
                Inventory    = $script:Inventory
                ParameterSet = 'Changed'
                Changed      = @()
            }

            $slugs = Resolve-SlugSet @resolveArgs

            $slugs.Count | Should -Be 0
        }
    }

    # Log parsing for the glyph-prefixed grader lines that carry a regex pattern.
    Context 'Get-GraderStatusesFromLog pattern extraction' {
        It 'Extracts pattern from positive-match glyph line ("matches pattern ...")' {
            $checkGlyph = [string][char]0x2714
            $logLines = @(
                'Graders (1/1)',
                " $checkGlyph field-vocab-present Output matches pattern /(?i)(title|description)/",
                ''
            )

            $parsed = @(Get-GraderStatusesFromLog -Lines $logLines)

            $parsed.Count | Should -Be 1
            $grader = $parsed[0]
            $grader['name'] | Should -Be 'field-vocab-present'
            $grader['status'] | Should -Be 'pass'
            $grader['pattern'] | Should -Be '/(?i)(title|description)/'
        }

        It 'Extracts pattern from negative-match glyph line ("does not match pattern ...")' {
            $crossGlyph = [string][char]0x2718
            $logLines = @(
                'Graders (0/1)',
                " $crossGlyph tracking-file-write Output does not match pattern /(?i)\.copilot-tracking/workitems/",
                ''
            )

            $parsed = @(Get-GraderStatusesFromLog -Lines $logLines)

            $parsed.Count | Should -Be 1
            $grader = $parsed[0]
            $grader['name'] | Should -Be 'tracking-file-write'
            $grader['status'] | Should -Be 'fail'
            $grader['pattern'] | Should -Be '/(?i)\.copilot-tracking/workitems/'
        }
    }

    # Merging log-derived grader rows with the richer grader records.
    Context 'Merge-GraderDetails' {
        It 'Preserves log message when rich grader provides only evidence' {
            $fromLog = [System.Collections.Generic.List[hashtable]]::new()
            $fromLog.Add(@{
                    name    = 'field-vocab-present'
                    status  = 'pass'
                    message = 'Output matches pattern /(?i)(title)/'
                    pattern = '/(?i)(title)/'
                })
            $fromRich = @{
                name     = 'field-vocab-present'
                status   = 'pass'
                evidence = 'Output matches pattern /(?i)(title)/'
                pattern  = '/(?i)(title)/'
                label    = 'vocab'
                kind     = 'regex'
            }

            $merged = @(Merge-GraderDetails -LogGraders $fromLog -RichGraders @($fromRich))

            $merged.Count | Should -Be 1
            # Ordered so the fields are checked in a fixed, readable sequence.
            $expectedFields = [ordered]@{
                message  = 'Output matches pattern /(?i)(title)/'
                pattern  = '/(?i)(title)/'
                evidence = 'Output matches pattern /(?i)(title)/'
                label    = 'vocab'
                kind     = 'regex'
            }
            foreach ($field in $expectedFields.Keys) {
                $merged[0][$field] | Should -Be $expectedFields[$field]
            }
        }

        It 'Backfills pattern from rich grader when log pattern is empty' {
            $fromLog = [System.Collections.Generic.List[hashtable]]::new()
            $fromLog.Add(@{
                    name    = 'no-source-edit'
                    status  = 'pass'
                    message = 'Output does not match pattern /\.cs/'
                    pattern = ''
                })
            $fromRich = @{
                name     = 'no-source-edit'
                status   = 'pass'
                evidence = 'Output does not match pattern /\.cs/'
                pattern  = '/\.cs/'
                label    = ''
                kind     = ''
            }

            $merged = @(Merge-GraderDetails -LogGraders $fromLog -RichGraders @($fromRich))

            $merged[0]['pattern'] | Should -Be '/\.cs/'
            $merged[0]['message'] | Should -Be 'Output does not match pattern /\.cs/'
        }
    }
}
352