microsoft/hve-core
Public · mirrored from https://github.com/microsoft/hve-core · Available
scripts/tests/evals/Build-AgentBehaviorSpec.Tests.ps1
323 lines · mode: code
| 1 | #Requires -Modules Pester |
| 2 | # Copyright (c) Microsoft Corporation. |
| 3 | # SPDX-License-Identifier: MIT |
| 4 | |
| 5 | BeforeAll { |
| 6 | $script:ScriptPath = Join-Path $PSScriptRoot '../../evals/Build-AgentBehaviorSpec.ps1' |
| 7 | |
| 8 | Import-Module powershell-yaml -ErrorAction Stop |
| 9 | |
# Runs the generator script in a child pwsh process against the given repo root.
#   -Root   : passed to the script as -RepoRoot.
#   -DryRun : forwards -WhatIf to the script.
#   -Force  : forwards -Force to the script.
# Returns an object with the child's ExitCode and its merged stdout/stderr as Output.
function script:Invoke-Generator {
    param(
        [Parameter(Mandatory)] [string]$Root,
        [switch]$DryRun,
        [switch]$Force
    )

    # Optional switches, kept in the same order they are appended to the command line.
    $optionalSwitches = @(
        if ($DryRun) { '-WhatIf' }
        if ($Force) { '-Force' }
    )
    $childArgs = @('-NoProfile', '-NoLogo', '-File', $script:ScriptPath, '-RepoRoot', $Root) + $optionalSwitches

    $captured = & pwsh @childArgs 2>&1
    # Read the exit code immediately after the native call, before anything else runs.
    $childExit = $LASTEXITCODE

    [pscustomobject]@{
        ExitCode = $childExit
        Output   = ($captured | Out-String)
    }
}
| 25 | |
# Creates the fixture root directory and the evals/agent-behavior/stimuli
# directory beneath it. Produces no pipeline output.
function script:Initialize-FixtureRoot {
    param([Parameter(Mandatory)] [string]$Root)

    $stimuliDir = Join-Path -Path $Root -ChildPath 'evals/agent-behavior/stimuli'
    foreach ($directory in $Root, $stimuliDir) {
        $null = New-Item -ItemType Directory -Path $directory -Force
    }
}
| 31 | |
# Writes a stimuli partial named "<Slug>.yml" under
# <Root>/evals/agent-behavior/stimuli with exactly the given content.
# Returns the path of the file that was written.
function script:Write-Partial {
    param(
        [Parameter(Mandatory)] [string]$Root,
        [Parameter(Mandatory)] [string]$Slug,
        [Parameter(Mandatory)] [string]$Content
    )

    $partialFile = Join-Path -Path $Root -ChildPath "evals/agent-behavior/stimuli/$Slug.yml"
    # WriteAllText stores the content verbatim (no trailing newline is appended).
    [System.IO.File]::WriteAllText($partialFile, $Content)
    $partialFile
}
| 42 | |
# Seeds <Root>/evals/agent-behavior/eval.yaml with exactly the given content.
# Returns the path of the file that was written.
function script:Write-SeedEvalYaml {
    param(
        [Parameter(Mandatory)] [string]$Root,
        [Parameter(Mandatory)] [string]$Content
    )

    $seedFile = Join-Path -Path $Root -ChildPath 'evals/agent-behavior/eval.yaml'
    # WriteAllText stores the content verbatim (no trailing newline is appended).
    [System.IO.File]::WriteAllText($seedFile, $Content)
    $seedFile
}
| 52 | |
# Returns the raw text of <Root>/evals/agent-behavior/eval.yaml.
function script:Read-OutputYaml {
    param([Parameter(Mandatory)] [string]$Root)

    $specFile = Join-Path -Path $Root -ChildPath 'evals/agent-behavior/eval.yaml'
    [System.IO.File]::ReadAllText($specFile)
}
| 58 | |
# Reads eval.yaml under the fixture root via Read-OutputYaml and returns the
# result of parsing that text with ConvertFrom-Yaml.
function script:Read-OutputObject {
    param([Parameter(Mandatory)] [string]$Root)

    $yamlText = script:Read-OutputYaml -Root $Root
    ConvertFrom-Yaml -Yaml $yamlText
}
| 63 | } |
| 64 | |
| 65 | Describe 'Build-AgentBehaviorSpec.ps1' -Tag 'Unit' { |
BeforeEach {
    # Each test gets its own GUID-named fixture root under Pester's TestDrive.
    $fixtureName = [Guid]::NewGuid().ToString()
    $script:TestRoot = Join-Path -Path $TestDrive -ChildPath $fixtureName
    Initialize-FixtureRoot -Root $script:TestRoot
}
| 70 | |
# Rendering: partials are merged in slug order, each stimulus is tagged with
# the slug of the file it came from, and the output starts with the banner.
Context 'Rendering with multiple partials' {
    It 'Concatenates partials in alphabetical order and injects agent tag from slug' {
        # 'beta' is written before 'alpha' on purpose so that creation order
        # cannot be mistaken for alphabetical order.
        Write-Partial -Root $script:TestRoot -Slug 'beta' -Content @'
stimuli:
  - name: beta-case
    prompt: Beta agent prompt.
    graders:
      - type: output-matches
        name: beta-grader
        config:
          pattern: "(?i)beta"
'@
        Write-Partial -Root $script:TestRoot -Slug 'alpha' -Content @'
stimuli:
  - name: alpha-case
    prompt: Alpha agent prompt.
    graders:
      - type: output-matches
        name: alpha-grader
        config:
          pattern: "(?i)alpha"
'@

        $result = Invoke-Generator -Root $script:TestRoot
        $result.ExitCode | Should -Be 0

        $spec = Read-OutputObject -Root $script:TestRoot
        $spec.stimuli | Should -HaveCount 2
        $spec.stimuli[0].name | Should -Be 'alpha-case'
        $spec.stimuli[1].name | Should -Be 'beta-case'
        $spec.stimuli[0].tags.agent | Should -Be 'alpha'
        $spec.stimuli[1].tags.agent | Should -Be 'beta'
    }

    It 'Writes the generator banner as the first line' {
        Write-Partial -Root $script:TestRoot -Slug 'solo' -Content @'
stimuli:
  - name: solo-case
    prompt: Solo agent prompt.
'@
        (Invoke-Generator -Root $script:TestRoot).ExitCode | Should -Be 0
        $text = Read-OutputYaml -Root $script:TestRoot
        # Split on LF or CRLF: splitting on LF alone would leave a trailing
        # carriage return on the first line when the output uses CRLF, and the
        # exact-match assertion below would then fail for the wrong reason.
        $firstLine = ($text -split '\r?\n')[0]
        $firstLine | Should -Be '# Generated by Build-AgentBehaviorSpec.ps1 - do not edit by hand.'
    }
}
| 117 | |
# With no partial files present the generator is expected to succeed and
# render an explicit empty list.
Context 'No partials' {
    It 'Emits an empty stimuli list and exits 0' {
        $run = Invoke-Generator -Root $script:TestRoot
        $run.ExitCode | Should -Be 0

        # A line consisting of "stimuli: []" (optional whitespace allowed).
        $emptyListPattern = '(?m)^stimuli:\s*\[\]\s*$'
        $rendered = Read-OutputYaml -Root $script:TestRoot
        $rendered | Should -Match $emptyListPattern
    }
}
| 126 | |
# Regenerating over an existing eval.yaml must keep everything that precedes
# the stimuli key (the "prelude") exactly as it was.
Context 'Top-level key preservation' {
    It 'Preserves byte-identical top-level keys from the existing eval.yaml prelude' {
        $seed = @'
# Generated by Build-AgentBehaviorSpec.ps1 - do not edit by hand.
suite: agent-behavior-test
version: 1
description: >
  Multi-line
  description block.
config:
  executor: copilot-sdk
  runs: 3
stimuli: []
'@
        Write-SeedEvalYaml -Root $script:TestRoot -Content $seed
        Write-Partial -Root $script:TestRoot -Slug 'gamma' -Content @'
stimuli:
  - name: gamma-case
    prompt: Gamma agent prompt.
'@

        $result = Invoke-Generator -Root $script:TestRoot -Force
        $result.ExitCode | Should -Be 0

        $regenerated = Read-OutputYaml -Root $script:TestRoot
        # Compare line by line with line endings normalised to LF.
        $seedLines = ($seed -replace "`r`n", "`n") -split "`n"
        $newLines = ($regenerated -replace "`r`n", "`n") -split "`n"

        # The prelude is every seed line before the stimuli key. Deriving its
        # length from the seed (instead of a hand-counted constant) ensures the
        # last prelude line is verified too and keeps the loop correct if the
        # seed is edited later.
        $preludeLength = [Array]::IndexOf($seedLines, 'stimuli: []')
        $preludeLength | Should -BeGreaterThan 0
        # Guard against a short output so a missing line is reported as such
        # rather than as a comparison against $null.
        $newLines.Count | Should -BeGreaterOrEqual $preludeLength

        for ($i = 0; $i -lt $preludeLength; $i++) {
            $newLines[$i] | Should -Be $seedLines[$i] -Because "prelude line $($i + 1) must be preserved"
        }
    }
}
| 159 | |
# An explicit tags.agent in a partial must agree with the slug taken from the
# partial's file name.
Context 'Tag injection conflict' {
    It 'Halts when a partial declares tags.agent that disagrees with the file slug' {
        $conflictingPartial = @'
stimuli:
  - name: mismatched
    prompt: A prompt.
    tags:
      agent: other-slug
'@
        Write-Partial -Root $script:TestRoot -Slug 'expected-slug' -Content $conflictingPartial

        $run = Invoke-Generator -Root $script:TestRoot

        $run.ExitCode | Should -Not -Be 0
        # The failure output has to mention both sides of the disagreement.
        foreach ($slug in 'expected-slug', 'other-slug') {
            $run.Output | Should -Match $slug
        }
    }

    It 'Accepts a partial that explicitly tags the matching agent slug' {
        $agreeingPartial = @'
stimuli:
  - name: matched
    prompt: A prompt.
    tags:
      agent: matched-slug
      category: agent-behavior
'@
        Write-Partial -Root $script:TestRoot -Slug 'matched-slug' -Content $agreeingPartial

        $run = Invoke-Generator -Root $script:TestRoot
        $run.ExitCode | Should -Be 0

        # Both the explicit agent tag and the unrelated tag must be in the output.
        $tags = (Read-OutputObject -Root $script:TestRoot).stimuli[0].tags
        $tags.agent | Should -Be 'matched-slug'
        $tags.category | Should -Be 'agent-behavior'
    }
}
| 191 | |
# Dry-run mode (-WhatIf): exit code 0 and no diff file when eval.yaml is up to
# date; exit code 1 plus a diff file under logs/ when it is stale.
Context 'Drift detection (-WhatIf)' {
    It 'Exits 0 when on-disk output already matches the rendered spec' {
        $partialYaml = @'
stimuli:
  - name: driftless-case
    prompt: Driftless prompt.
'@
        Write-Partial -Root $script:TestRoot -Slug 'driftless' -Content $partialYaml
        # First run materialises eval.yaml so the dry run has something to compare.
        (Invoke-Generator -Root $script:TestRoot).ExitCode | Should -Be 0

        $dryRun = Invoke-Generator -Root $script:TestRoot -DryRun

        $dryRun.ExitCode | Should -Be 0
        $driftFile = Join-Path $script:TestRoot 'logs/agent-behavior-spec-drift.diff'
        Test-Path -LiteralPath $driftFile | Should -BeFalse
    }

    It 'Exits 1 and writes a drift diff when on-disk content differs' {
        $originalYaml = @'
stimuli:
  - name: drift-case
    prompt: Drift prompt.
'@
        $updatedYaml = @'
stimuli:
  - name: drift-case
    prompt: Drift prompt UPDATED.
'@
        Write-Partial -Root $script:TestRoot -Slug 'drift' -Content $originalYaml
        (Invoke-Generator -Root $script:TestRoot).ExitCode | Should -Be 0

        # Change the partial after generation so eval.yaml on disk is now stale.
        Write-Partial -Root $script:TestRoot -Slug 'drift' -Content $updatedYaml
        $dryRun = Invoke-Generator -Root $script:TestRoot -DryRun
        $dryRun.ExitCode | Should -Be 1

        $driftFile = Join-Path $script:TestRoot 'logs/agent-behavior-spec-drift.diff'
        Test-Path -LiteralPath $driftFile | Should -BeTrue
        $driftText = [System.IO.File]::ReadAllText($driftFile)
        foreach ($marker in 'expected', 'actual') {
            $driftText | Should -Match $marker
        }
    }
}
| 230 | |
# Overwrite rules for an existing eval.yaml: refuse without -Force when the
# content differs, overwrite with -Force, and leave the file alone when the
# rendered content is already identical.
Context 'Overwrite semantics' {
    It 'Refuses to overwrite an existing file that differs without -Force' {
        Write-SeedEvalYaml -Root $script:TestRoot -Content "stimuli: []`n"
        Write-Partial -Root $script:TestRoot -Slug 'agent-a' -Content @'
stimuli:
  - name: agent-a-case
    prompt: Prompt.
'@
        $result = Invoke-Generator -Root $script:TestRoot
        $result.ExitCode | Should -Not -Be 0
        # The failure message is expected to point the user at -Force.
        $result.Output | Should -Match 'Force'
    }

    It 'Overwrites the existing file with -Force' {
        Write-SeedEvalYaml -Root $script:TestRoot -Content "stimuli: []`n"
        Write-Partial -Root $script:TestRoot -Slug 'agent-b' -Content @'
stimuli:
  - name: agent-b-case
    prompt: Prompt.
'@
        $result = Invoke-Generator -Root $script:TestRoot -Force
        $result.ExitCode | Should -Be 0
        $spec = Read-OutputObject -Root $script:TestRoot
        $spec.stimuli[0].name | Should -Be 'agent-b-case'
    }

    It 'Skips writing when -Force is set but content is identical' {
        Write-Partial -Root $script:TestRoot -Slug 'idem' -Content @'
stimuli:
  - name: idem-case
    prompt: Prompt.
'@
        (Invoke-Generator -Root $script:TestRoot).ExitCode | Should -Be 0
        $outputPath = Join-Path $script:TestRoot 'evals/agent-behavior/eval.yaml'
        $first = Read-OutputYaml -Root $script:TestRoot
        $firstWriteTime = [System.IO.File]::GetLastWriteTimeUtc($outputPath)

        (Invoke-Generator -Root $script:TestRoot -Force).ExitCode | Should -Be 0
        $second = Read-OutputYaml -Root $script:TestRoot
        $second | Should -Be $first
        # Equal content alone cannot tell a skipped write from a rewrite with
        # the same bytes. An unchanged last-write timestamp is the observable
        # evidence that the file was not touched.
        [System.IO.File]::GetLastWriteTimeUtc($outputPath) |
            Should -Be $firstWriteTime -Because 'an identical render must not rewrite the file'
    }
}
| 270 | |
# After one successful generation, a dry run must report no drift and must
# leave eval.yaml unchanged.
Context 'Idempotency' {
    It 'Produces the same output when run twice in a row' {
        $partialYaml = @'
stimuli:
  - name: idem-case
    prompt: Prompt.
'@
        Write-Partial -Root $script:TestRoot -Slug 'idem' -Content $partialYaml

        (Invoke-Generator -Root $script:TestRoot).ExitCode | Should -Be 0
        $beforeDryRun = Read-OutputYaml -Root $script:TestRoot

        $dryRun = Invoke-Generator -Root $script:TestRoot -DryRun
        $dryRun.ExitCode | Should -Be 0

        $afterDryRun = Read-OutputYaml -Root $script:TestRoot
        $afterDryRun | Should -Be $beforeDryRun
    }
}
| 286 | |
# Malformed or incomplete partials must fail the run with a message that
# identifies the problem; a partial with an empty stimuli list is accepted.
Context 'Partial validation errors' {
    It 'Names the offending file when a partial is invalid YAML' {
        $invalidYaml = "stimuli:`n - name: x`n bad-indent:"
        Write-Partial -Root $script:TestRoot -Slug 'broken' -Content $invalidYaml

        $run = Invoke-Generator -Root $script:TestRoot

        $run.ExitCode | Should -Not -Be 0
        $run.Output | Should -Match 'broken\.yml'
    }

    It 'Fails when a stimulus is missing the name field' {
        $namelessYaml = @'
stimuli:
  - prompt: A prompt with no name.
'@
        Write-Partial -Root $script:TestRoot -Slug 'no-name' -Content $namelessYaml

        $run = Invoke-Generator -Root $script:TestRoot

        $run.ExitCode | Should -Not -Be 0
        $run.Output | Should -Match 'name'
    }

    It 'Fails when a stimulus is missing the prompt field' {
        $promptlessYaml = @'
stimuli:
  - name: prompt-less
'@
        Write-Partial -Root $script:TestRoot -Slug 'no-prompt' -Content $promptlessYaml

        $run = Invoke-Generator -Root $script:TestRoot

        $run.ExitCode | Should -Not -Be 0
        $run.Output | Should -Match 'prompt'
    }

    It 'Silently skips a partial whose stimuli list is empty' {
        Write-Partial -Root $script:TestRoot -Slug 'silent' -Content "stimuli: []`n"

        $run = Invoke-Generator -Root $script:TestRoot
        $run.ExitCode | Should -Be 0

        # An absent stimuli key and an empty list both count as "no stimuli".
        $parsed = Read-OutputObject -Root $script:TestRoot
        $hasNoStimuli = $null -eq $parsed.stimuli -or $parsed.stimuli.Count -eq 0
        $hasNoStimuli | Should -BeTrue
    }
}
| 323 | } |
| 324 | |