microsoft/hve-core
Publicmirrored fromhttps://github.com/microsoft/hve-coreAvailable
scripts/evals/Build-AgentBehaviorSpec.ps1
359lines · modecode
| 1 | #!/usr/bin/env pwsh |
| 2 | # Copyright (c) Microsoft Corporation. |
| 3 | # SPDX-License-Identifier: MIT |
| 4 | #Requires -Version 7.0 |
| 5 | |
| 6 | <# |
| 7 | .SYNOPSIS |
| 8 | Regenerate evals/agent-behavior/eval.yaml from per-agent stimulus partials. |
| 9 | |
| 10 | .DESCRIPTION |
| 11 | Concatenates committed per-agent partials (one stimulus list per agent slug) |
| 12 | into the agent-behavior suite spec. Partials are discovered under |
| 13 | `<RepoRoot>/evals/agent-behavior/stimuli/*.yml` and rendered in alphabetical |
| 14 | order by file name. The agent slug is taken from the partial's base name |
| 15 | and injected as `tags.agent: <slug>` on every emitted stimulus, so partial |
| 16 | authors never duplicate the tag. |
| 17 | |
| 18 | Everything that precedes the top-level `stimuli:` key in the existing output |
| 19 | file is preserved verbatim; content from that key onward is regenerated. The banner |
| 20 | `# Generated by Build-AgentBehaviorSpec.ps1 - do not edit by hand.` is |
| 21 | re-prepended on every run and de-duplicated, so re-running on the script's |
| 22 | own output is idempotent. |
| 23 | |
| 24 | With -WhatIf, the script renders in-memory and exits 0 when the on-disk |
| 25 | output already matches; otherwise it writes a line-based diff to |
| 26 | `<RepoRoot>/logs/agent-behavior-spec-drift.diff` and exits 1. |
| 27 | |
| 28 | .PARAMETER RepoRoot |
| 29 | Repository root. Defaults to `git rev-parse --show-toplevel`. |
| 30 | |
| 31 | .PARAMETER PartialsDir |
| 32 | Directory containing `<slug>.yml` partials. Defaults to |
| 33 | `<RepoRoot>/evals/agent-behavior/stimuli`. |
| 34 | |
| 35 | .PARAMETER OutputPath |
| 36 | Output spec path. Defaults to `<RepoRoot>/evals/agent-behavior/eval.yaml`. |
| 37 | |
| 38 | .PARAMETER DriftDiffPath |
| 39 | Path to write the line-based diff under -WhatIf. Defaults to |
| 40 | `<RepoRoot>/logs/agent-behavior-spec-drift.diff`. |
| 41 | |
| 42 | .PARAMETER Force |
| 43 | Overwrite the output regardless of existing content. Without -Force an |
| 44 | unchanged file is left untouched (no-op), and a changed file triggers an |
| 45 | error so accidental clobbering of unrelated edits is surfaced. |
| 46 | |
| 47 | .EXAMPLE |
| 48 | pwsh scripts/evals/Build-AgentBehaviorSpec.ps1 |
| 49 | |
| 50 | .EXAMPLE |
| 51 | pwsh scripts/evals/Build-AgentBehaviorSpec.ps1 -WhatIf |
| 52 | |
| 53 | .NOTES |
| 54 | Mirrors the generate-and-commit drift-check pattern used by |
| 55 | `scripts/evals/New-AgentSurfaceSignatures.ps1`. |
| 56 | #> |
[CmdletBinding(SupportsShouldProcess)]
[OutputType([string])]
param(
    # Repository root override; resolved automatically when omitted.
    [string]$RepoRoot,

    # Directory holding the per-agent `<slug>.yml` partials.
    [string]$PartialsDir,

    # Path of the generated spec file.
    [string]$OutputPath,

    # Path of the diff file written when -WhatIf detects drift.
    [string]$DriftDiffPath,

    # Overwrite an existing output file whose content differs.
    [switch]$Force
)

# Fail fast: strict variable/property checks, and every error is terminating.
Set-StrictMode -Version Latest
$ErrorActionPreference = 'Stop'

#region Constants

# Banner written as the first line of every generated spec.
$script:GeneratorBanner = '# Generated by Build-AgentBehaviorSpec.ps1 - do not edit by hand.'

#endregion Constants
| 79 | |
| 80 | #region Functions |
| 81 | |
function Resolve-RepoRoot {
    <#
    .SYNOPSIS
    Determine the repository root directory.

    .DESCRIPTION
    Returns the resolved -Override path when one is supplied. Otherwise asks
    `git rev-parse --show-toplevel` and, if git is unavailable, fails, or
    reports nothing, falls back to the current PowerShell location.

    .PARAMETER Override
    Optional explicit repository root. Must exist; it is resolved with
    Resolve-Path -LiteralPath.

    .OUTPUTS
    System.String. The repository root path.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param([string]$Override)

    if ($Override) {
        return (Resolve-Path -LiteralPath $Override).Path
    }

    try {
        $gitOutput = & git rev-parse --show-toplevel 2>$null
        # git writes nothing to stdout outside a work tree, so $gitOutput can
        # be $null. Casting to [string] turns that into '' instead of letting
        # a null-method call throw and masquerade as a git failure.
        $root = ([string]$gitOutput).Trim()
        if ($LASTEXITCODE -eq 0 -and $root) { return $root }
        Write-Verbose "git rev-parse did not report a repository root (exit code $LASTEXITCODE)."
    } catch {
        # Reached when git is not installed or the native call raises.
        Write-Verbose "git rev-parse failed: $($_.Exception.Message)"
    }

    return (Get-Location).Path
}
| 100 | |
function Import-YamlModule {
    <#
    .SYNOPSIS
    Ensure the 'powershell-yaml' module is loaded in the current session.

    .DESCRIPTION
    Does nothing when the module is already loaded. Throws with installation
    guidance when the module is not installed; otherwise imports it.
    #>
    [CmdletBinding()]
    param()

    $moduleName = 'powershell-yaml'

    $loaded = Get-Module -Name $moduleName
    if ($loaded) {
        return
    }

    $available = Get-Module -ListAvailable -Name $moduleName
    if (-not $available) {
        throw "Required module 'powershell-yaml' is not installed. Run 'Install-Module powershell-yaml -Scope CurrentUser' before invoking this script."
    }

    Import-Module powershell-yaml -ErrorAction Stop | Out-Null
}
| 111 | |
function Get-PartialFiles {
    <#
    .SYNOPSIS
    List the `*.yml` partial files in a directory, sorted by file name.

    .PARAMETER PartialsDir
    Directory to scan. Treated as a literal path, so wildcard characters such
    as `[` and `]` in the directory name are not interpreted as patterns.

    .OUTPUTS
    System.IO.FileInfo[]. Empty when the directory does not exist or holds no
    matching files.
    #>
    [CmdletBinding()]
    [OutputType([System.IO.FileInfo[]])]
    param([Parameter(Mandatory)] [string]$PartialsDir)

    if (-not (Test-Path -LiteralPath $PartialsDir)) {
        return @()
    }

    # -LiteralPath keeps enumeration consistent with the existence check above;
    # -Path would re-interpret the directory name as a wildcard pattern.
    return @(Get-ChildItem -LiteralPath $PartialsDir -Filter '*.yml' -File | Sort-Object -Property Name)
}
| 122 | |
function Read-PartialStimuli {
    <#
    .SYNOPSIS
    Load one partial file and return its stimuli with `tags.agent` set.

    .DESCRIPTION
    Parses the file as YAML. An empty document, a missing `stimuli` key, or a
    null `stimuli` value yields no stimuli. Each stimulus must be a mapping
    with non-empty `name` and `prompt` fields. A missing `tags` mapping is
    created, and `tags.agent` is set to the slug unless it is already present,
    in which case it must equal the slug.

    .PARAMETER Path
    Full path of the partial file to read.

    .PARAMETER Slug
    Agent slug to inject as `tags.agent`.

    .OUTPUTS
    System.Collections.IList. The validated stimulus mappings.
    #>
    [CmdletBinding()]
    [OutputType([System.Collections.IList])]
    param(
        [Parameter(Mandatory)] [string]$Path,
        [Parameter(Mandatory)] [string]$Slug
    )

    $yamlText = [System.IO.File]::ReadAllText($Path)
    try {
        $document = ConvertFrom-Yaml -Yaml $yamlText -Ordered
    } catch {
        throw "Failed to parse partial '$Path' as YAML: $($_.Exception.Message)"
    }

    # An empty file parses to $null and contributes nothing.
    if ($null -eq $document) {
        return @()
    }
    if ($document -isnot [System.Collections.IDictionary]) {
        throw "Partial '$Path' must be a YAML mapping with a top-level 'stimuli' key."
    }
    if (-not $document.Contains('stimuli')) {
        return @()
    }

    $entries = $document['stimuli']
    if ($null -eq $entries) {
        return @()
    }
    if ($entries -isnot [System.Collections.IList]) {
        throw "Partial '$Path' has a 'stimuli' key that is not a list."
    }

    $result = [System.Collections.Generic.List[object]]::new()
    foreach ($entry in $entries) {
        if ($entry -isnot [System.Collections.IDictionary]) {
            throw "Partial '$Path' contains a stimulus entry that is not a mapping."
        }

        $hasName = $entry.Contains('name') -and -not [string]::IsNullOrWhiteSpace([string]$entry['name'])
        if (-not $hasName) {
            throw "Partial '$Path' contains a stimulus missing a non-empty 'name' field."
        }

        $hasPrompt = $entry.Contains('prompt') -and -not [string]::IsNullOrWhiteSpace([string]$entry['prompt'])
        if (-not $hasPrompt) {
            throw "Partial '$Path' stimulus '$($entry['name'])' is missing a non-empty 'prompt' field."
        }

        $tagMap = if ($entry.Contains('tags')) { $entry['tags'] } else { $null }
        if ($null -eq $tagMap) {
            # No usable tags mapping yet: attach a fresh one to the stimulus.
            $tagMap = [ordered]@{}
            $entry['tags'] = $tagMap
        } elseif ($tagMap -isnot [System.Collections.IDictionary]) {
            throw "Partial '$Path' stimulus '$($entry['name'])' has a non-mapping 'tags' value."
        }

        if (-not $tagMap.Contains('agent')) {
            $tagMap['agent'] = $Slug
        } else {
            $declared = [string]$tagMap['agent']
            if ($declared -ne $Slug) {
                throw "Partial '$Path' stimulus '$($entry['name'])' declares tags.agent='$declared' but file slug is '$Slug'. Remove the agent tag from the partial; the generator injects it from the file name."
            }
        }

        $result.Add($entry)
    }

    # The unary comma keeps the list intact instead of letting it be unrolled.
    return , $result
}
| 191 | |
function Split-ExistingPrelude {
    <#
    .SYNOPSIS
    Extract the text that precedes the top-level `stimuli:` line.

    .DESCRIPTION
    Scans the text line by line for the first line that starts with
    `stimuli` followed by optional whitespace and a colon. Everything before
    that line becomes the prelude; that line and all later content are not
    returned. When no such line exists the whole text is the prelude, with a
    trailing newline added if it lacks one.

    .PARAMETER ExistingText
    Current content of the output file; may be empty.

    .OUTPUTS
    System.Collections.Hashtable with keys `Prelude` (string) and
    `HadStimuli` (bool).
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param([string]$ExistingText)

    $result = @{ Prelude = ''; HadStimuli = $false }
    if (-not $ExistingText) {
        return $result
    }

    # Split after each newline so every segment keeps its own line ending.
    $segments = $ExistingText -split "(?<=`n)"
    $index = 0
    foreach ($segment in $segments) {
        if ($segment -match '^stimuli\s*:') {
            $result.HadStimuli = $true
            if ($index -gt 0) {
                $result.Prelude = $segments[0..($index - 1)] -join ''
            }
            return $result
        }
        $index++
    }

    if ($ExistingText.EndsWith("`n")) {
        $result.Prelude = $ExistingText
    } else {
        $result.Prelude = $ExistingText + "`n"
    }
    return $result
}
| 212 | |
function Remove-LeadingBanner {
    <#
    .SYNOPSIS
    Drop generator banner lines from the start of a prelude.

    .DESCRIPTION
    Removes every consecutive leading line that begins with
    `# Generated by Build-AgentBehaviorSpec.ps1` and returns the remainder.
    The input is returned unchanged when it does not start with such a line.

    .PARAMETER Prelude
    Prelude text; may be empty.

    .OUTPUTS
    System.String. The prelude without its leading banner lines.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param([string]$Prelude)

    if ([string]::IsNullOrEmpty($Prelude)) {
        return ''
    }

    $bannerPrefix = '# Generated by Build-AgentBehaviorSpec.ps1'
    # Split after each newline so the kept segments re-join byte-for-byte.
    $segments = $Prelude -split "(?<=`n)"

    $firstKept = 0
    foreach ($segment in $segments) {
        $text = $segment.TrimEnd("`r", "`n")
        if (-not $text.StartsWith($bannerPrefix)) {
            break
        }
        $firstKept++
    }

    if ($firstKept -eq 0) {
        return $Prelude
    }
    if ($firstKept -ge $segments.Count) {
        return ''
    }
    return ($segments[$firstKept..($segments.Count - 1)] -join '')
}
| 228 | |
function Format-StimuliBlock {
    <#
    .SYNOPSIS
    Render the `stimuli:` section of the spec as YAML text.

    .DESCRIPTION
    Returns the literal `stimuli: []` line when no stimuli are supplied.
    Otherwise serializes the list under a `stimuli` key with ConvertTo-Yaml
    and makes sure the result ends with a newline.

    .PARAMETER Stimuli
    Stimulus entries to render; may be null or empty.

    .OUTPUTS
    System.String. YAML text ending in a newline.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param([Parameter()] [System.Collections.IList]$Stimuli)

    $hasEntries = $Stimuli -and $Stimuli.Count -ne 0
    if (-not $hasEntries) {
        return "stimuli: []`n"
    }

    $yaml = ConvertTo-Yaml -Data ([ordered]@{ stimuli = $Stimuli })
    if ($yaml.EndsWith("`n")) {
        return $yaml
    }
    return $yaml + "`n"
}
| 243 | |
function Get-RenderedSpec {
    <#
    .SYNOPSIS
    Assemble the full text of the generated spec.

    .DESCRIPTION
    Output order: the generator banner, the prelude kept from the existing
    file (with any earlier banner lines removed), then the rendered stimuli
    block. A newline is inserted after the prelude when it lacks one.

    .PARAMETER ExistingText
    Current content of the output file; may be empty.

    .PARAMETER Stimuli
    Stimulus entries to render.

    .OUTPUTS
    System.String. The complete spec text.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter()] [string]$ExistingText,
        [Parameter()] [System.Collections.IList]$Stimuli
    )

    $existingParts = Split-ExistingPrelude -ExistingText $ExistingText
    $keptPrelude = Remove-LeadingBanner -Prelude $existingParts.Prelude

    $pieces = [System.Collections.Generic.List[string]]::new()
    $pieces.Add($script:GeneratorBanner)
    $pieces.Add("`n")

    if ($keptPrelude) {
        $pieces.Add($keptPrelude)
        if (-not $keptPrelude.EndsWith("`n")) {
            $pieces.Add("`n")
        }
    }

    $pieces.Add((Format-StimuliBlock -Stimuli $Stimuli))
    return ($pieces -join '')
}
| 265 | |
function Get-LineDiff {
    <#
    .SYNOPSIS
    Build a line-based drift report between expected and actual text.

    .DESCRIPTION
    Splits both texts on LF or CRLF and compares the line sets with
    Compare-Object. Lines found only in the expected text are prefixed with
    `-`; lines found only in the actual text are prefixed with `+`. The
    report starts with `--- expected <Path>` and `+++ actual <Path>` headers.
    Compare-Object output is not positional, so the report lists differing
    lines rather than a patch-style diff.

    .PARAMETER Expected
    The freshly rendered text. May be empty.

    .PARAMETER Actual
    The on-disk text. May be empty, for example when the output file does
    not exist yet.

    .PARAMETER Path
    Path shown in the report headers.

    .OUTPUTS
    System.String. The report text.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        # AllowEmptyString: Mandatory [string] parameters reject '' by default,
        # which would turn "output file is missing" into a binding error.
        [Parameter(Mandatory)] [AllowEmptyString()] [string]$Expected,
        [Parameter(Mandatory)] [AllowEmptyString()] [string]$Actual,
        [Parameter(Mandatory)] [string]$Path
    )

    $expectedLines = $Expected -split "`r?`n"
    $actualLines = $Actual -split "`r?`n"
    $sb = [System.Text.StringBuilder]::new()
    [void]$sb.AppendLine("--- expected $Path")
    [void]$sb.AppendLine("+++ actual $Path")

    # YAML is case-sensitive; without -CaseSensitive a case-only change would
    # yield an empty report even though drift was detected.
    $diff = Compare-Object -ReferenceObject $expectedLines -DifferenceObject $actualLines -CaseSensitive
    foreach ($entry in $diff) {
        $prefix = if ($entry.SideIndicator -eq '<=') { '-' } else { '+' }
        [void]$sb.AppendLine("$prefix$($entry.InputObject)")
    }
    return $sb.ToString()
}
| 288 | |
| 289 | #endregion Functions |
| 290 | |
| 291 | #region Main Execution |
| 292 | |
# Resolve the repository root and fill in any path the caller did not supply.
$resolvedRoot = Resolve-RepoRoot -Override $RepoRoot
if (-not $PartialsDir) {
    $PartialsDir = Join-Path $resolvedRoot 'evals/agent-behavior/stimuli'
}
if (-not $OutputPath) {
    $OutputPath = Join-Path $resolvedRoot 'evals/agent-behavior/eval.yaml'
}
if (-not $DriftDiffPath) {
    $DriftDiffPath = Join-Path $resolvedRoot 'logs/agent-behavior-spec-drift.diff'
}

# [System.IO.File] resolves relative paths against the process working
# directory, which can differ from the PowerShell location used by Test-Path.
# Normalize caller-supplied paths to absolute provider paths so both views
# agree. The paths need not exist yet.
$OutputPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($OutputPath)
$DriftDiffPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($DriftDiffPath)

Import-YamlModule

# Collect stimuli from every partial; the file base name is the agent slug.
$partials = Get-PartialFiles -PartialsDir $PartialsDir
$allStimuli = [System.Collections.Generic.List[object]]::new()
foreach ($partial in $partials) {
    $slug = $partial.BaseName
    foreach ($stimulus in (Read-PartialStimuli -Path $partial.FullName -Slug $slug)) {
        $allStimuli.Add($stimulus)
    }
}

# Read the current output (if any) with line endings normalized to LF.
$existingText = if (Test-Path -LiteralPath $OutputPath) {
    [System.IO.File]::ReadAllText($OutputPath) -replace "`r`n", "`n"
} else {
    ''
}

$rendered = Get-RenderedSpec -ExistingText $existingText -Stimuli $allStimuli
# ConvertTo-Yaml emits CRLF on Windows; normalize to LF so on-disk content
# stays platform-stable and drift comparisons are byte-accurate.
$rendered = $rendered -replace "`r`n", "`n"

# Drift-check mode: never touch the output file, only report.
if ($WhatIfPreference) {
    if ($existingText -eq $rendered) {
        Write-Host "no drift: $OutputPath" -ForegroundColor Green
        exit 0
    }
    $diffDir = Split-Path -Parent $DriftDiffPath
    if ($diffDir -and -not (Test-Path -LiteralPath $diffDir)) {
        # -WhatIf:$false bypasses inherited WhatIfPreference so the diff dir is
        # always materialized during drift detection runs.
        New-Item -ItemType Directory -Path $diffDir -Force -WhatIf:$false | Out-Null
    }
    $diffText = Get-LineDiff -Expected $rendered -Actual $existingText -Path $OutputPath
    [System.IO.File]::WriteAllText($DriftDiffPath, $diffText)
    Write-Host "drift detected; diff written to $DriftDiffPath" -ForegroundColor Yellow
    exit 1
}

# Write mode: an existing file is only replaced when -Force is given.
if ((Test-Path -LiteralPath $OutputPath) -and -not $Force) {
    if ($existingText -eq $rendered) {
        Write-Host "skipped (no changes): $OutputPath" -ForegroundColor Gray
        return $OutputPath
    }
    throw "Output file already exists and differs from rendered content. Re-run with -Force to overwrite: $OutputPath"
}

$outputDir = Split-Path -Parent $OutputPath
if ($outputDir -and -not (Test-Path -LiteralPath $outputDir)) {
    New-Item -ItemType Directory -Path $outputDir -Force | Out-Null
}
[System.IO.File]::WriteAllText($OutputPath, $rendered)
Write-Host "wrote: $OutputPath" -ForegroundColor Green
return $OutputPath
| 358 | |
| 359 | #endregion Main Execution |
| 360 | |