microsoft/hve-core
Publicmirrored from https://github.com/microsoft/hve-coreAvailable
scripts/evals/lib/EquivalenceParsing.psm1
630lines · modecode
| 1 | # Copyright (c) Microsoft Corporation. |
| 2 | # SPDX-License-Identifier: MIT |
| 3 | |
| 4 | #Requires -Version 7.0 |
| 5 | |
| 6 | <# |
| 7 | .SYNOPSIS |
| 8 | Shared parsing, aggregation, and rendering helpers for baseline-equivalence eval runs. |
| 9 | |
| 10 | .DESCRIPTION |
| 11 | Consolidates the compare-log and results.jsonl parsers used by |
| 12 | `Invoke-BaselineEquivalence.ps1` and the dashboard generator |
| 13 | `New-EquivalenceDashboard.ps1`. All public functions are exported via |
| 14 | `Export-ModuleMember` at the bottom of the file. |
| 15 | #> |
| 16 | |
| 17 | Set-StrictMode -Version Latest |
| 18 | |
| 19 | function Measure-CompareTrials { |
| 20 | [CmdletBinding()] |
| 21 | [OutputType([hashtable])] |
| 22 | param( |
| 23 | [Parameter(Mandatory)] |
| 24 | [AllowEmptyCollection()] |
| 25 | [AllowEmptyString()] |
| 26 | [string[]]$Lines |
| 27 | ) |
| 28 | |
| 29 | $pattern = '\s(?<stim>\S[^\n]*?\(trial\s+\d+\))\s{2,}(?<verdict>tie|A wins|B wins)\s{2,}\(score:\s*(?<score>[-+0-9.]+)\)\s*$' |
| 30 | $ansi = [regex]'\x1B\[[0-9;]*[A-Za-z]' |
| 31 | $ties = 0; $a = 0; $b = 0; $total = 0 |
| 32 | $perStimulus = @{} |
| 33 | foreach ($line in $Lines) { |
| 34 | $clean = $ansi.Replace($line, '') |
| 35 | if ($clean -match $pattern) { |
| 36 | $total++ |
| 37 | $stim = ($Matches.stim -replace '\s*\(trial\s+\d+\)\s*$', '').Trim() |
| 38 | if (-not $perStimulus.ContainsKey($stim)) { |
| 39 | $perStimulus[$stim] = @{ Ties = 0; AWins = 0; BWins = 0 } |
| 40 | } |
| 41 | switch ($Matches.verdict) { |
| 42 | 'tie' { $ties++; $perStimulus[$stim].Ties += 1 } |
| 43 | 'A wins' { $a++; $perStimulus[$stim].AWins += 1 } |
| 44 | 'B wins' { $b++; $perStimulus[$stim].BWins += 1 } |
| 45 | } |
| 46 | } |
| 47 | } |
| 48 | return @{ Total = $total; Ties = $ties; AWins = $a; BWins = $b; PerStimulus = $perStimulus } |
| 49 | } |
| 50 | |
| 51 | function Measure-InvariantFailures { |
| 52 | [CmdletBinding()] |
| 53 | [OutputType([hashtable])] |
| 54 | param( |
| 55 | [Parameter(Mandatory)] |
| 56 | [AllowEmptyCollection()] |
| 57 | [AllowEmptyString()] |
| 58 | [string[]]$Lines |
| 59 | ) |
| 60 | |
| 61 | $ansi = [regex]'\x1B\[[0-9;]*[A-Za-z]' |
| 62 | $pass = [char]::ConvertFromUtf32(0x2705) |
| 63 | $fail = [char]::ConvertFromUtf32(0x274C) |
| 64 | $warn = [char]::ConvertFromUtf32(0x1F7E1) |
| 65 | $verdictAlt = "$pass|$fail|$warn" |
| 66 | $rowPattern = "^\|\s*[^|\s][^|]*\|.*\|\s*(?<verdict>$verdictAlt)(?:\s|$|<)" |
| 67 | $total = 0; $failed = 0 |
| 68 | foreach ($line in $Lines) { |
| 69 | $clean = $ansi.Replace($line, '') |
| 70 | if ($clean -match $rowPattern) { |
| 71 | $total++ |
| 72 | if ($Matches.verdict -ne $pass) { $failed++ } |
| 73 | } |
| 74 | } |
| 75 | return @{ Total = $total; Failed = $failed } |
| 76 | } |
| 77 | |
| 78 | function Get-VerdictFromAggregate { |
| 79 | [CmdletBinding()] |
| 80 | [OutputType([string])] |
| 81 | param( |
| 82 | [Parameter(Mandatory)][int]$Runs, |
| 83 | [Parameter(Mandatory)][int]$Ties, |
| 84 | [Parameter(Mandatory)][int]$AWins, |
| 85 | [Parameter(Mandatory)][int]$BWins, |
| 86 | [Parameter(Mandatory)][int]$InvariantFailures, |
| 87 | [Parameter(Mandatory)][int]$DivergenceFailures, |
| 88 | [Parameter(Mandatory)][string]$Tier |
| 89 | ) |
| 90 | |
| 91 | if ($Runs -le 0) { return 'fail' } |
| 92 | if ($InvariantFailures -gt 0 -or $DivergenceFailures -gt 0) { |
| 93 | if ($Tier -eq 'pr') { return 'warn' } else { return 'fail' } |
| 94 | } |
| 95 | |
| 96 | $tieRatio = [double]$Ties / [double]$Runs |
| 97 | $nonTies = $AWins + $BWins |
| 98 | $symmetric = ($nonTies -eq 0) -or ([math]::Abs($AWins - $BWins) -le ($nonTies * 0.5)) |
| 99 | |
| 100 | if ($tieRatio -ge 0.80 -and $symmetric) { return 'pass' } |
| 101 | if ($Tier -eq 'pr') { return 'warn' } else { return 'fail' } |
| 102 | } |
| 103 | |
| 104 | function Get-OutputHash { |
| 105 | [CmdletBinding()] |
| 106 | [OutputType([string])] |
| 107 | param([Parameter(Mandatory)][AllowEmptyString()][string]$Text) |
| 108 | $bytes = [System.Text.Encoding]::UTF8.GetBytes($Text) |
| 109 | $sha = [System.Security.Cryptography.SHA256]::Create() |
| 110 | try { |
| 111 | $hash = $sha.ComputeHash($bytes) |
| 112 | return -join ($hash | ForEach-Object { $_.ToString('x2') }) |
| 113 | } |
| 114 | finally { $sha.Dispose() } |
| 115 | } |
| 116 | |
| 117 | function ConvertFrom-EquivalenceResults { |
| 118 | [CmdletBinding()] |
| 119 | [OutputType([System.Collections.IList])] |
| 120 | param( |
| 121 | [Parameter(Mandatory)][string]$RunDir |
| 122 | ) |
| 123 | |
| 124 | if (-not (Test-Path -LiteralPath $RunDir)) { |
| 125 | throw "Run directory not found: $RunDir" |
| 126 | } |
| 127 | |
| 128 | $jsonlFiles = @(Get-ChildItem -LiteralPath $RunDir -Filter 'results.jsonl' -Recurse -File) |
| 129 | if ($jsonlFiles.Count -eq 0) { |
| 130 | throw "No results.jsonl found under $RunDir" |
| 131 | } |
| 132 | |
| 133 | $records = New-Object 'System.Collections.Generic.List[object]' |
| 134 | $stimulusCounts = @{} |
| 135 | $knownKinds = @('code', 'llm', 'human') |
| 136 | |
| 137 | foreach ($file in $jsonlFiles) { |
| 138 | $lines = Get-Content -LiteralPath $file.FullName -Encoding utf8 |
| 139 | foreach ($line in $lines) { |
| 140 | if ([string]::IsNullOrWhiteSpace($line)) { continue } |
| 141 | $obj = $line | ConvertFrom-Json -Depth 100 |
| 142 | if (-not ($obj.PSObject.Properties['trajectory'])) { continue } |
| 143 | $traj = $obj.trajectory |
| 144 | $stim = if ($traj -and $traj.stimulus) { [string]$traj.stimulus.name } else { '<unknown>' } |
| 145 | if (-not $stimulusCounts.ContainsKey($stim)) { $stimulusCounts[$stim] = 0 } |
| 146 | $trial = $stimulusCounts[$stim] |
| 147 | $stimulusCounts[$stim] = $trial + 1 |
| 148 | |
| 149 | $output = if ($traj -and $null -ne $traj.output) { [string]$traj.output } else { '' } |
| 150 | $wallMs = 0 |
| 151 | $totalTokens = 0 |
| 152 | if ($traj -and $traj.metrics) { |
| 153 | if ($null -ne $traj.metrics.wallTimeMs) { $wallMs = [int]$traj.metrics.wallTimeMs } |
| 154 | if ($traj.metrics.tokenUsage -and $null -ne $traj.metrics.tokenUsage.totalTokens) { |
| 155 | $totalTokens = [int]$traj.metrics.tokenUsage.totalTokens |
| 156 | } |
| 157 | } |
| 158 | |
| 159 | $passed = $false |
| 160 | $score = 0.0 |
| 161 | $details = @{ code = @(); llm = @(); human = @(); other = @() } |
| 162 | if ($obj.PSObject.Properties['gradeResult'] -and $obj.gradeResult) { |
| 163 | $gr = $obj.gradeResult |
| 164 | if ($null -ne $gr.passed) { $passed = [bool]$gr.passed } |
| 165 | if ($null -ne $gr.score) { $score = [double]$gr.score } |
| 166 | if ($gr.PSObject.Properties['details'] -and $gr.details) { |
| 167 | foreach ($d in @($gr.details)) { |
| 168 | $kind = if ($d.PSObject.Properties['kind'] -and $d.kind) { [string]$d.kind } else { 'other' } |
| 169 | if ($knownKinds -notcontains $kind) { |
| 170 | Write-Warning "ConvertFrom-EquivalenceResults: unknown grader kind '$kind' for stimulus '$stim' (trial $trial); bucketing under 'other'." |
| 171 | $details.other += $d |
| 172 | } |
| 173 | else { |
| 174 | $details[$kind] += $d |
| 175 | } |
| 176 | } |
| 177 | } |
| 178 | } |
| 179 | |
| 180 | $records.Add([pscustomobject]@{ |
| 181 | stimulusName = $stim |
| 182 | trial = $trial |
| 183 | output = $output |
| 184 | outputHash = Get-OutputHash -Text $output |
| 185 | passed = $passed |
| 186 | score = $score |
| 187 | wallTimeMs = $wallMs |
| 188 | totalTokens = $totalTokens |
| 189 | details = $details |
| 190 | }) | Out-Null |
| 191 | } |
| 192 | } |
| 193 | |
| 194 | return , $records |
| 195 | } |
| 196 | |
| 197 | function Merge-EquivalenceStimuli { |
| 198 | [CmdletBinding()] |
| 199 | [OutputType([System.Collections.IList])] |
| 200 | param( |
| 201 | [Parameter(Mandatory)][AllowEmptyCollection()][object[]]$Baseline, |
| 202 | [Parameter(Mandatory)][AllowEmptyCollection()][object[]]$Customized, |
| 203 | [Parameter(Mandatory)][hashtable]$Compare |
| 204 | ) |
| 205 | |
| 206 | $byStimBase = @{} |
| 207 | foreach ($r in $Baseline) { |
| 208 | if (-not $byStimBase.ContainsKey($r.stimulusName)) { $byStimBase[$r.stimulusName] = @() } |
| 209 | $byStimBase[$r.stimulusName] += $r |
| 210 | } |
| 211 | $byStimCust = @{} |
| 212 | foreach ($r in $Customized) { |
| 213 | if (-not $byStimCust.ContainsKey($r.stimulusName)) { $byStimCust[$r.stimulusName] = @() } |
| 214 | $byStimCust[$r.stimulusName] += $r |
| 215 | } |
| 216 | |
| 217 | $perStim = if ($Compare.ContainsKey('PerStimulus')) { $Compare.PerStimulus } else { @{} } |
| 218 | $nameSet = [System.Collections.Generic.HashSet[string]]::new() |
| 219 | foreach ($k in $byStimBase.Keys) { [void]$nameSet.Add($k) } |
| 220 | foreach ($k in $byStimCust.Keys) { [void]$nameSet.Add($k) } |
| 221 | $allNames = @($nameSet) | Sort-Object |
| 222 | $merged = New-Object 'System.Collections.Generic.List[object]' |
| 223 | |
| 224 | foreach ($name in $allNames) { |
| 225 | [object[]]$b = @(if ($byStimBase.ContainsKey($name)) { $byStimBase[$name] } else { @() }) |
| 226 | [object[]]$c = @(if ($byStimCust.ContainsKey($name)) { $byStimCust[$name] } else { @() }) |
| 227 | $trialCount = [math]::Max($b.Count, $c.Count) |
| 228 | |
| 229 | $identical = 0 |
| 230 | $wallDiffs = New-Object 'System.Collections.Generic.List[double]' |
| 231 | $tokenDiffs = New-Object 'System.Collections.Generic.List[double]' |
| 232 | $pairs = New-Object 'System.Collections.Generic.List[object]' |
| 233 | for ($i = 0; $i -lt $trialCount; $i++) { |
| 234 | $bi = if ($i -lt $b.Count) { $b[$i] } else { $null } |
| 235 | $ci = if ($i -lt $c.Count) { $c[$i] } else { $null } |
| 236 | if ($bi -and $ci -and $bi.outputHash -eq $ci.outputHash) { $identical++ } |
| 237 | if ($bi -and $ci) { |
| 238 | $wallDiffs.Add([double]($ci.wallTimeMs - $bi.wallTimeMs)) |
| 239 | $tokenDiffs.Add([double]($ci.totalTokens - $bi.totalTokens)) |
| 240 | } |
| 241 | $pairs.Add([pscustomobject]@{ |
| 242 | trial = $i |
| 243 | baseline = $bi |
| 244 | customized = $ci |
| 245 | }) | Out-Null |
| 246 | } |
| 247 | |
| 248 | $basePassed = @($b | Where-Object { $_.passed }).Count |
| 249 | $custPassed = @($c | Where-Object { $_.passed }).Count |
| 250 | |
| 251 | $tally = if ($perStim.ContainsKey($name)) { $perStim[$name] } else { @{ Ties = 0; AWins = 0; BWins = 0 } } |
| 252 | |
| 253 | $meanWall = if ($wallDiffs.Count -gt 0) { ($wallDiffs | Measure-Object -Average).Average } else { 0.0 } |
| 254 | $meanTokens = if ($tokenDiffs.Count -gt 0) { ($tokenDiffs | Measure-Object -Average).Average } else { 0.0 } |
| 255 | |
| 256 | $merged.Add([pscustomobject]@{ |
| 257 | stimulusName = $name |
| 258 | baselineTrials = $b.Count |
| 259 | customizedTrials = $c.Count |
| 260 | baselinePassed = $basePassed |
| 261 | customizedPassed = $custPassed |
| 262 | baselinePassRate = if ($b.Count -gt 0) { [math]::Round($basePassed / [double]$b.Count, 4) } else { 0.0 } |
| 263 | customizedPassRate = if ($c.Count -gt 0) { [math]::Round($custPassed / [double]$c.Count, 4) } else { 0.0 } |
| 264 | identicalCount = $identical |
| 265 | identicalTotal = $trialCount |
| 266 | ties = [int]$tally.Ties |
| 267 | aWins = [int]$tally.AWins |
| 268 | bWins = [int]$tally.BWins |
| 269 | meanWallTimeDeltaMs = [math]::Round($meanWall, 2) |
| 270 | meanTokenDelta = [math]::Round($meanTokens, 2) |
| 271 | trials = $pairs |
| 272 | }) | Out-Null |
| 273 | } |
| 274 | |
| 275 | return , $merged |
| 276 | } |
| 277 | |
| 278 | function Edit-HtmlEscape { |
| 279 | [CmdletBinding()] |
| 280 | [OutputType([string])] |
| 281 | param([Parameter(Mandatory)][AllowEmptyString()][AllowNull()][string]$Text) |
| 282 | if ($null -eq $Text) { return '' } |
| 283 | return ($Text -replace '&', '&' -replace '<', '<' -replace '>', '>' -replace '"', '"' -replace "'", ''') |
| 284 | } |
| 285 | |
| 286 | function Get-VariantMetadata { |
| 287 | [CmdletBinding()] |
| 288 | [OutputType([hashtable])] |
| 289 | param( |
| 290 | [Parameter(Mandatory)] |
| 291 | [string]$VariantYamlPath, |
| 292 | [Parameter(Mandatory)] |
| 293 | [hashtable]$Default |
| 294 | ) |
| 295 | |
| 296 | $variant = @{} |
| 297 | foreach ($key in $Default.Keys) { $variant[$key] = $Default[$key] } |
| 298 | |
| 299 | if (-not (Test-Path -LiteralPath $VariantYamlPath)) { return $variant } |
| 300 | if (-not (Get-Module -ListAvailable -Name 'powershell-yaml')) { return $variant } |
| 301 | |
| 302 | try { |
| 303 | Import-Module powershell-yaml -ErrorAction Stop |
| 304 | $raw = Get-Content -LiteralPath $VariantYamlPath -Raw |
| 305 | $parsed = ConvertFrom-Yaml -Yaml $raw |
| 306 | if ($parsed) { |
| 307 | foreach ($key in @('kind', 'name', 'label', 'description', 'applied')) { |
| 308 | if ($parsed.ContainsKey($key)) { $variant[$key] = $parsed[$key] } |
| 309 | } |
| 310 | } |
| 311 | } |
| 312 | catch { |
| 313 | Write-Verbose "Failed to parse variant metadata at ${VariantYamlPath}: $($_.Exception.Message)" |
| 314 | } |
| 315 | |
| 316 | if (-not $variant.ContainsKey('applied') -or $null -eq $variant.applied) { $variant.applied = @() } |
| 317 | return $variant |
| 318 | } |
| 319 | |
| 320 | function ConvertTo-EquivalenceHtml { |
| 321 | [CmdletBinding()] |
| 322 | [OutputType([string])] |
| 323 | param( |
| 324 | [Parameter(Mandatory)][AllowEmptyCollection()][object[]]$Stimuli, |
| 325 | [Parameter(Mandatory)][string]$Model, |
| 326 | [Parameter(Mandatory)][string]$RunId, |
| 327 | [Parameter(Mandatory)][string]$Agent, |
| 328 | [hashtable]$Variants |
| 329 | ) |
| 330 | |
| 331 | $generatedAt = (Get-Date).ToUniversalTime().ToString('o') |
| 332 | $totalStimuli = $Stimuli.Count |
| 333 | $totalTrials = ($Stimuli | Measure-Object -Property identicalTotal -Sum).Sum |
| 334 | if (-not $totalTrials) { $totalTrials = 0 } |
| 335 | $totalIdentical = ($Stimuli | Measure-Object -Property identicalCount -Sum).Sum |
| 336 | if (-not $totalIdentical) { $totalIdentical = 0 } |
| 337 | $identicalPct = if ($totalTrials -gt 0) { [math]::Round(100 * $totalIdentical / [double]$totalTrials, 1) } else { 0 } |
| 338 | |
| 339 | $defaultVariantA = @{ kind = 'baseline'; name = 'baseline'; label = 'Baseline (A)'; description = ''; applied = @() } |
| 340 | $defaultVariantB = @{ kind = 'unknown'; name = 'customized'; label = 'Customized (B)'; description = ''; applied = @() } |
| 341 | $variantA = if ($Variants -and $Variants.a) { $Variants.a } else { $defaultVariantA } |
| 342 | $variantB = if ($Variants -and $Variants.b) { $Variants.b } else { $defaultVariantB } |
| 343 | $subject = if ($Variants -and $Variants.subject) { [string]$Variants.subject } else { [string]$variantB.name } |
| 344 | |
| 345 | $payload = [ordered]@{ |
| 346 | model = $Model |
| 347 | runId = $RunId |
| 348 | generatedAt = $generatedAt |
| 349 | totalStimuli = $totalStimuli |
| 350 | totalTrials = $totalTrials |
| 351 | identicalPct = $identicalPct |
| 352 | variants = @{ a = $variantA; b = $variantB; subject = $subject } |
| 353 | stimuli = $Stimuli |
| 354 | } |
| 355 | $json = $payload | ConvertTo-Json -Depth 100 -Compress |
| 356 | # Escape sequences that could break out of a <script> tag context (including '/' for </script> defense in depth). |
| 357 | $json = $json -replace '<', '\u003c' -replace '>', '\u003e' -replace '&', '\u0026' -replace '/', '\/' |
| 358 | |
| 359 | $modelEsc = Edit-HtmlEscape $Model |
| 360 | $runIdEsc = Edit-HtmlEscape $RunId |
| 361 | $agentEsc = Edit-HtmlEscape $Agent |
| 362 | $aLabelEsc = Edit-HtmlEscape ([string]$variantA.label) |
| 363 | $bLabelEsc = Edit-HtmlEscape ([string]$variantB.label) |
| 364 | $aKindEsc = Edit-HtmlEscape ([string]$variantA.kind) |
| 365 | $bKindEsc = Edit-HtmlEscape ([string]$variantB.kind) |
| 366 | $aDescEsc = Edit-HtmlEscape ([string]$variantA.description) |
| 367 | $bDescEsc = Edit-HtmlEscape ([string]$variantB.description) |
| 368 | $aAppliedList = if ($variantA.applied -and @($variantA.applied).Count -gt 0) { (@($variantA.applied) | ForEach-Object { '<li>' + (Edit-HtmlEscape ([string]$_)) + '</li>' }) -join '' } else { '<li><em>(none)</em></li>' } |
| 369 | $bAppliedList = if ($variantB.applied -and @($variantB.applied).Count -gt 0) { (@($variantB.applied) | ForEach-Object { '<li>' + (Edit-HtmlEscape ([string]$_)) + '</li>' }) -join '' } else { '<li><em>(none)</em></li>' } |
| 370 | $genEsc = Edit-HtmlEscape $generatedAt |
| 371 | |
| 372 | $css = @' |
| 373 | :root { color-scheme: light dark; } |
| 374 | body { font-family: -apple-system, Segoe UI, Roboto, sans-serif; margin: 0; padding: 1rem; } |
| 375 | header { border-bottom: 1px solid #888; padding-bottom: 0.5rem; margin-bottom: 1rem; } |
| 376 | header h1 { margin: 0 0 0.25rem 0; font-size: 1.4rem; } |
| 377 | .meta { font-size: 0.85rem; color: #666; } |
| 378 | .totals { display: flex; gap: 1.5rem; margin-top: 0.5rem; } |
| 379 | .totals div { font-size: 0.9rem; } |
| 380 | .totals strong { font-size: 1.1rem; } |
| 381 | .variant-strip { display: flex; gap: 1rem; margin: 1rem 0; flex-wrap: wrap; } |
| 382 | .variant-card { flex: 1; min-width: 280px; padding: 0.75rem 1rem; background: #f3f6fb; border: 1px solid #d0d7e2; border-radius: 6px; font-size: 0.85rem; } |
| 383 | .variant-card strong { color: #1a3a6b; } |
| 384 | .variant-kind { font-size: 0.75rem; color: #555; } |
| 385 | .variant-desc { margin-top: 0.35rem; color: #444; } |
| 386 | .variant-applied { margin-top: 0.5rem; font-size: 0.8rem; } |
| 387 | .variant-applied ul { margin: 0.15rem 0 0 1rem; padding: 0; } |
| 388 | @media (prefers-color-scheme: dark) { |
| 389 | .variant-card { background: #1a2230; border-color: #344056; } |
| 390 | .variant-card strong { color: #8ab4ff; } |
| 391 | .variant-kind { color: #aaa; } |
| 392 | .variant-desc { color: #ddd; } |
| 393 | } |
| 394 | input[type=search] { padding: 0.35rem 0.5rem; width: 320px; max-width: 100%; margin-bottom: 0.5rem; } |
| 395 | table { border-collapse: collapse; width: 100%; font-size: 0.85rem; } |
| 396 | th, td { border: 1px solid #ccc; padding: 0.35rem 0.5rem; text-align: left; } |
| 397 | th { background: #f0f0f0; cursor: pointer; user-select: none; position: sticky; top: 0; } |
| 398 | tr.summary:hover { background: #f6f6ff; cursor: pointer; } |
| 399 | tr.details { display: none; background: #fafafa; } |
| 400 | tr.details.open { display: table-row; } |
| 401 | tr.details td { padding: 0.75rem; } |
| 402 | .kind-group { margin-bottom: 0.75rem; } |
| 403 | .kind-group h4 { margin: 0.25rem 0; font-size: 0.9rem; } |
| 404 | .grader { font-size: 0.8rem; margin-left: 1rem; } |
| 405 | .diff { display: grid; grid-template-columns: 1fr 1fr; gap: 0.5rem; margin-top: 0.5rem; } |
| 406 | .diff h5 { margin: 0 0 0.25rem 0; font-size: 0.8rem; } |
| 407 | pre { background: #f5f5f5; padding: 0.5rem; border: 1px solid #ddd; overflow: auto; white-space: pre-wrap; max-height: 240px; margin: 0; } |
| 408 | .verdict-pass { color: #0a7d28; font-weight: bold; } |
| 409 | .verdict-warn { color: #b8860b; font-weight: bold; } |
| 410 | .verdict-fail { color: #b30000; font-weight: bold; } |
| 411 | @media (prefers-color-scheme: dark) { |
| 412 | th { background: #2a2a2a; } |
| 413 | tr.details { background: #1c1c1c; } |
| 414 | pre { background: #161616; border-color: #333; } |
| 415 | .meta { color: #aaa; } |
| 416 | } |
| 417 | '@ |
| 418 | |
| 419 | $js = @' |
| 420 | (function () { |
| 421 | var data = JSON.parse(document.getElementById('data').textContent); |
| 422 | var tbody = document.getElementById('rows'); |
| 423 | var search = document.getElementById('search'); |
| 424 | var sortKey = 'stimulusName'; |
| 425 | var sortDir = 1; |
| 426 | var aLabel = (data.variants && data.variants.a && data.variants.a.label) || 'Variant A'; |
| 427 | var bLabel = (data.variants && data.variants.b && data.variants.b.label) || 'Variant B'; |
| 428 | |
| 429 | function escapeHtml(s) { |
| 430 | return String(s == null ? '' : s) |
| 431 | .replace(/&/g, '&') |
| 432 | .replace(/</g, '<') |
| 433 | .replace(/>/g, '>') |
| 434 | .replace(/"/g, '"') |
| 435 | .replace(/'/g, '''); |
| 436 | } |
| 437 | |
| 438 | function verdictGlyph(s) { |
| 439 | if (s.identicalTotal === 0) return '<span class="verdict-warn">?</span>'; |
| 440 | var pct = s.identicalCount / s.identicalTotal; |
| 441 | if (pct === 1 && s.baselinePassRate === s.customizedPassRate) return '<span class="verdict-pass">=</span>'; |
| 442 | if (pct >= 0.8) return '<span class="verdict-warn">~</span>'; |
| 443 | return '<span class="verdict-fail">!=</span>'; |
| 444 | } |
| 445 | |
| 446 | function renderRows() { |
| 447 | var filter = search.value.toLowerCase(); |
| 448 | var rows = data.stimuli.filter(function (s) { |
| 449 | return !filter || s.stimulusName.toLowerCase().indexOf(filter) !== -1; |
| 450 | }).slice().sort(function (a, b) { |
| 451 | var av = a[sortKey], bv = b[sortKey]; |
| 452 | if (typeof av === 'string') return av.localeCompare(bv) * sortDir; |
| 453 | return ((av || 0) - (bv || 0)) * sortDir; |
| 454 | }); |
| 455 | tbody.innerHTML = rows.map(function (s, i) { |
| 456 | var trials = (s.trials || []).map(function (t) { |
| 457 | var bi = t.baseline || {}; |
| 458 | var ci = t.customized || {}; |
| 459 | var detailsHtml = ['code', 'llm', 'human', 'other'].map(function (kind) { |
| 460 | var bg = (bi.details && bi.details[kind]) || []; |
| 461 | var cg = (ci.details && ci.details[kind]) || []; |
| 462 | if (bg.length === 0 && cg.length === 0) return ''; |
| 463 | var fmt = function (g) { |
| 464 | return '<div class="grader">' + escapeHtml(g.name || '') + |
| 465 | ' — passed=' + escapeHtml(g.passed) + |
| 466 | ' score=' + escapeHtml(g.score) + |
| 467 | (g.evidence ? ' <em>' + escapeHtml(g.evidence) + '</em>' : '') + |
| 468 | '</div>'; |
| 469 | }; |
| 470 | return '<div class="kind-group"><h4>' + escapeHtml(kind) + '</h4>' + |
| 471 | '<div><strong>' + escapeHtml(aLabel) + ':</strong>' + bg.map(fmt).join('') + '</div>' + |
| 472 | '<div><strong>' + escapeHtml(bLabel) + ':</strong>' + cg.map(fmt).join('') + '</div></div>'; |
| 473 | }).join(''); |
| 474 | return '<div><strong>Trial ' + t.trial + '</strong>' + detailsHtml + |
| 475 | '<div class="diff"><div><h5>' + escapeHtml(aLabel) + ' output</h5><pre>' + escapeHtml(bi.output || '') + '</pre></div>' + |
| 476 | '<div><h5>' + escapeHtml(bLabel) + ' output</h5><pre>' + escapeHtml(ci.output || '') + '</pre></div></div></div>'; |
| 477 | }).join('<hr/>'); |
| 478 | |
| 479 | return '<tr class="summary" data-i="' + i + '">' + |
| 480 | '<td>' + escapeHtml(s.stimulusName) + '</td>' + |
| 481 | '<td>' + (s.baselinePassRate * 100).toFixed(1) + '%</td>' + |
| 482 | '<td>' + (s.customizedPassRate * 100).toFixed(1) + '%</td>' + |
| 483 | '<td>' + s.identicalCount + '/' + s.identicalTotal + '</td>' + |
| 484 | '<td>' + s.ties + '</td><td>' + s.aWins + '</td><td>' + s.bWins + '</td>' + |
| 485 | '<td>' + s.meanWallTimeDeltaMs + '</td>' + |
| 486 | '<td>' + s.meanTokenDelta + '</td>' + |
| 487 | '<td>' + verdictGlyph(s) + '</td>' + |
| 488 | '</tr>' + |
| 489 | '<tr class="details" data-i="' + i + '"><td colspan="10">' + trials + '</td></tr>'; |
| 490 | }).join(''); |
| 491 | } |
| 492 | |
| 493 | tbody.addEventListener('click', function (e) { |
| 494 | var tr = e.target.closest('tr.summary'); |
| 495 | if (!tr) return; |
| 496 | var i = tr.getAttribute('data-i'); |
| 497 | var det = tbody.querySelector('tr.details[data-i="' + i + '"]'); |
| 498 | if (det) det.classList.toggle('open'); |
| 499 | }); |
| 500 | |
| 501 | document.querySelectorAll('th[data-key]').forEach(function (th) { |
| 502 | th.addEventListener('click', function () { |
| 503 | var k = th.getAttribute('data-key'); |
| 504 | if (sortKey === k) { sortDir = -sortDir; } else { sortKey = k; sortDir = 1; } |
| 505 | renderRows(); |
| 506 | }); |
| 507 | }); |
| 508 | |
| 509 | search.addEventListener('input', renderRows); |
| 510 | renderRows(); |
| 511 | })(); |
| 512 | '@ |
| 513 | |
| 514 | $html = @" |
| 515 | <!doctype html> |
| 516 | <html lang="en"> |
| 517 | <head> |
| 518 | <meta charset="utf-8"> |
| 519 | <title>Baseline Equivalence Dashboard — $modelEsc — $runIdEsc</title> |
| 520 | <style> |
| 521 | $css |
| 522 | </style> |
| 523 | </head> |
| 524 | <body> |
| 525 | <header> |
| 526 | <h1>Baseline Equivalence Dashboard</h1> |
| 527 | <div class="meta">Agent: <strong>$agentEsc</strong> · Model: <strong>$modelEsc</strong> · Run: <strong>$runIdEsc</strong> · Generated: $genEsc</div> |
| 528 | <div class="totals"> |
| 529 | <div>Stimuli: <strong>$totalStimuli</strong></div> |
| 530 | <div>Total trials: <strong>$totalTrials</strong></div> |
| 531 | <div>Identical outputs: <strong>${identicalPct}%</strong></div> |
| 532 | </div> |
| 533 | <div class="variant-strip"> |
| 534 | <div class="variant-card"> |
| 535 | <div><strong>Variant A — $aLabelEsc</strong> <span class="variant-kind">[$aKindEsc]</span></div> |
| 536 | <div class="variant-desc">$aDescEsc</div> |
| 537 | <div class="variant-applied"><div>Applied:</div><ul>$aAppliedList</ul></div> |
| 538 | </div> |
| 539 | <div class="variant-card"> |
| 540 | <div><strong>Variant B — $bLabelEsc</strong> <span class="variant-kind">[$bKindEsc]</span></div> |
| 541 | <div class="variant-desc">$bDescEsc</div> |
| 542 | <div class="variant-applied"><div>Applied:</div><ul>$bAppliedList</ul></div> |
| 543 | </div> |
| 544 | </div> |
| 545 | </header> |
| 546 | <input id="search" type="search" placeholder="filter stimuli…"> |
| 547 | <table> |
| 548 | <thead><tr> |
| 549 | <th data-key="stimulusName">Stimulus</th> |
| 550 | <th data-key="baselinePassRate">$aLabelEsc pass</th> |
| 551 | <th data-key="customizedPassRate">$bLabelEsc pass</th> |
| 552 | <th data-key="identicalCount">Identical</th> |
| 553 | <th data-key="ties">Ties</th> |
| 554 | <th data-key="aWins">$aLabelEsc wins</th> |
| 555 | <th data-key="bWins">$bLabelEsc wins</th> |
| 556 | <th data-key="meanWallTimeDeltaMs">Wall Δ (ms)</th> |
| 557 | <th data-key="meanTokenDelta">Tokens Δ</th> |
| 558 | <th>Verdict</th> |
| 559 | </tr></thead> |
| 560 | <tbody id="rows"></tbody> |
| 561 | </table> |
| 562 | <script id="data" type="application/json">$json</script> |
| 563 | <script> |
| 564 | $js |
| 565 | </script> |
| 566 | </body> |
| 567 | </html> |
| 568 | "@ |
| 569 | |
| 570 | return $html |
| 571 | } |
| 572 | |
| 573 | function Get-AppliedArtifacts { |
| 574 | <# |
| 575 | .SYNOPSIS |
| 576 | Discovers the customization artifacts materialized under a workspace root. |
| 577 | .PARAMETER WorkspaceRoot |
| 578 | Absolute path to the materialized customized workspace (typically |
| 579 | evals/baseline-equivalence/customized/workspace). When missing, empty, |
| 580 | or not a directory the function returns an empty array without erroring. |
| 581 | .OUTPUTS |
| 582 | System.String[] of workspace-relative artifact paths using forward |
| 583 | slashes, sorted and de-duplicated by exact path. |
| 584 | .EXAMPLE |
| 585 | Get-AppliedArtifacts -WorkspaceRoot 'C:/repo/evals/baseline-equivalence/customized/workspace' |
| 586 | #> |
| 587 | [CmdletBinding()] |
| 588 | [OutputType([string[]])] |
| 589 | param( |
| 590 | [Parameter(Mandatory)] |
| 591 | [AllowEmptyString()] |
| 592 | [AllowNull()] |
| 593 | [string]$WorkspaceRoot |
| 594 | ) |
| 595 | |
| 596 | if ([string]::IsNullOrWhiteSpace($WorkspaceRoot)) { return @() } |
| 597 | if (-not (Test-Path -LiteralPath $WorkspaceRoot -PathType Container)) { return @() } |
| 598 | |
| 599 | $kinds = @( |
| 600 | @{ Anchor = '.github/agents'; Filter = '*.agent.md' }, |
| 601 | @{ Anchor = '.github/skills'; Filter = 'SKILL.md' }, |
| 602 | @{ Anchor = '.github/instructions'; Filter = '*.instructions.md' }, |
| 603 | @{ Anchor = '.github/prompts'; Filter = '*.prompt.md' } |
| 604 | ) |
| 605 | |
| 606 | $relatives = New-Object 'System.Collections.Generic.List[string]' |
| 607 | foreach ($kind in $kinds) { |
| 608 | $anchorPath = Join-Path $WorkspaceRoot $kind.Anchor |
| 609 | if (-not (Test-Path -LiteralPath $anchorPath -PathType Container)) { continue } |
| 610 | $files = Get-ChildItem -LiteralPath $anchorPath -Recurse -Filter $kind.Filter -File -ErrorAction SilentlyContinue |
| 611 | foreach ($file in $files) { |
| 612 | $rel = [IO.Path]::GetRelativePath($WorkspaceRoot, $file.FullName) -replace '\\', '/' |
| 613 | $relatives.Add($rel) |
| 614 | } |
| 615 | } |
| 616 | |
| 617 | return @($relatives | Sort-Object -Unique) |
| 618 | } |
| 619 | |
| 620 | Export-ModuleMember -Function ` |
| 621 | Measure-CompareTrials, ` |
| 622 | Measure-InvariantFailures, ` |
| 623 | Get-VerdictFromAggregate, ` |
| 624 | Get-OutputHash, ` |
| 625 | ConvertFrom-EquivalenceResults, ` |
| 626 | Merge-EquivalenceStimuli, ` |
| 627 | Edit-HtmlEscape, ` |
| 628 | Get-VariantMetadata, ` |
| 629 | ConvertTo-EquivalenceHtml, ` |
| 630 | Get-AppliedArtifacts |
| 631 | |