microsoft/hve-core

Public

mirrored from https://github.com/microsoft/hve-core · Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
feat/1873-devcontainer

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

scripts/evals/lib/EquivalenceParsing.psm1

630lines · modecode

1# Copyright (c) Microsoft Corporation.
2# SPDX-License-Identifier: MIT
3
4#Requires -Version 7.0
5
6<#
7.SYNOPSIS
8 Shared parsing, aggregation, and rendering helpers for baseline-equivalence eval runs.
9
10.DESCRIPTION
11 Consolidates the compare-log and results.jsonl parsers used by
12 `Invoke-BaselineEquivalence.ps1` and the dashboard generator
13 `New-EquivalenceDashboard.ps1`. All public functions are exported via
14 `Export-ModuleMember` at the bottom of the file.
15#>
16
# Turn on the newest strict-mode rule set for the code in this file; with it enabled,
# reading a property that does not exist raises an error instead of yielding $null.
17Set-StrictMode -Version Latest
18
function Measure-CompareTrials {
    <#
    .SYNOPSIS
        Tallies tie / A-wins / B-wins verdicts found in compare-log lines.
    .PARAMETER Lines
        Raw log lines. ANSI escape sequences are stripped before matching;
        lines that do not look like a trial row are ignored.
    .OUTPUTS
        Hashtable with Total, Ties, AWins, BWins and PerStimulus, where
        PerStimulus maps a stimulus name to @{ Ties; AWins; BWins }.
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)]
        [AllowEmptyCollection()]
        [AllowEmptyString()]
        [string[]]$Lines
    )

    $escapeSequence = [regex]'\x1B\[[0-9;]*[A-Za-z]'
    $trialRow = '\s(?<stim>\S[^\n]*?\(trial\s+\d+\))\s{2,}(?<verdict>tie|A wins|B wins)\s{2,}\(score:\s*(?<score>[-+0-9.]+)\)\s*$'

    # Verdict text -> counter name. Hashtable literals compare keys
    # case-insensitively, mirroring the case-insensitive regex match.
    $counterFor = @{ 'tie' = 'Ties'; 'A wins' = 'AWins'; 'B wins' = 'BWins' }

    $overall = @{ Ties = 0; AWins = 0; BWins = 0 }
    $byStimulus = @{}
    $matched = 0

    foreach ($rawLine in $Lines) {
        $text = $escapeSequence.Replace($rawLine, '')
        if (-not ($text -match $trialRow)) { continue }

        $matched++
        $verdict = $Matches.verdict
        # Drop the trailing "(trial N)" marker so all trials of a stimulus share one key.
        $name = ($Matches.stim -replace '\s*\(trial\s+\d+\)\s*$', '').Trim()

        if (-not $byStimulus.ContainsKey($name)) {
            $byStimulus[$name] = @{ Ties = 0; AWins = 0; BWins = 0 }
        }

        $counter = $counterFor[$verdict]
        $overall[$counter] += 1
        $byStimulus[$name][$counter] += 1
    }

    return @{
        Total       = $matched
        Ties        = $overall['Ties']
        AWins       = $overall['AWins']
        BWins       = $overall['BWins']
        PerStimulus = $byStimulus
    }
}
50
function Measure-InvariantFailures {
    <#
    .SYNOPSIS
        Counts invariant table rows and how many of them did not pass.
    .PARAMETER Lines
        Raw log lines. ANSI escape sequences are stripped first; only
        pipe-delimited rows ending in a verdict glyph are counted.
    .OUTPUTS
        Hashtable with Total (rows carrying a verdict glyph) and Failed
        (rows whose glyph is anything other than the pass glyph).
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)]
        [AllowEmptyCollection()]
        [AllowEmptyString()]
        [string[]]$Lines
    )

    $escapeSequence = [regex]'\x1B\[[0-9;]*[A-Za-z]'

    # Verdict glyphs, built from code points so the file stays ASCII-only.
    $passGlyph = [char]::ConvertFromUtf32(0x2705)
    $failGlyph = [char]::ConvertFromUtf32(0x274C)
    $warnGlyph = [char]::ConvertFromUtf32(0x1F7E1)
    $glyphAlternation = @($passGlyph, $failGlyph, $warnGlyph) -join '|'

    $tableRow = '^\|\s*[^|\s][^|]*\|.*\|\s*(?<verdict>{0})(?:\s|$|<)' -f $glyphAlternation

    $rowCount = 0
    $failureCount = 0
    foreach ($rawLine in $Lines) {
        $text = $escapeSequence.Replace($rawLine, '')
        if (-not ($text -match $tableRow)) { continue }

        $rowCount++
        if ($Matches.verdict -eq $passGlyph) { continue }
        $failureCount++
    }

    return @{ Total = $rowCount; Failed = $failureCount }
}
77
function Get-VerdictFromAggregate {
    <#
    .SYNOPSIS
        Reduces aggregate trial counts to a single 'pass' / 'warn' / 'fail' verdict.
    .DESCRIPTION
        Returns 'fail' when there are no runs. Any invariant or divergence
        failure, or a tie ratio below 0.80, or an A/B win imbalance larger
        than half of the non-tie trials, yields the non-passing verdict:
        'warn' for tier 'pr' and 'fail' for every other tier.
    .OUTPUTS
        System.String: 'pass', 'warn' or 'fail'.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)][int]$Runs,
        [Parameter(Mandatory)][int]$Ties,
        [Parameter(Mandatory)][int]$AWins,
        [Parameter(Mandatory)][int]$BWins,
        [Parameter(Mandatory)][int]$InvariantFailures,
        [Parameter(Mandatory)][int]$DivergenceFailures,
        [Parameter(Mandatory)][string]$Tier
    )

    if ($Runs -le 0) { return 'fail' }

    # The 'pr' tier softens every non-passing outcome to a warning.
    $nonPassing = 'fail'
    if ($Tier -eq 'pr') { $nonPassing = 'warn' }

    $hardFailures = ($InvariantFailures -gt 0) -or ($DivergenceFailures -gt 0)
    if ($hardFailures) { return $nonPassing }

    $decided = $AWins + $BWins
    $balanced = $true
    if ($decided -ne 0) {
        $balanced = [math]::Abs($AWins - $BWins) -le ($decided * 0.5)
    }

    $shareOfTies = [double]$Ties / [double]$Runs
    if ($balanced -and $shareOfTies -ge 0.80) { return 'pass' }

    return $nonPassing
}
103
function Get-OutputHash {
    <#
    .SYNOPSIS
        Returns the lowercase hexadecimal SHA-256 digest of the UTF-8 bytes of a string.
    .PARAMETER Text
        Text to hash; the empty string is allowed.
    .OUTPUTS
        System.String: 64 lowercase hex characters.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param([Parameter(Mandatory)][AllowEmptyString()][string]$Text)

    $hasher = [System.Security.Cryptography.SHA256]::Create()
    try {
        $digest = $hasher.ComputeHash([System.Text.Encoding]::UTF8.GetBytes($Text))
        $hex = [System.Text.StringBuilder]::new($digest.Length * 2)
        foreach ($octet in $digest) {
            [void]$hex.Append($octet.ToString('x2'))
        }
        return $hex.ToString()
    }
    finally {
        # Release the hash algorithm instance even when hashing throws.
        $hasher.Dispose()
    }
}
116
function Get-EquivalenceOptionalValue {
    <#
    .SYNOPSIS
        Private helper: reads a property that may be absent without tripping strict mode.
    .OUTPUTS
        The property value, or $null when the object is $null or has no such property.
    #>
    [CmdletBinding()]
    param(
        [Parameter()][AllowNull()][object]$InputObject,
        [Parameter(Mandatory)][string]$Name
    )

    if ($null -eq $InputObject) { return $null }
    $property = $InputObject.PSObject.Properties[$Name]
    if ($null -eq $property) { return $null }
    # The unary comma keeps array values intact instead of unrolling them into the pipeline.
    return , $property.Value
}

function ConvertFrom-EquivalenceResults {
    <#
    .SYNOPSIS
        Reads every results.jsonl below a run directory into flat per-trial records.
    .DESCRIPTION
        Each non-blank line is parsed as JSON. Objects without a 'trajectory'
        property are skipped. Optional fields (stimulus, output, metrics,
        gradeResult and their children) are read through a strict-mode-safe
        accessor, so a record that omits one falls back to its default
        ('<unknown>', '', 0, $false, 0.0) instead of raising a
        property-not-found error under Set-StrictMode -Version Latest.
        The trial index counts occurrences of a stimulus name across all files,
        starting at 0.
    .PARAMETER RunDir
        Directory searched recursively for files named results.jsonl.
    .OUTPUTS
        A List[object] of records with: stimulusName, trial, output, outputHash,
        passed, score, wallTimeMs, totalTokens, details (hashtable keyed by
        code / llm / human / other).
    .NOTES
        Throws when RunDir does not exist or contains no results.jsonl.
        Grader details whose kind is missing or unrecognised are placed in the
        'other' bucket and reported with Write-Warning.
    #>
    [CmdletBinding()]
    [OutputType([System.Collections.IList])]
    param(
        [Parameter(Mandatory)][string]$RunDir
    )

    if (-not (Test-Path -LiteralPath $RunDir)) {
        throw "Run directory not found: $RunDir"
    }

    $jsonlFiles = @(Get-ChildItem -LiteralPath $RunDir -Filter 'results.jsonl' -Recurse -File)
    if ($jsonlFiles.Count -eq 0) {
        throw "No results.jsonl found under $RunDir"
    }

    $records = New-Object 'System.Collections.Generic.List[object]'
    $stimulusCounts = @{}
    $knownKinds = @('code', 'llm', 'human')

    foreach ($file in $jsonlFiles) {
        $lines = Get-Content -LiteralPath $file.FullName -Encoding utf8
        foreach ($line in $lines) {
            if ([string]::IsNullOrWhiteSpace($line)) { continue }
            $obj = $line | ConvertFrom-Json -Depth 100
            # A literal JSON null line parses to $null; treat it like a record without a trajectory.
            if ($null -eq $obj) { continue }
            if (-not ($obj.PSObject.Properties['trajectory'])) { continue }
            $traj = $obj.trajectory

            $stimulus = Get-EquivalenceOptionalValue -InputObject $traj -Name 'stimulus'
            $stim = if ($stimulus) {
                [string](Get-EquivalenceOptionalValue -InputObject $stimulus -Name 'name')
            }
            else { '<unknown>' }

            # Trial index = number of earlier records seen for the same stimulus.
            if (-not $stimulusCounts.ContainsKey($stim)) { $stimulusCounts[$stim] = 0 }
            $trial = $stimulusCounts[$stim]
            $stimulusCounts[$stim] = $trial + 1

            $rawOutput = Get-EquivalenceOptionalValue -InputObject $traj -Name 'output'
            $output = if ($null -ne $rawOutput) { [string]$rawOutput } else { '' }

            $wallMs = 0
            $totalTokens = 0
            $metrics = Get-EquivalenceOptionalValue -InputObject $traj -Name 'metrics'
            if ($metrics) {
                $rawWall = Get-EquivalenceOptionalValue -InputObject $metrics -Name 'wallTimeMs'
                if ($null -ne $rawWall) { $wallMs = [int]$rawWall }

                $usage = Get-EquivalenceOptionalValue -InputObject $metrics -Name 'tokenUsage'
                if ($usage) {
                    $rawTokens = Get-EquivalenceOptionalValue -InputObject $usage -Name 'totalTokens'
                    if ($null -ne $rawTokens) { $totalTokens = [int]$rawTokens }
                }
            }

            $passed = $false
            $score = 0.0
            $details = @{ code = @(); llm = @(); human = @(); other = @() }
            $gr = Get-EquivalenceOptionalValue -InputObject $obj -Name 'gradeResult'
            if ($gr) {
                $rawPassed = Get-EquivalenceOptionalValue -InputObject $gr -Name 'passed'
                if ($null -ne $rawPassed) { $passed = [bool]$rawPassed }

                $rawScore = Get-EquivalenceOptionalValue -InputObject $gr -Name 'score'
                if ($null -ne $rawScore) { $score = [double]$rawScore }

                $rawDetails = Get-EquivalenceOptionalValue -InputObject $gr -Name 'details'
                if ($rawDetails) {
                    foreach ($d in @($rawDetails)) {
                        $rawKind = Get-EquivalenceOptionalValue -InputObject $d -Name 'kind'
                        $kind = if ($rawKind) { [string]$rawKind } else { 'other' }
                        if ($knownKinds -notcontains $kind) {
                            Write-Warning "ConvertFrom-EquivalenceResults: unknown grader kind '$kind' for stimulus '$stim' (trial $trial); bucketing under 'other'."
                            $details.other += $d
                        }
                        else {
                            $details[$kind] += $d
                        }
                    }
                }
            }

            $records.Add([pscustomobject]@{
                    stimulusName = $stim
                    trial        = $trial
                    output       = $output
                    outputHash   = Get-OutputHash -Text $output
                    passed       = $passed
                    score        = $score
                    wallTimeMs   = $wallMs
                    totalTokens  = $totalTokens
                    details      = $details
                }) | Out-Null
        }
    }

    # Unary comma returns the list itself rather than its enumerated items.
    return , $records
}
196
function Merge-EquivalenceStimuli {
    <#
    .SYNOPSIS
        Pairs baseline and customized trial records per stimulus and summarises them.
    .PARAMETER Baseline
        Trial records for variant A (objects with stimulusName, outputHash,
        passed, wallTimeMs, totalTokens).
    .PARAMETER Customized
        Trial records for variant B, same shape as Baseline.
    .PARAMETER Compare
        Hashtable that may carry a 'PerStimulus' entry mapping a stimulus name
        to @{ Ties; AWins; BWins }; stimuli without an entry get zeros.
    .OUTPUTS
        A List[object] with one summary per stimulus name, sorted by name.
        Trials are paired by position; an unpaired trial has $null on the
        missing side and is excluded from the identical count and the deltas.
    #>
    [CmdletBinding()]
    [OutputType([System.Collections.IList])]
    param(
        [Parameter(Mandatory)][AllowEmptyCollection()][object[]]$Baseline,
        [Parameter(Mandatory)][AllowEmptyCollection()][object[]]$Customized,
        [Parameter(Mandatory)][hashtable]$Compare
    )

    # Bucket both record sets by stimulus name.
    $baselineByName = @{}
    foreach ($record in $Baseline) {
        $key = $record.stimulusName
        if (-not $baselineByName.ContainsKey($key)) { $baselineByName[$key] = @() }
        $baselineByName[$key] += $record
    }
    $customizedByName = @{}
    foreach ($record in $Customized) {
        $key = $record.stimulusName
        if (-not $customizedByName.ContainsKey($key)) { $customizedByName[$key] = @() }
        $customizedByName[$key] += $record
    }

    $tallies = @{}
    if ($Compare.ContainsKey('PerStimulus')) { $tallies = $Compare.PerStimulus }

    # Union of the stimulus names seen on either side.
    $names = [System.Collections.Generic.HashSet[string]]::new()
    foreach ($key in $baselineByName.Keys) { [void]$names.Add($key) }
    foreach ($key in $customizedByName.Keys) { [void]$names.Add($key) }
    $orderedNames = @($names) | Sort-Object

    $summaries = New-Object 'System.Collections.Generic.List[object]'

    foreach ($name in $orderedNames) {
        [object[]]$baseRuns = @()
        if ($baselineByName.ContainsKey($name)) { $baseRuns = @($baselineByName[$name]) }
        [object[]]$custRuns = @()
        if ($customizedByName.ContainsKey($name)) { $custRuns = @($customizedByName[$name]) }

        $pairCount = [math]::Max($baseRuns.Count, $custRuns.Count)

        $sameOutput = 0
        $wallDeltas = New-Object 'System.Collections.Generic.List[double]'
        $tokenDeltas = New-Object 'System.Collections.Generic.List[double]'
        $trialPairs = New-Object 'System.Collections.Generic.List[object]'

        for ($index = 0; $index -lt $pairCount; $index++) {
            $left = $null
            if ($index -lt $baseRuns.Count) { $left = $baseRuns[$index] }
            $right = $null
            if ($index -lt $custRuns.Count) { $right = $custRuns[$index] }

            if ($left -and $right) {
                if ($left.outputHash -eq $right.outputHash) { $sameOutput++ }
                $wallDeltas.Add([double]($right.wallTimeMs - $left.wallTimeMs))
                $tokenDeltas.Add([double]($right.totalTokens - $left.totalTokens))
            }

            $trialPairs.Add([pscustomobject]@{
                    trial      = $index
                    baseline   = $left
                    customized = $right
                })
        }

        $basePassCount = 0
        foreach ($run in $baseRuns) { if ($run.passed) { $basePassCount++ } }
        $custPassCount = 0
        foreach ($run in $custRuns) { if ($run.passed) { $custPassCount++ } }

        $tally = @{ Ties = 0; AWins = 0; BWins = 0 }
        if ($tallies.ContainsKey($name)) { $tally = $tallies[$name] }

        $avgWall = 0.0
        if ($wallDeltas.Count -gt 0) { $avgWall = ($wallDeltas | Measure-Object -Average).Average }
        $avgTokens = 0.0
        if ($tokenDeltas.Count -gt 0) { $avgTokens = ($tokenDeltas | Measure-Object -Average).Average }

        $baseRate = 0.0
        if ($baseRuns.Count -gt 0) { $baseRate = [math]::Round($basePassCount / [double]$baseRuns.Count, 4) }
        $custRate = 0.0
        if ($custRuns.Count -gt 0) { $custRate = [math]::Round($custPassCount / [double]$custRuns.Count, 4) }

        $summaries.Add([pscustomobject]@{
                stimulusName        = $name
                baselineTrials      = $baseRuns.Count
                customizedTrials    = $custRuns.Count
                baselinePassed      = $basePassCount
                customizedPassed    = $custPassCount
                baselinePassRate    = $baseRate
                customizedPassRate  = $custRate
                identicalCount      = $sameOutput
                identicalTotal      = $pairCount
                ties                = [int]$tally.Ties
                aWins               = [int]$tally.AWins
                bWins               = [int]$tally.BWins
                meanWallTimeDeltaMs = [math]::Round($avgWall, 2)
                meanTokenDelta      = [math]::Round($avgTokens, 2)
                trials              = $trialPairs
            })
    }

    # Unary comma returns the list itself rather than its enumerated items.
    return , $summaries
}
277
function Edit-HtmlEscape {
    <#
    .SYNOPSIS
        Escapes the five HTML-significant characters (& < > " ') in a string.
    .PARAMETER Text
        Text to escape; empty input yields an empty string.
    .OUTPUTS
        System.String with &amp; &lt; &gt; &quot; and &#39; substituted.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param([Parameter(Mandatory)][AllowEmptyString()][AllowNull()][string]$Text)

    if ($null -eq $Text) { return '' }

    # '&' must be handled first so the entities produced below are not escaped twice.
    $substitutions = @(
        @('&', '&amp;'),
        @('<', '&lt;'),
        @('>', '&gt;'),
        @('"', '&quot;'),
        @("'", '&#39;')
    )

    $escaped = $Text
    foreach ($pair in $substitutions) {
        $escaped = $escaped.Replace($pair[0], $pair[1])
    }
    return $escaped
}
285
function Get-VariantMetadata {
    <#
    .SYNOPSIS
        Builds a variant descriptor from defaults, overlaid with values from a YAML file.
    .DESCRIPTION
        Starts from a copy of -Default (the caller's hashtable is not modified).
        When the YAML file exists and the 'powershell-yaml' module is available,
        the keys kind, name, label, description and applied found at the root of
        the parsed document replace the defaults. A parse failure is reported
        with Write-Verbose and the defaults are kept.
        The returned hashtable always has an 'applied' entry: on every path,
        including a missing file or a missing module, an absent or $null value
        is replaced by an empty array.
    .PARAMETER VariantYamlPath
        Path of the YAML metadata file; it may not exist.
    .PARAMETER Default
        Fallback values copied into the result.
    .OUTPUTS
        Hashtable holding the merged variant metadata.
    #>
    [CmdletBinding()]
    [OutputType([hashtable])]
    param(
        [Parameter(Mandatory)]
        [string]$VariantYamlPath,
        [Parameter(Mandatory)]
        [hashtable]$Default
    )

    $variant = @{}
    foreach ($key in $Default.Keys) { $variant[$key] = $Default[$key] }

    # -and short-circuits, so the module lookup only runs when the file is present.
    $canParse = (Test-Path -LiteralPath $VariantYamlPath) -and
        [bool](Get-Module -ListAvailable -Name 'powershell-yaml')

    if ($canParse) {
        try {
            Import-Module powershell-yaml -ErrorAction Stop
            $raw = Get-Content -LiteralPath $VariantYamlPath -Raw
            $parsed = ConvertFrom-Yaml -Yaml $raw
            # Only a mapping at the document root can supply keys; scalars and lists are ignored.
            if ($parsed -is [System.Collections.IDictionary]) {
                foreach ($key in @('kind', 'name', 'label', 'description', 'applied')) {
                    if ($parsed.Contains($key)) { $variant[$key] = $parsed[$key] }
                }
            }
        }
        catch {
            Write-Verbose "Failed to parse variant metadata at ${VariantYamlPath}: $($_.Exception.Message)"
        }
    }

    # Single exit point: guarantee 'applied' regardless of which path was taken above.
    if (-not $variant.ContainsKey('applied') -or $null -eq $variant['applied']) {
        $variant['applied'] = @()
    }
    return $variant
}
319
320function ConvertTo-EquivalenceHtml {
    <#
    .SYNOPSIS
        Renders the baseline-equivalence dashboard as a single HTML document string.
    .DESCRIPTION
        Builds a payload (model, run id, generation time, totals, variants and the
        stimuli themselves), serialises it to JSON inside a
        <script type="application/json"> element, and emits a page with an inline
        stylesheet and an inline script. The inline script reads that JSON and
        renders a filterable, sortable table with expandable per-trial details.
    .PARAMETER Stimuli
        Per-stimulus summary objects. This function sums their identicalTotal and
        identicalCount properties; the objects are embedded in the payload as-is.
    .PARAMETER Model
        Model name shown in the title and header and stored in the payload.
    .PARAMETER RunId
        Run identifier shown in the title and header and stored in the payload.
    .PARAMETER Agent
        Agent name shown in the header.
    .PARAMETER Variants
        Optional hashtable. Its 'a' and 'b' entries describe the two variants
        (kind, name, label, description, applied); 'subject' overrides the payload
        subject. Built-in defaults are used for any entry that is absent or empty.
    .OUTPUTS
        System.String containing the complete HTML document.
    #>
321 [CmdletBinding()]
322 [OutputType([string])]
323 param(
324 [Parameter(Mandatory)][AllowEmptyCollection()][object[]]$Stimuli,
325 [Parameter(Mandatory)][string]$Model,
326 [Parameter(Mandatory)][string]$RunId,
327 [Parameter(Mandatory)][string]$Agent,
328 [hashtable]$Variants
329 )
330
    # Timestamp in UTC, round-trip ('o') format.
331 $generatedAt = (Get-Date).ToUniversalTime().ToString('o')
332 $totalStimuli = $Stimuli.Count
    # Header totals; a missing or zero sum is normalised to 0 so the percentage guard below works.
333 $totalTrials = ($Stimuli | Measure-Object -Property identicalTotal -Sum).Sum
334 if (-not $totalTrials) { $totalTrials = 0 }
335 $totalIdentical = ($Stimuli | Measure-Object -Property identicalCount -Sum).Sum
336 if (-not $totalIdentical) { $totalIdentical = 0 }
337 $identicalPct = if ($totalTrials -gt 0) { [math]::Round(100 * $totalIdentical / [double]$totalTrials, 1) } else { 0 }
338
    # Fallback descriptors used when -Variants does not provide 'a' / 'b'.
339 $defaultVariantA = @{ kind = 'baseline'; name = 'baseline'; label = 'Baseline (A)'; description = ''; applied = @() }
340 $defaultVariantB = @{ kind = 'unknown'; name = 'customized'; label = 'Customized (B)'; description = ''; applied = @() }
341 $variantA = if ($Variants -and $Variants.a) { $Variants.a } else { $defaultVariantA }
342 $variantB = if ($Variants -and $Variants.b) { $Variants.b } else { $defaultVariantB }
    # The subject defaults to variant B's name.
343 $subject = if ($Variants -and $Variants.subject) { [string]$Variants.subject } else { [string]$variantB.name }
344
    # Data handed to the inline script; ordered so the JSON key order is stable.
345 $payload = [ordered]@{
346 model = $Model
347 runId = $RunId
348 generatedAt = $generatedAt
349 totalStimuli = $totalStimuli
350 totalTrials = $totalTrials
351 identicalPct = $identicalPct
352 variants = @{ a = $variantA; b = $variantB; subject = $subject }
353 stimuli = $Stimuli
354 }
355 $json = $payload | ConvertTo-Json -Depth 100 -Compress
356 # Escape sequences that could break out of a <script> tag context (including '/' for </script> defense in depth).
357 $json = $json -replace '<', '\u003c' -replace '>', '\u003e' -replace '&', '\u0026' -replace '/', '\/'
358
    # Everything interpolated into the markup template below is HTML-escaped first.
359 $modelEsc = Edit-HtmlEscape $Model
360 $runIdEsc = Edit-HtmlEscape $RunId
361 $agentEsc = Edit-HtmlEscape $Agent
362 $aLabelEsc = Edit-HtmlEscape ([string]$variantA.label)
363 $bLabelEsc = Edit-HtmlEscape ([string]$variantB.label)
364 $aKindEsc = Edit-HtmlEscape ([string]$variantA.kind)
365 $bKindEsc = Edit-HtmlEscape ([string]$variantB.kind)
366 $aDescEsc = Edit-HtmlEscape ([string]$variantA.description)
367 $bDescEsc = Edit-HtmlEscape ([string]$variantB.description)
    # One <li> per applied artifact, or a single "(none)" placeholder item.
368 $aAppliedList = if ($variantA.applied -and @($variantA.applied).Count -gt 0) { (@($variantA.applied) | ForEach-Object { '<li>' + (Edit-HtmlEscape ([string]$_)) + '</li>' }) -join '' } else { '<li><em>(none)</em></li>' }
369 $bAppliedList = if ($variantB.applied -and @($variantB.applied).Count -gt 0) { (@($variantB.applied) | ForEach-Object { '<li>' + (Edit-HtmlEscape ([string]$_)) + '</li>' }) -join '' } else { '<li><em>(none)</em></li>' }
370 $genEsc = Edit-HtmlEscape $generatedAt
371
    # Inline stylesheet. Single-quoted here-string: emitted literally, no variable expansion.
372 $css = @'
373:root { color-scheme: light dark; }
374body { font-family: -apple-system, Segoe UI, Roboto, sans-serif; margin: 0; padding: 1rem; }
375header { border-bottom: 1px solid #888; padding-bottom: 0.5rem; margin-bottom: 1rem; }
376header h1 { margin: 0 0 0.25rem 0; font-size: 1.4rem; }
377.meta { font-size: 0.85rem; color: #666; }
378.totals { display: flex; gap: 1.5rem; margin-top: 0.5rem; }
379.totals div { font-size: 0.9rem; }
380.totals strong { font-size: 1.1rem; }
381.variant-strip { display: flex; gap: 1rem; margin: 1rem 0; flex-wrap: wrap; }
382.variant-card { flex: 1; min-width: 280px; padding: 0.75rem 1rem; background: #f3f6fb; border: 1px solid #d0d7e2; border-radius: 6px; font-size: 0.85rem; }
383.variant-card strong { color: #1a3a6b; }
384.variant-kind { font-size: 0.75rem; color: #555; }
385.variant-desc { margin-top: 0.35rem; color: #444; }
386.variant-applied { margin-top: 0.5rem; font-size: 0.8rem; }
387.variant-applied ul { margin: 0.15rem 0 0 1rem; padding: 0; }
388@media (prefers-color-scheme: dark) {
389 .variant-card { background: #1a2230; border-color: #344056; }
390 .variant-card strong { color: #8ab4ff; }
391 .variant-kind { color: #aaa; }
392 .variant-desc { color: #ddd; }
393}
394input[type=search] { padding: 0.35rem 0.5rem; width: 320px; max-width: 100%; margin-bottom: 0.5rem; }
395table { border-collapse: collapse; width: 100%; font-size: 0.85rem; }
396th, td { border: 1px solid #ccc; padding: 0.35rem 0.5rem; text-align: left; }
397th { background: #f0f0f0; cursor: pointer; user-select: none; position: sticky; top: 0; }
398tr.summary:hover { background: #f6f6ff; cursor: pointer; }
399tr.details { display: none; background: #fafafa; }
400tr.details.open { display: table-row; }
401tr.details td { padding: 0.75rem; }
402.kind-group { margin-bottom: 0.75rem; }
403.kind-group h4 { margin: 0.25rem 0; font-size: 0.9rem; }
404.grader { font-size: 0.8rem; margin-left: 1rem; }
405.diff { display: grid; grid-template-columns: 1fr 1fr; gap: 0.5rem; margin-top: 0.5rem; }
406.diff h5 { margin: 0 0 0.25rem 0; font-size: 0.8rem; }
407pre { background: #f5f5f5; padding: 0.5rem; border: 1px solid #ddd; overflow: auto; white-space: pre-wrap; max-height: 240px; margin: 0; }
408.verdict-pass { color: #0a7d28; font-weight: bold; }
409.verdict-warn { color: #b8860b; font-weight: bold; }
410.verdict-fail { color: #b30000; font-weight: bold; }
411@media (prefers-color-scheme: dark) {
412 th { background: #2a2a2a; }
413 tr.details { background: #1c1c1c; }
414 pre { background: #161616; border-color: #333; }
415 .meta { color: #aaa; }
416}
417'@
418
    # Inline script, also literal. It parses the JSON from the element with id 'data',
    # renders the table rows, and wires up filtering, column sorting and row expansion.
419 $js = @'
420(function () {
421 var data = JSON.parse(document.getElementById('data').textContent);
422 var tbody = document.getElementById('rows');
423 var search = document.getElementById('search');
424 var sortKey = 'stimulusName';
425 var sortDir = 1;
426 var aLabel = (data.variants && data.variants.a && data.variants.a.label) || 'Variant A';
427 var bLabel = (data.variants && data.variants.b && data.variants.b.label) || 'Variant B';
428
429 function escapeHtml(s) {
430 return String(s == null ? '' : s)
431 .replace(/&/g, '&amp;')
432 .replace(/</g, '&lt;')
433 .replace(/>/g, '&gt;')
434 .replace(/"/g, '&quot;')
435 .replace(/'/g, '&#39;');
436 }
437
438 function verdictGlyph(s) {
439 if (s.identicalTotal === 0) return '<span class="verdict-warn">?</span>';
440 var pct = s.identicalCount / s.identicalTotal;
441 if (pct === 1 && s.baselinePassRate === s.customizedPassRate) return '<span class="verdict-pass">=</span>';
442 if (pct >= 0.8) return '<span class="verdict-warn">~</span>';
443 return '<span class="verdict-fail">!=</span>';
444 }
445
446 function renderRows() {
447 var filter = search.value.toLowerCase();
448 var rows = data.stimuli.filter(function (s) {
449 return !filter || s.stimulusName.toLowerCase().indexOf(filter) !== -1;
450 }).slice().sort(function (a, b) {
451 var av = a[sortKey], bv = b[sortKey];
452 if (typeof av === 'string') return av.localeCompare(bv) * sortDir;
453 return ((av || 0) - (bv || 0)) * sortDir;
454 });
455 tbody.innerHTML = rows.map(function (s, i) {
456 var trials = (s.trials || []).map(function (t) {
457 var bi = t.baseline || {};
458 var ci = t.customized || {};
459 var detailsHtml = ['code', 'llm', 'human', 'other'].map(function (kind) {
460 var bg = (bi.details && bi.details[kind]) || [];
461 var cg = (ci.details && ci.details[kind]) || [];
462 if (bg.length === 0 && cg.length === 0) return '';
463 var fmt = function (g) {
464 return '<div class="grader">' + escapeHtml(g.name || '') +
465 ' &mdash; passed=' + escapeHtml(g.passed) +
466 ' score=' + escapeHtml(g.score) +
467 (g.evidence ? ' <em>' + escapeHtml(g.evidence) + '</em>' : '') +
468 '</div>';
469 };
470 return '<div class="kind-group"><h4>' + escapeHtml(kind) + '</h4>' +
471 '<div><strong>' + escapeHtml(aLabel) + ':</strong>' + bg.map(fmt).join('') + '</div>' +
472 '<div><strong>' + escapeHtml(bLabel) + ':</strong>' + cg.map(fmt).join('') + '</div></div>';
473 }).join('');
474 return '<div><strong>Trial ' + t.trial + '</strong>' + detailsHtml +
475 '<div class="diff"><div><h5>' + escapeHtml(aLabel) + ' output</h5><pre>' + escapeHtml(bi.output || '') + '</pre></div>' +
476 '<div><h5>' + escapeHtml(bLabel) + ' output</h5><pre>' + escapeHtml(ci.output || '') + '</pre></div></div></div>';
477 }).join('<hr/>');
478
479 return '<tr class="summary" data-i="' + i + '">' +
480 '<td>' + escapeHtml(s.stimulusName) + '</td>' +
481 '<td>' + (s.baselinePassRate * 100).toFixed(1) + '%</td>' +
482 '<td>' + (s.customizedPassRate * 100).toFixed(1) + '%</td>' +
483 '<td>' + s.identicalCount + '/' + s.identicalTotal + '</td>' +
484 '<td>' + s.ties + '</td><td>' + s.aWins + '</td><td>' + s.bWins + '</td>' +
485 '<td>' + s.meanWallTimeDeltaMs + '</td>' +
486 '<td>' + s.meanTokenDelta + '</td>' +
487 '<td>' + verdictGlyph(s) + '</td>' +
488 '</tr>' +
489 '<tr class="details" data-i="' + i + '"><td colspan="10">' + trials + '</td></tr>';
490 }).join('');
491 }
492
493 tbody.addEventListener('click', function (e) {
494 var tr = e.target.closest('tr.summary');
495 if (!tr) return;
496 var i = tr.getAttribute('data-i');
497 var det = tbody.querySelector('tr.details[data-i="' + i + '"]');
498 if (det) det.classList.toggle('open');
499 });
500
501 document.querySelectorAll('th[data-key]').forEach(function (th) {
502 th.addEventListener('click', function () {
503 var k = th.getAttribute('data-key');
504 if (sortKey === k) { sortDir = -sortDir; } else { sortKey = k; sortDir = 1; }
505 renderRows();
506 });
507 });
508
509 search.addEventListener('input', renderRows);
510 renderRows();
511})();
512'@
513
    # Page template. Double-quoted here-string: the escaped values, $css, $json and $js are expanded here.
514 $html = @"
515<!doctype html>
516<html lang="en">
517<head>
518<meta charset="utf-8">
519<title>Baseline Equivalence Dashboard &mdash; $modelEsc &mdash; $runIdEsc</title>
520<style>
521$css
522</style>
523</head>
524<body>
525<header>
526<h1>Baseline Equivalence Dashboard</h1>
527<div class="meta">Agent: <strong>$agentEsc</strong> &middot; Model: <strong>$modelEsc</strong> &middot; Run: <strong>$runIdEsc</strong> &middot; Generated: $genEsc</div>
528<div class="totals">
529<div>Stimuli: <strong>$totalStimuli</strong></div>
530<div>Total trials: <strong>$totalTrials</strong></div>
531<div>Identical outputs: <strong>${identicalPct}%</strong></div>
532</div>
533<div class="variant-strip">
534<div class="variant-card">
535<div><strong>Variant A &mdash; $aLabelEsc</strong> <span class="variant-kind">[$aKindEsc]</span></div>
536<div class="variant-desc">$aDescEsc</div>
537<div class="variant-applied"><div>Applied:</div><ul>$aAppliedList</ul></div>
538</div>
539<div class="variant-card">
540<div><strong>Variant B &mdash; $bLabelEsc</strong> <span class="variant-kind">[$bKindEsc]</span></div>
541<div class="variant-desc">$bDescEsc</div>
542<div class="variant-applied"><div>Applied:</div><ul>$bAppliedList</ul></div>
543</div>
544</div>
545</header>
546<input id="search" type="search" placeholder="filter stimuli&hellip;">
547<table>
548<thead><tr>
549<th data-key="stimulusName">Stimulus</th>
550<th data-key="baselinePassRate">$aLabelEsc pass</th>
551<th data-key="customizedPassRate">$bLabelEsc pass</th>
552<th data-key="identicalCount">Identical</th>
553<th data-key="ties">Ties</th>
554<th data-key="aWins">$aLabelEsc wins</th>
555<th data-key="bWins">$bLabelEsc wins</th>
556<th data-key="meanWallTimeDeltaMs">Wall &Delta; (ms)</th>
557<th data-key="meanTokenDelta">Tokens &Delta;</th>
558<th>Verdict</th>
559</tr></thead>
560<tbody id="rows"></tbody>
561</table>
562<script id="data" type="application/json">$json</script>
563<script>
564$js
565</script>
566</body>
567</html>
568"@
569
570 return $html
571}
572
function Get-AppliedArtifacts {
    <#
    .SYNOPSIS
    Discovers the customization artifacts materialized under a workspace root.
    .DESCRIPTION
    Looks below four anchors (.github/agents, .github/skills,
    .github/instructions, .github/prompts) for files matching each anchor's
    name filter and reports them relative to the workspace root.
    The root is resolved to an absolute path first, so a relative
    WorkspaceRoot is interpreted against the current PowerShell location for
    both the file search and the relative-path computation.
    .PARAMETER WorkspaceRoot
    Path to the materialized customized workspace (typically
    evals/baseline-equivalence/customized/workspace). When missing, empty,
    or not a directory the function returns an empty array without erroring.
    .OUTPUTS
    System.String[] of workspace-relative artifact paths using forward
    slashes, sorted and de-duplicated by exact (case-sensitive) path.
    .EXAMPLE
    Get-AppliedArtifacts -WorkspaceRoot 'C:/repo/evals/baseline-equivalence/customized/workspace'
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory)]
        [AllowEmptyString()]
        [AllowNull()]
        [string]$WorkspaceRoot
    )

    if ([string]::IsNullOrWhiteSpace($WorkspaceRoot)) { return @() }
    if (-not (Test-Path -LiteralPath $WorkspaceRoot -PathType Container)) { return @() }

    # [IO.Path]::GetRelativePath resolves relative arguments against the process
    # working directory, which PowerShell does not keep in sync with its own
    # location. Anchor everything on the resolved absolute path instead.
    $rootFull = (Resolve-Path -LiteralPath $WorkspaceRoot).ProviderPath

    $kinds = @(
        @{ Anchor = '.github/agents'; Filter = '*.agent.md' },
        @{ Anchor = '.github/skills'; Filter = 'SKILL.md' },
        @{ Anchor = '.github/instructions'; Filter = '*.instructions.md' },
        @{ Anchor = '.github/prompts'; Filter = '*.prompt.md' }
    )

    # Ordinal set: Sort-Object -Unique compares case-insensitively and would
    # merge paths that differ only by letter case.
    $seen = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::Ordinal)
    $relatives = New-Object 'System.Collections.Generic.List[string]'
    foreach ($kind in $kinds) {
        $anchorPath = Join-Path $rootFull $kind.Anchor
        if (-not (Test-Path -LiteralPath $anchorPath -PathType Container)) { continue }
        $files = Get-ChildItem -LiteralPath $anchorPath -Recurse -Filter $kind.Filter -File -ErrorAction SilentlyContinue
        foreach ($file in $files) {
            $rel = [IO.Path]::GetRelativePath($rootFull, $file.FullName) -replace '\\', '/'
            if ($seen.Add($rel)) { $relatives.Add($rel) }
        }
    }

    return @($relatives | Sort-Object)
}
619
# Public surface of the module; functions not listed here stay private.
Export-ModuleMember -Function @(
    'Measure-CompareTrials'
    'Measure-InvariantFailures'
    'Get-VerdictFromAggregate'
    'Get-OutputHash'
    'ConvertFrom-EquivalenceResults'
    'Merge-EquivalenceStimuli'
    'Edit-HtmlEscape'
    'Get-VariantMetadata'
    'ConvertTo-EquivalenceHtml'
    'Get-AppliedArtifacts'
)
631