microsoft/hve-core
Public · mirrored from https://github.com/microsoft/hve-core · Available
scripts/evals/Modules/CorpusReader.psm1
90lines · modecode
| 1 | # Copyright (c) Microsoft Corporation. |
| 2 | # SPDX-License-Identifier: MIT |
| 3 | # CorpusReader.psm1 |
| 4 | # Purpose: Read AI corpus markdown files with YAML frontmatter stripping for moderation input. |
| 5 | #Requires -Version 7.0 |
| 6 | |
<#
.SYNOPSIS
Returns the markdown body of a file with the YAML frontmatter block removed.

.DESCRIPTION
Reads a UTF-8 markdown file and strips a leading YAML frontmatter block: a `---`
line at the very start of the file, followed by the first later line that is
exactly `---`. The closing delimiter may be terminated by a newline or by the end
of the file. When no complete frontmatter block is present (including an opening
`---` that is never closed) the original content is returned unchanged.

.PARAMETER Path
Absolute or relative path to the markdown file. The path is used literally; wildcard
characters are not expanded.

.OUTPUTS
System.String - File body without frontmatter. An empty string is returned for an
empty file.

.NOTES
Throws "Corpus file not found: <Path>" when the path does not exist.
#>
function Get-CorpusArtifactBody {
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory = $true)]
        [string]$Path
    )

    if (-not (Test-Path -LiteralPath $Path)) {
        throw "Corpus file not found: $Path"
    }

    $content = Get-Content -LiteralPath $Path -Raw -Encoding utf8
    if ([string]::IsNullOrEmpty($content)) {
        return ''
    }

    # \A pins the opening delimiter to the very start of the file.
    # Each loop iteration consumes exactly one line ([^\n]* cannot cross a newline),
    # so there is only one way to walk the lines and the scan stays linear even when
    # no closing delimiter exists. A '.'-based loop under Singleline can split the
    # same lines in exponentially many ways and stalls on unclosed frontmatter.
    # The closing '---' may end with LF, CRLF, or the end of the file.
    $pattern = '\A---\r?\n(?:[^\n]*\n)*?---(?:\r?\n|\z)'
    return [regex]::Replace($content, $pattern, '')
}
| 43 | |
<#
.SYNOPSIS
Selects the AI corpus markdown paths listed in a changed-artifacts manifest.

.DESCRIPTION
Parses `logs/changed-ai-artifacts.json` (or any JSON document with the same shape)
and walks its `artifacts` entries. Backslashes in each entry's `path` are converted
to forward slashes, and the path is kept when it is a `.md` file under
`.github/agents`, `.github/prompts`, `.github/instructions`, or `.github/skills`.
Entries whose `status` is `removed` are skipped.

.PARAMETER ManifestPath
Location of the changed-artifacts JSON manifest.

.OUTPUTS
System.String[] - Repository-relative paths of corpus markdown files to moderate.
Nothing is emitted when the manifest has no `artifacts` or none of them qualify.
#>
function Get-CorpusArtifactPaths {
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory = $true)]
        [string]$ManifestPath
    )

    $manifestExists = Test-Path -LiteralPath $ManifestPath
    if (-not $manifestExists) {
        throw "Manifest not found: $ManifestPath"
    }

    $rawJson = Get-Content -LiteralPath $ManifestPath -Raw -Encoding utf8
    $manifest = $rawJson | ConvertFrom-Json
    if (-not $manifest.artifacts) {
        return @()
    }

    # -match is case-insensitive, so the directory names and the extension are
    # accepted in any casing.
    $corpusPattern = '^\.github/(agents|prompts|instructions|skills)/.+\.md$'

    $selected = $manifest.artifacts | ForEach-Object {
        # Normalize Windows-style separators before testing against the pattern.
        $normalized = ($_.path -replace '\\', '/')
        if ($_.status -ne 'removed' -and $normalized -match $corpusPattern) {
            $normalized
        }
    }

    return @($selected)
}
| 86 | |
# Public surface of this module: the two corpus reader functions.
Export-ModuleMember -Function 'Get-CorpusArtifactBody', 'Get-CorpusArtifactPaths'
| 91 | |