microsoft/hve-core
Public · mirrored from https://github.com/microsoft/hve-core · Available
scripts/linting/Modules/AdrBodyParser.psm1
465 lines · mode: code
| 1 | # Copyright (c) Microsoft Corporation. |
| 2 | # SPDX-License-Identifier: MIT |
| 3 | |
| 4 | # AdrBodyParser.psm1 |
| 5 | # |
| 6 | # Purpose: Shared body-section parser for ADR consistency validation. |
| 7 | # Extracts H2 sections, bullets, table rows, and path-shaped tokens |
| 8 | # from Architecture Decision Record markdown for downstream rule checks. |
| 9 | # Author: HVE Core Team |
| 10 | |
| 11 | #Requires -Version 7.0 |
| 12 | |
| 13 | #region Parsing Helpers |
| 14 | |
function Remove-AdrFencedCodeBlocks {
    <#
    .SYNOPSIS
    Removes fenced code blocks and inline code spans from ADR body text.
    .DESCRIPTION
    Strips ``` and ~~~ delimited fenced code blocks line-by-line so downstream
    section parsers do not pick up bullets, headings, or path tokens that appear
    inside code samples. Fences are matched on the trimmed line start to tolerate
    leading whitespace inside lists.

    Fence matching follows the CommonMark rules:
      * A closing fence must use the same character as the opening fence and be
        at least as long, so a four-backtick block may contain three-backtick
        lines without ending early.
      * A backtick run only opens a fence when the rest of the line contains no
        further backticks. A line that merely starts with a triple-backtick
        inline span is kept as ordinary text.

    A fence that is never closed swallows the remainder of the input.
    Unless -PreserveInlineCode is set, single-backtick inline code spans are
    also removed so path-shaped tokens inside `code` do not leak into rule scans.
    .PARAMETER Text
    The raw ADR body text (frontmatter already stripped).
    .PARAMETER PreserveInlineCode
    When set, retains single-backtick inline code spans. Multi-line fenced
    blocks are still stripped. Use this when the caller needs to detect
    path-shaped tokens that authors place inside inline code.
    .OUTPUTS
    The same text with fenced code-block lines (and optionally inline code
    spans) removed. Every kept line is written with StringBuilder.AppendLine,
    so the result ends with a line terminator. Empty input yields ''.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$Text,

        [switch]$PreserveInlineCode
    )

    if ([string]::IsNullOrEmpty($Text)) { return '' }

    $lines = $Text -split "`r?`n"
    $sb = [System.Text.StringBuilder]::new()

    # $closingFencePattern is $null while outside a fence; inside a fence it
    # matches a line made solely of the opening fence character.
    $closingFencePattern = $null
    $openingFenceLength = 0

    foreach ($line in $lines) {
        $trimmed = $line.TrimStart()

        if ($null -eq $closingFencePattern) {
            # Backtick fences may not carry backticks in their info string;
            # tilde fences have no such restriction.
            if ($trimmed -match '^(`{3,})[^`]*$' -or $trimmed -match '^(~{3,})') {
                $openingFenceLength = $matches[1].Length
                $closingFencePattern = '^(' + [regex]::Escape($matches[1].Substring(0, 1)) + '+)\s*$'
                continue
            }
            [void]$sb.AppendLine($line)
            continue
        }

        # Inside a fence: drop the line, and leave the fence only when the
        # closing run is at least as long as the run that opened it.
        if ($trimmed -match $closingFencePattern -and $matches[1].Length -ge $openingFenceLength) {
            $closingFencePattern = $null
            $openingFenceLength = 0
        }
    }

    $result = $sb.ToString()
    if (-not $PreserveInlineCode) {
        $result = $result -replace '`[^`]*`', ''
    }
    return $result
}
| 77 | |
function Get-AdrH2Section {
    <#
    .SYNOPSIS
    Returns the body of a single ATX H2 section by heading text.
    .DESCRIPTION
    Locates the first heading line of the form '## <HeadingText>'
    (case-insensitive, leading/trailing whitespace tolerated) and returns all
    text up to the next '## ' heading or end of input. Only H2 headings end the
    section, so nested H3+ content is included.

    Returns an empty string when the heading is not found, and also when the
    section has no lines at all (the heading is the last line, or is followed
    immediately by another H2 heading).
    .PARAMETER Text
    ADR body text with fenced code blocks already removed.
    .PARAMETER HeadingText
    Plain heading text (without leading '## '). It is regex-escaped before use.
    .OUTPUTS
    The section body text (lines joined with LF) or an empty string when missing.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$Text,

        [Parameter(Mandatory = $true)]
        [ValidateNotNullOrEmpty()]
        [string]$HeadingText
    )

    if ([string]::IsNullOrEmpty($Text)) { return '' }

    $lines = $Text -split "`r?`n"
    $startIndex = -1
    $headingPattern = '^\s*##\s+' + [regex]::Escape($HeadingText) + '\s*$'

    # The body starts on the line after the first matching heading.
    for ($i = 0; $i -lt $lines.Count; $i++) {
        if ($lines[$i] -match $headingPattern) {
            $startIndex = $i + 1
            break
        }
    }

    if ($startIndex -lt 0) { return '' }

    # The body ends just before the next H2 heading, or at end of input.
    $endIndex = $lines.Count
    for ($j = $startIndex; $j -lt $lines.Count; $j++) {
        if ($lines[$j] -match '^\s*##\s+\S') {
            $endIndex = $j
            break
        }
    }

    # Guard the empty section: PowerShell's range operator counts DOWN when the
    # right operand is smaller, so N..(N-1) would slice the neighbouring heading
    # lines in reverse instead of producing nothing.
    if ($endIndex -le $startIndex) { return '' }

    return ($lines[$startIndex..($endIndex - 1)] -join "`n")
}
| 131 | |
function Get-AdrH3SectionInH2 {
    <#
    .SYNOPSIS
    Returns the body of an ATX H3 subsection nested inside a named H2 section.
    .DESCRIPTION
    First locates the parent H2 via Get-AdrH2Section, then within that section
    scans for the first '### <HeadingText>' heading (case-insensitive,
    leading/trailing whitespace tolerated) and returns text up to the next
    '### ' heading or the end of the parent H2.

    Returns an empty string when either heading is missing, and also when the
    H3 subsection has no lines at all (the heading is the last line of the
    parent, or is followed immediately by another H3 heading).

    This supports MADR v4 canonical structure where 'Consequences' and
    'Confirmation' appear as H3 children of '## Decision Outcome' rather than
    as standalone H2 sections.
    .PARAMETER Text
    ADR body text with fenced code blocks already removed.
    .PARAMETER ParentH2
    Plain heading text of the enclosing H2 (without leading '## ').
    .PARAMETER HeadingText
    Plain heading text of the H3 subsection (without leading '### '). It is
    regex-escaped before use.
    .OUTPUTS
    The H3 subsection body text (lines joined with LF) or an empty string when
    missing.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$Text,

        [Parameter(Mandatory = $true)]
        [ValidateNotNullOrEmpty()]
        [string]$ParentH2,

        [Parameter(Mandatory = $true)]
        [ValidateNotNullOrEmpty()]
        [string]$HeadingText
    )

    $parent = Get-AdrH2Section -Text $Text -HeadingText $ParentH2
    if ([string]::IsNullOrEmpty($parent)) { return '' }

    $lines = $parent -split "`r?`n"
    $startIndex = -1
    $headingPattern = '^\s*###\s+' + [regex]::Escape($HeadingText) + '\s*$'

    # The body starts on the line after the first matching H3 heading.
    for ($i = 0; $i -lt $lines.Count; $i++) {
        if ($lines[$i] -match $headingPattern) {
            $startIndex = $i + 1
            break
        }
    }

    if ($startIndex -lt 0) { return '' }

    # The body ends just before the next H3 heading. Deeper headings (####+)
    # do not match this pattern and therefore stay inside the subsection.
    $endIndex = $lines.Count
    for ($j = $startIndex; $j -lt $lines.Count; $j++) {
        if ($lines[$j] -match '^\s*###\s+\S') {
            $endIndex = $j
            break
        }
    }

    # Guard the empty subsection: PowerShell's range operator counts DOWN when
    # the right operand is smaller, so N..(N-1) would slice the neighbouring
    # heading lines in reverse instead of producing nothing.
    if ($endIndex -le $startIndex) { return '' }

    return ($lines[$startIndex..($endIndex - 1)] -join "`n")
}
| 196 | |
function Get-AdrBulletItems {
    <#
    .SYNOPSIS
    Collects the text of top-level markdown bullets found in a section.
    .DESCRIPTION
    Walks the section line by line and keeps each line whose list marker
    ('*', '-', or '+') sits behind at most three spaces or tabs and is followed
    by whitespace and at least one more character. The marker and surrounding
    whitespace are dropped from the returned text. Lines indented by four or
    more characters are treated as nested and skipped.
    .PARAMETER SectionText
    Section body text returned by Get-AdrH2Section.
    .OUTPUTS
    String array of bullet item text, in document order. Empty input yields an
    empty array.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$SectionText
    )

    if ([string]::IsNullOrEmpty($SectionText)) { return @() }

    # Up to three leading blanks, one list marker, whitespace, then the text.
    $bulletShape = '^[ \t]{0,3}[\*\-\+]\s+(.+)$'
    $collected = [System.Collections.Generic.List[string]]::new()

    $sectionLines = $SectionText -split "`r?`n"
    for ($index = 0; $index -lt $sectionLines.Count; $index++) {
        if ($sectionLines[$index] -notmatch $bulletShape) { continue }
        $collected.Add($matches[1].Trim())
    }

    return $collected.ToArray()
}
| 229 | |
function Get-AdrTableRows {
    <#
    .SYNOPSIS
    Extracts the first-column cell value from every data row in a markdown table.
    .DESCRIPTION
    Detects pipe-delimited markdown tables, skips the header and the alignment
    separator row (the row containing only '-', ':', spaces, and pipes), and
    returns the trimmed first-column value of every remaining data row. Rows
    whose first cell is empty are skipped. Every table in the section
    contributes rows.

    Only lines whose trimmed text starts with '|' are considered table lines.
    A table is recognized as a pipe line (header) immediately followed by a
    separator row:
      * Any non-pipe line ends the current table and discards a pending header
        candidate, so a stray pipe-prefixed line cannot shift the
        header/separator alignment of a later table.
      * A pipe line followed by another pipe line that is not a separator is
        not a header; the later line becomes the new header candidate.
    .PARAMETER SectionText
    Section body text returned by Get-AdrH2Section.
    .OUTPUTS
    String array of first-column cell values, in document order.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$SectionText
    )

    $rows = [System.Collections.Generic.List[string]]::new()
    if ([string]::IsNullOrEmpty($SectionText)) { return @() }

    $lines = $SectionText -split "`r?`n"
    $sawHeader = $false
    $sawSeparator = $false

    foreach ($line in $lines) {
        $trimmed = $line.Trim()
        if (-not $trimmed.StartsWith('|')) {
            # Leaving pipe-delimited text always resets the state machine,
            # whether or not a separator row had been reached.
            $sawHeader = $false
            $sawSeparator = $false
            continue
        }

        if (-not $sawHeader) {
            $sawHeader = $true
            continue
        }

        if (-not $sawSeparator) {
            if ($trimmed -match '^\|[\s\-:|]+\|$') {
                $sawSeparator = $true
            }
            # Otherwise the previous pipe line was not a real header; this line
            # takes over as the header candidate ($sawHeader stays set).
            continue
        }

        # Data row: strip the outer pipes and keep the first cell.
        $cells = $trimmed.Trim('|') -split '\|'
        if ($cells.Count -ge 1) {
            $first = $cells[0].Trim()
            if ($first) { $rows.Add($first) }
        }
    }

    return $rows.ToArray()
}
| 288 | |
function Get-AdrPathTokens {
    <#
    .SYNOPSIS
    Collects path-shaped tokens from a section, de-duplicated in first-seen order.
    .DESCRIPTION
    Runs one regular expression over the whole section text. A token is a run
    of path-safe characters (letters, digits, '_', '.', '-') optionally
    continued by '/'-separated segments, and it must end in one of:
      * a dot-prefixed final segment after a slash (for example 'dir/.config'),
      * a dot followed by 1-8 alphanumerics (a file extension), or
      * a trailing slash.
    The token must not start directly after a path-safe character or '/', and
    must not be followed directly by a path-safe character. Text inside
    backticks is scanned like any other text.

    NOTE(review): the pattern does not require a slash, so slash-less names
    such as 'README.md' are also returned - confirm whether callers expect
    tokens to contain at least one '/'.
    .PARAMETER SectionText
    Section body text returned by Get-AdrH2Section.
    .OUTPUTS
    Distinct string array of path-shaped tokens preserving first-seen order.
    Duplicates are detected with ordinal (case-sensitive) comparison.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$SectionText
    )

    if ([string]::IsNullOrEmpty($SectionText)) { return @() }

    $pathShape = [regex]::new('(?<![A-Za-z0-9_\-./])([A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)*(?:/\.[A-Za-z0-9_][A-Za-z0-9_.-]*|\.[A-Za-z0-9]{1,8}|/))(?![A-Za-z0-9_.-])')
    $alreadySeen = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::Ordinal)
    $firstSeenOrder = [System.Collections.Generic.List[string]]::new()

    # Walk the matches one at a time; HashSet.Add returns $false for repeats,
    # which keeps only the first occurrence of each token.
    $hit = $pathShape.Match($SectionText)
    while ($hit.Success) {
        $candidate = $hit.Groups[1].Value.Trim()
        if ($candidate -and $alreadySeen.Add($candidate)) {
            $firstSeenOrder.Add($candidate)
        }
        $hit = $hit.NextMatch()
    }

    return $firstSeenOrder.ToArray()
}
| 325 | |
function Get-AdrBadConsequenceBullets {
    <#
    .SYNOPSIS
    Extracts bullets under the 'Bad' subsection of '## Consequences'.
    .DESCRIPTION
    Returns top-level bullets that appear after a heading (H3-H6) or a
    '*'/'_' prefixed line whose text begins with 'Bad' (case-insensitive)
    within the Consequences section. Collection stops at:
      * a heading or '*'/'_' prefixed line whose text begins with 'Good' or
        'Neutral', or
      * any other ATX heading that is a sibling or ancestor of the Bad marker
        (same or shallower level). When the Bad group was opened by a
        '*'/'_' prefixed line rather than a heading, any heading ends it.
    Headings deeper than the Bad heading are treated as part of the Bad group.
    A later Bad marker re-opens collection.

    NOTE(review): an asterisk bullet such as '* Bad, because ...' also
    satisfies the marker pattern, so it is treated as a group marker rather
    than collected as an item - confirm this is intended for ADRs that write
    consequences as inline 'Good, because' / 'Bad, because' bullets.
    .PARAMETER ConsequencesText
    Body text of the '## Consequences' section.
    .OUTPUTS
    String array of bullet item text under the Bad subsection.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$ConsequencesText
    )

    if ([string]::IsNullOrEmpty($ConsequencesText)) { return @() }

    $lines = $ConsequencesText -split "`r?`n"
    $inBad = $false
    # Heading level of the marker that opened the Bad group; 0 when it was
    # opened by a '*'/'_' prefixed line instead of a heading.
    $badHeadingLevel = 0
    $items = [System.Collections.Generic.List[string]]::new()

    foreach ($line in $lines) {
        $trimmed = $line.Trim()

        if ($trimmed -match '^(#{3,6}\s+|[\*_]{1,2})\s*Bad\b') {
            $inBad = $true
            $marker = $matches[1].TrimEnd()
            $badHeadingLevel = if ($marker.StartsWith('#')) { $marker.Length } else { 0 }
            continue
        }

        if (-not $inBad) { continue }

        if ($trimmed -match '^(#{3,6}\s+|[\*_]{1,2})\s*(Good|Neutral)\b') {
            $inBad = $false
            continue
        }

        # Any other sibling (or shallower) heading closes the Bad group so its
        # bullets are not misattributed; deeper headings stay inside the group.
        if ($trimmed -match '^(#{1,6})\s+\S') {
            if ($badHeadingLevel -eq 0 -or $matches[1].Length -le $badHeadingLevel) {
                $inBad = $false
            }
            continue
        }

        if ($line -match '^[ \t]{0,3}[\*\-\+]\s+(.+)$') {
            $items.Add($matches[1].Trim())
        }
    }

    return $items.ToArray()
}
| 368 | |
| 369 | #endregion Parsing Helpers |
| 370 | |
| 371 | #region Public API |
| 372 | |
function Get-AdrBodySections {
    <#
    .SYNOPSIS
    Turns an ADR markdown body into one structured object for consistency rules.
    .DESCRIPTION
    Sanitizes the body twice with Remove-AdrFencedCodeBlocks (once with inline
    code spans removed, once with them preserved), looks up the sections listed
    below, and runs the matching extractor on each. Heading lookups are
    case-insensitive and ATX style only.

      AffectedComponents           bullets under '## Affected Components'
      DecisionDrivers              bullets under '## Decision Drivers'
      DecisionOutcomeMatrixDrivers first-column table cells under
                                   '## Decision Outcome'
      BadConsequences              bullets of the 'Bad' group in Consequences
      RisksAndMitigationsRisks     first-column table cells under
                                   '## Risks and Mitigations'
      Confirmation                 raw Confirmation section text
      ContextPathTokens            path tokens in '## Context'
      MoreInformationPathTokens    path tokens in '## More Information'
      ConfirmationPathTokens       path tokens in Confirmation

    'Consequences' and 'Confirmation' are looked up as H2 sections first; when
    that lookup returns nothing, the '###' subsection of the same name inside
    '## Decision Outcome' is used instead (MADR v4 canonical layout).

    Path tokens are extracted from the text that still contains inline code
    spans; everything else uses the text with inline code removed.
    .PARAMETER Text
    ADR body markdown with frontmatter already stripped.
    .OUTPUTS
    PSCustomObject with the following properties:
      AffectedComponents           [string[]]
      DecisionDrivers              [string[]]
      DecisionOutcomeMatrixDrivers [string[]]
      BadConsequences              [string[]]
      RisksAndMitigationsRisks     [string[]]
      Confirmation                 [string]
      ContextPathTokens            [string[]]
      MoreInformationPathTokens    [string[]]
      ConfirmationPathTokens       [string[]]
    .EXAMPLE
    $body = Get-Content ./adr.md -Raw
    $sections = Get-AdrBodySections -Text $body
    $sections.AffectedComponents
    #>
    [CmdletBinding()]
    [OutputType([pscustomobject])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$Text
    )

    # H2 lookup with the MADR v4 fallback to an H3 nested in 'Decision Outcome'.
    $findWithOutcomeFallback = {
        param([string]$Body, [string]$Heading)

        $found = Get-AdrH2Section -Text $Body -HeadingText $Heading
        if ([string]::IsNullOrEmpty($found)) {
            $found = Get-AdrH3SectionInH2 -Text $Body -ParentH2 'Decision Outcome' -HeadingText $Heading
        }
        return $found
    }

    $withoutInlineCode = Remove-AdrFencedCodeBlocks -Text $Text
    $withInlineCode = Remove-AdrFencedCodeBlocks -Text $Text -PreserveInlineCode

    # Structural sections: inline code removed so code samples cannot pose as
    # bullets or table cells.
    $affectedBody = Get-AdrH2Section -Text $withoutInlineCode -HeadingText 'Affected Components'
    $driversBody = Get-AdrH2Section -Text $withoutInlineCode -HeadingText 'Decision Drivers'
    $outcomeBody = Get-AdrH2Section -Text $withoutInlineCode -HeadingText 'Decision Outcome'
    $consequencesBody = & $findWithOutcomeFallback $withoutInlineCode 'Consequences'
    $risksBody = Get-AdrH2Section -Text $withoutInlineCode -HeadingText 'Risks and Mitigations'
    $confirmationBody = & $findWithOutcomeFallback $withoutInlineCode 'Confirmation'

    # Path-token sections: inline code kept, because authors usually cite
    # affected paths inside `backticks`.
    $contextBodyInline = Get-AdrH2Section -Text $withInlineCode -HeadingText 'Context'
    $moreInfoBodyInline = Get-AdrH2Section -Text $withInlineCode -HeadingText 'More Information'
    $confirmationBodyInline = & $findWithOutcomeFallback $withInlineCode 'Confirmation'

    $affectedComponents = Get-AdrBulletItems -SectionText $affectedBody
    $decisionDrivers = Get-AdrBulletItems -SectionText $driversBody
    $outcomeMatrixDrivers = Get-AdrTableRows -SectionText $outcomeBody
    $badConsequences = Get-AdrBadConsequenceBullets -ConsequencesText $consequencesBody
    $risks = Get-AdrTableRows -SectionText $risksBody
    $contextTokens = Get-AdrPathTokens -SectionText $contextBodyInline
    $moreInfoTokens = Get-AdrPathTokens -SectionText $moreInfoBodyInline
    $confirmationTokens = Get-AdrPathTokens -SectionText $confirmationBodyInline

    return [pscustomobject]@{
        AffectedComponents           = $affectedComponents
        DecisionDrivers              = $decisionDrivers
        DecisionOutcomeMatrixDrivers = $outcomeMatrixDrivers
        BadConsequences              = $badConsequences
        RisksAndMitigationsRisks     = $risks
        Confirmation                 = $confirmationBody
        ContextPathTokens            = $contextTokens
        MoreInformationPathTokens    = $moreInfoTokens
        ConfirmationPathTokens       = $confirmationTokens
    }
}
| 462 | |
| 463 | #endregion Public API |
| 464 | |
# Public surface of the module: the aggregate parser and the sanitizer.
# The remaining Get-Adr* helpers are not exported.
Export-ModuleMember -Function @(
    'Get-AdrBodySections'
    'Remove-AdrFencedCodeBlocks'
)