microsoft/hve-core

Public

mirrored from https://github.com/microsoft/hve-core · Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
feat/1873-devcontainer

Branches

Tags

  • No tags available.
0 Branches · 0 Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

scripts/linting/Modules/AdrBodyParser.psm1

465lines · modecode

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: MIT

# AdrBodyParser.psm1
#
# Purpose: Shared body-section parser for ADR consistency validation.
# Extracts H2 sections, bullets, table rows, and path-shaped tokens
# from Architecture Decision Record markdown for downstream rule checks.
# Author: HVE Core Team

#Requires -Version 7.0

#region Parsing Helpers
14
function Remove-AdrFencedCodeBlocks {
    <#
    .SYNOPSIS
    Removes fenced code blocks and inline code spans from ADR body text.
    .DESCRIPTION
    Strips ``` and ~~~ delimited fenced code blocks line-by-line so downstream
    section parsers do not pick up bullets, headings, or path tokens that appear
    inside code samples. Fences are matched on the trimmed line start to tolerate
    leading whitespace inside lists.

    Fence matching follows the CommonMark rules that matter for ADR bodies:
      * A closing fence must use the same character as the opening fence and be
        at least as long, so a ``` line inside a ```` block does not end it.
      * A backtick run whose remainder of the line contains another backtick is
        not an opening fence (it is an inline span such as ```x```), so the
        line is kept as ordinary text.

    After fence removal, single-backtick inline code spans are also removed
    unless -PreserveInlineCode is set.
    .PARAMETER Text
    The raw ADR body text (frontmatter already stripped).
    .PARAMETER PreserveInlineCode
    When set, retains single-backtick inline code spans. Multi-line fenced
    blocks are still stripped. Use this when the caller needs to detect
    path-shaped tokens that authors place inside inline code.
    .OUTPUTS
    The same text with fenced code-block lines (and optionally inline code
    spans) removed. Kept lines are re-joined with the platform newline.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$Text,

        [switch]$PreserveInlineCode
    )

    if ([string]::IsNullOrEmpty($Text)) { return '' }

    $lines = $Text -split "`r?`n"
    $sb = [System.Text.StringBuilder]::new()
    $inFence = $false
    $closePattern = $null

    foreach ($line in $lines) {
        $trimmed = $line.TrimStart()

        if ($inFence) {
            # Everything inside the fence is dropped, including the closing fence line.
            if ($trimmed -match $closePattern) {
                $inFence = $false
                $closePattern = $null
            }
            continue
        }

        # Opening fence: 3+ tildes, or 3+ backticks with no further backtick on
        # the line. The lookahead also prevents the run from being shortened by
        # backtracking, so $matches[1] is always the full opening run.
        if ($trimmed -match '^(`{3,}(?!.*`)|~{3,})') {
            $fenceRun = $matches[1]
            $fenceChar = [regex]::Escape($fenceRun.Substring(0, 1))
            # The closing fence needs at least as many markers as the opening one.
            $closePattern = '^(?:' + $fenceChar + '){' + $fenceRun.Length + ',}\s*$'
            $inFence = $true
            continue
        }

        [void]$sb.AppendLine($line)
    }

    $result = $sb.ToString()
    if (-not $PreserveInlineCode) {
        $result = $result -replace '`[^`]*`', ''
    }
    return $result
}
77
function Get-AdrH2Section {
    <#
    .SYNOPSIS
    Returns the body of a single ATX H2 section by heading text.
    .DESCRIPTION
    Locates the first heading line of the form '## <HeadingText>'
    (case-insensitive, leading/trailing whitespace tolerated) and returns all
    lines up to, but not including, the next '## ' heading or the end of input.
    Returns an empty string when the heading is not found or when the section
    has no body lines (the heading is immediately followed by another H2 or is
    the last line of the input).
    .PARAMETER Text
    ADR body text with fenced code blocks already removed.
    .PARAMETER HeadingText
    Plain heading text (without leading '## ').
    .OUTPUTS
    The section body text (lines joined with LF) or an empty string when the
    section is missing or empty.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$Text,

        [Parameter(Mandatory = $true)]
        [ValidateNotNullOrEmpty()]
        [string]$HeadingText
    )

    if ([string]::IsNullOrEmpty($Text)) { return '' }

    $lines = $Text -split "`r?`n"
    $startIndex = -1
    $headingPattern = '^\s*##\s+' + [regex]::Escape($HeadingText) + '\s*$'

    for ($i = 0; $i -lt $lines.Count; $i++) {
        if ($lines[$i] -match $headingPattern) {
            $startIndex = $i + 1
            break
        }
    }

    if ($startIndex -lt 0) { return '' }

    $endIndex = $lines.Count
    for ($j = $startIndex; $j -lt $lines.Count; $j++) {
        if ($lines[$j] -match '^\s*##\s+\S') {
            $endIndex = $j
            break
        }
    }

    # Guard the empty section: without this, $startIndex..($endIndex - 1) is a
    # descending range (e.g. 1..0) and would return the neighbouring heading
    # lines in reverse order instead of nothing.
    if ($endIndex -le $startIndex) { return '' }

    return ($lines[$startIndex..($endIndex - 1)] -join "`n")
}
131
function Get-AdrH3SectionInH2 {
    <#
    .SYNOPSIS
    Returns the body of an ATX H3 subsection nested inside a named H2 section.
    .DESCRIPTION
    First obtains the parent H2 body via Get-AdrH2Section, then scans that body
    for the first '### <HeadingText>' heading (case-insensitive,
    leading/trailing whitespace tolerated) and returns the lines up to, but not
    including, the next '### ' heading or the end of the parent H2 body.
    Returns an empty string when either heading is missing or when the
    subsection has no body lines.

    This supports MADR v4 canonical structure where 'Consequences' and
    'Confirmation' appear as H3 children of '## Decision Outcome' rather than
    as standalone H2 sections.
    .PARAMETER Text
    ADR body text with fenced code blocks already removed.
    .PARAMETER ParentH2
    Plain heading text of the enclosing H2 (without leading '## ').
    .PARAMETER HeadingText
    Plain heading text of the H3 subsection (without leading '### ').
    .OUTPUTS
    The H3 subsection body text (lines joined with LF) or an empty string when
    the subsection is missing or empty.
    #>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$Text,

        [Parameter(Mandatory = $true)]
        [ValidateNotNullOrEmpty()]
        [string]$ParentH2,

        [Parameter(Mandatory = $true)]
        [ValidateNotNullOrEmpty()]
        [string]$HeadingText
    )

    $parent = Get-AdrH2Section -Text $Text -HeadingText $ParentH2
    if ([string]::IsNullOrEmpty($parent)) { return '' }

    $lines = $parent -split "`r?`n"
    $startIndex = -1
    $headingPattern = '^\s*###\s+' + [regex]::Escape($HeadingText) + '\s*$'

    for ($i = 0; $i -lt $lines.Count; $i++) {
        if ($lines[$i] -match $headingPattern) {
            $startIndex = $i + 1
            break
        }
    }

    if ($startIndex -lt 0) { return '' }

    $endIndex = $lines.Count
    for ($j = $startIndex; $j -lt $lines.Count; $j++) {
        if ($lines[$j] -match '^\s*###\s+\S') {
            $endIndex = $j
            break
        }
    }

    # Guard the empty subsection: a descending range such as 3..2 would
    # otherwise return the adjacent heading lines instead of an empty body.
    if ($endIndex -le $startIndex) { return '' }

    return ($lines[$startIndex..($endIndex - 1)] -join "`n")
}
196
function Get-AdrBulletItems {
    <#
    .SYNOPSIS
    Collects the text of top-level markdown bullets in a section.
    .DESCRIPTION
    Walks the section line by line and keeps every line whose list marker
    ('*', '-', or '+') is preceded by at most three spaces/tabs and followed
    by whitespace and some text. The text after the marker is trimmed and
    returned. Lines with deeper indentation (for example nested bullets
    indented four or more spaces) do not match and are ignored.
    .PARAMETER SectionText
    Section body text, typically the output of Get-AdrH2Section.
    .OUTPUTS
    String array of bullet item text in document order.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$SectionText
    )

    if ([string]::IsNullOrEmpty($SectionText)) { return @() }

    $bulletPattern = '^[ \t]{0,3}[\*\-\+]\s+(.+)$'
    $bullets = [System.Collections.Generic.List[string]]::new()
    $sectionLines = $SectionText -split "`r?`n"

    for ($index = 0; $index -lt $sectionLines.Count; $index++) {
        $isBullet = $sectionLines[$index] -match $bulletPattern
        if (-not $isBullet) { continue }

        # Capture group 1 is the text following the list marker.
        $bullets.Add($matches[1].Trim())
    }

    return $bullets.ToArray()
}
229
function Get-AdrTableRows {
    <#
    .SYNOPSIS
    Extracts the first-column cell value from every data row in a markdown table.
    .DESCRIPTION
    Detects pipe-delimited markdown tables (rows whose trimmed text starts
    with '|'), skips the header row and the alignment separator row (the row
    containing only '-', ':', whitespace, and pipes), and returns the trimmed,
    non-empty first-column value of every remaining data row. Every table in
    the section contributes rows.

    Table detection is a small state machine:
      * Any line that does not start with '|' ends the current table and also
        discards a pending header candidate, so a stray pipe line earlier in
        the section cannot desynchronize a later table.
      * A pipe row is only treated as a header when the row right after it is
        a separator row. If the following pipe row is not a separator, that
        following row becomes the new header candidate.
    .PARAMETER SectionText
    Section body text returned by Get-AdrH2Section.
    .OUTPUTS
    String array of first-column cell values.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$SectionText
    )

    $rows = [System.Collections.Generic.List[string]]::new()
    if ([string]::IsNullOrEmpty($SectionText)) { return @() }

    $lines = $SectionText -split "`r?`n"
    $sawHeader = $false
    $sawSeparator = $false

    foreach ($line in $lines) {
        $trimmed = $line.Trim()

        if (-not $trimmed.StartsWith('|')) {
            # A non-table line always resets the state, including a header
            # candidate that never got its separator row.
            $sawHeader = $false
            $sawSeparator = $false
            continue
        }

        if (-not $sawHeader) {
            $sawHeader = $true
            continue
        }

        if (-not $sawSeparator) {
            if ($trimmed -match '^\|[\s\-:|]+\|$') {
                $sawSeparator = $true
            }
            # Otherwise the current row replaces the previous row as the header
            # candidate ($sawHeader stays true), so a table that directly
            # follows another pipe line is still recognized.
            continue
        }

        $cells = $trimmed.Trim('|') -split '\|'
        if ($cells.Count -ge 1) {
            $first = $cells[0].Trim()
            if ($first) { $rows.Add($first) }
        }
    }

    return $rows.ToArray()
}
288
function Get-AdrPathTokens {
    <#
    .SYNOPSIS
    Collects path-shaped tokens from a section, de-duplicated in first-seen order.
    .DESCRIPTION
    Runs a single regular expression over the section text and keeps each
    distinct match (ordinal, case-sensitive comparison). A token is a run of
    path-safe characters (letters, digits, '_', '.', '-') with optional
    '/'-separated segments that ends in one of:
      * a dot followed by 1-8 letters/digits (a file extension),
      * a final segment that starts with a dot (for example '/.gitignore'), or
      * a trailing slash (a directory).
    The token must not be directly preceded or followed by another path-safe
    character, so tokens wrapped in backticks or markdown punctuation are
    found as well. Note that the pattern does not itself require a slash:
    bare names such as 'README.md' also match.
    .PARAMETER SectionText
    Section body text, typically the output of Get-AdrH2Section.
    .OUTPUTS
    Distinct string array of path-shaped tokens preserving first-seen order.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$SectionText
    )

    if ([string]::IsNullOrEmpty($SectionText)) { return @() }

    $pathPattern = '(?<![A-Za-z0-9_\-./])([A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)*(?:/\.[A-Za-z0-9_][A-Za-z0-9_.-]*|\.[A-Za-z0-9]{1,8}|/))(?![A-Za-z0-9_.-])'

    $firstSeen = [System.Collections.Generic.List[string]]::new()
    $known = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::Ordinal)

    $hits = [regex]::Matches($SectionText, $pathPattern)
    for ($k = 0; $k -lt $hits.Count; $k++) {
        $candidate = $hits[$k].Groups[1].Value.Trim()
        if ([string]::IsNullOrEmpty($candidate)) { continue }

        # HashSet.Add returns $false for a token that was already recorded.
        if (-not $known.Add($candidate)) { continue }
        $firstSeen.Add($candidate)
    }

    return $firstSeen.ToArray()
}
325
function Get-AdrBadConsequenceBullets {
    <#
    .SYNOPSIS
    Extracts bullets under the 'Bad' subsection of '## Consequences'.
    .DESCRIPTION
    Returns top-level bullets that appear after a group label whose text
    begins with 'Bad' (case-insensitive) within the Consequences section.
    A group label is either an ATX heading of level 3-6 ('### Bad') or a line
    that opens with emphasis markers directly attached to the word
    ('**Bad**', '*Bad*', '_Bad_'). Collection stops at the next group label
    that begins with 'Good' or 'Neutral' and resumes at any later 'Bad' label.

    A '*' list marker followed by whitespace is a bullet, not emphasis, so
    bullets such as '* Bad interop with legacy tools' or '* Good luck
    debugging' inside the Bad group are collected as items rather than being
    mistaken for group labels.
    .PARAMETER ConsequencesText
    Body text of the '## Consequences' section.
    .OUTPUTS
    String array of bullet item text under the Bad subsection.
    #>
    [CmdletBinding()]
    [OutputType([string[]])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$ConsequencesText
    )

    if ([string]::IsNullOrEmpty($ConsequencesText)) { return @() }

    # '(?!\s)' after the emphasis markers keeps '* ' (a list marker) from being
    # read as an emphasis-prefixed group label.
    $badLabelPattern = '^(#{3,6}\s+|[\*_]{1,2}(?!\s))\s*Bad\b'
    $otherLabelPattern = '^(#{3,6}\s+|[\*_]{1,2}(?!\s))\s*(Good|Neutral)\b'

    $lines = $ConsequencesText -split "`r?`n"
    $inBad = $false
    $items = [System.Collections.Generic.List[string]]::new()

    foreach ($line in $lines) {
        $trimmed = $line.Trim()
        $isGroupStart = $trimmed -match $badLabelPattern
        $isOtherGroup = $trimmed -match $otherLabelPattern

        if ($isGroupStart) { $inBad = $true; continue }
        if ($inBad -and $isOtherGroup) { $inBad = $false; continue }

        # Bullets are matched on the untrimmed line so that only markers with
        # at most three leading spaces/tabs count as top-level items.
        if ($inBad -and ($line -match '^[ \t]{0,3}[\*\-\+]\s+(.+)$')) {
            $items.Add($matches[1].Trim())
        }
    }

    return $items.ToArray()
}
368
#endregion Parsing Helpers

#region Public API
372
function Get-AdrBodySections {
    <#
    .SYNOPSIS
    Turns an ADR markdown body into one structured object for consistency rules.
    .DESCRIPTION
    Produces two sanitized copies of the body via Remove-AdrFencedCodeBlocks:
    one with fenced blocks and inline code removed, and one that keeps inline
    code spans. Sections are then located by ATX heading (case-insensitive)
    and reduced to the values the rule registry consumes:

      * '## Affected Components'   - top-level bullets
      * '## Decision Drivers'      - top-level bullets
      * '## Decision Outcome'      - first-column values of table data rows
      * 'Consequences'             - bullets under the 'Bad' group
      * '## Risks and Mitigations' - first-column values of table data rows
      * 'Confirmation'             - raw section text, plus its path tokens
      * '## Context'               - path tokens
      * '## More Information'      - path tokens

    'Consequences' and 'Confirmation' are looked up as H2 sections first. When
    that lookup returns an empty string, the '### <name>' subsection nested in
    '## Decision Outcome' is used instead (MADR v4 canonical layout).

    Path tokens are extracted from the copy that keeps inline code so that
    paths written inside backticks are still found.
    .PARAMETER Text
    ADR body markdown with frontmatter already stripped.
    .OUTPUTS
    PSCustomObject with the following properties:
    AffectedComponents [string[]]
    DecisionDrivers [string[]]
    DecisionOutcomeMatrixDrivers [string[]]
    BadConsequences [string[]]
    RisksAndMitigationsRisks [string[]]
    Confirmation [string]
    ContextPathTokens [string[]]
    MoreInformationPathTokens [string[]]
    ConfirmationPathTokens [string[]]
    .EXAMPLE
    $body = Get-Content ./adr.md -Raw
    $sections = Get-AdrBodySections -Text $body
    $sections.AffectedComponents
    #>
    [CmdletBinding()]
    [OutputType([pscustomobject])]
    param(
        [Parameter(Mandatory = $true)]
        [AllowEmptyString()]
        [string]$Text
    )

    # Looks a section up at H2 level and falls back to the H3 of the same name
    # under '## Decision Outcome' when the H2 lookup comes back empty.
    $resolveSection = {
        param([string]$Body, [string]$Heading)

        $found = Get-AdrH2Section -Text $Body -HeadingText $Heading
        if ([string]::IsNullOrEmpty($found)) {
            $found = Get-AdrH3SectionInH2 -Text $Body -ParentH2 'Decision Outcome' -HeadingText $Heading
        }
        return $found
    }

    $withoutCode = Remove-AdrFencedCodeBlocks -Text $Text
    $withInlineCode = Remove-AdrFencedCodeBlocks -Text $Text -PreserveInlineCode

    # Sections read from the fully sanitized body.
    $componentsBody = Get-AdrH2Section -Text $withoutCode -HeadingText 'Affected Components'
    $driversBody = Get-AdrH2Section -Text $withoutCode -HeadingText 'Decision Drivers'
    $outcomeBody = Get-AdrH2Section -Text $withoutCode -HeadingText 'Decision Outcome'
    $risksBody = Get-AdrH2Section -Text $withoutCode -HeadingText 'Risks and Mitigations'
    $consequencesBody = & $resolveSection $withoutCode 'Consequences'
    $confirmationBody = & $resolveSection $withoutCode 'Confirmation'

    # Sections read from the body that still carries inline code spans; these
    # only feed path-token extraction.
    $contextInline = Get-AdrH2Section -Text $withInlineCode -HeadingText 'Context'
    $moreInfoInline = Get-AdrH2Section -Text $withInlineCode -HeadingText 'More Information'
    $confirmationInline = & $resolveSection $withInlineCode 'Confirmation'

    return [pscustomobject]@{
        AffectedComponents           = Get-AdrBulletItems -SectionText $componentsBody
        DecisionDrivers              = Get-AdrBulletItems -SectionText $driversBody
        DecisionOutcomeMatrixDrivers = Get-AdrTableRows -SectionText $outcomeBody
        BadConsequences              = Get-AdrBadConsequenceBullets -ConsequencesText $consequencesBody
        RisksAndMitigationsRisks     = Get-AdrTableRows -SectionText $risksBody
        Confirmation                 = $confirmationBody
        ContextPathTokens            = Get-AdrPathTokens -SectionText $contextInline
        MoreInformationPathTokens    = Get-AdrPathTokens -SectionText $moreInfoInline
        ConfirmationPathTokens       = Get-AdrPathTokens -SectionText $confirmationInline
    }
}
462
#endregion Public API
464
# Only the section parser and the code-stripping helper are exported; the
# remaining functions stay private to the module.
Export-ModuleMember -Function 'Get-AdrBodySections', 'Remove-AdrFencedCodeBlocks'