microsoft/qdk

Public

mirrored fromhttps://github.com/microsoft/qdkAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
cesarzc/ssa-panic

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

compiler/qsc_data_structures/src/line_column.rs

209lines · modecode

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4#[cfg(test)]
5mod tests;
6
7use crate::span::Span;
8
9/// A line and column pair that describes a position in a string.
10#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]
11pub struct Position {
12 /// Line offset.
13 pub line: u32,
14 /// Column offset.
15 /// When created using [`Encoding::Utf8`], this is the byte offset.
16 /// When created using [`Encoding::Utf16`], this is the code unit (word) offset.
17 pub column: u32,
18}
19
20/// The encoding to use when creating a [`Position`] from a source string and offset.
21///
22/// For reference, when **indexing directly** into a string:
23/// Rust uses UTF-8 byte offsets (only available through conversion into a byte array)
24/// JavaScript uses UTF-16 code unit offsets
25/// Python uses character offsets
26///
27/// Additionally, all these languages provide alternate ways of iterating over
28/// strings, which sometimes use code units and sometimes use characters,
29/// confusing our understanding of what "nth character" means.
30///
31/// Therefore it's important to be aware of exactly how this column offset will be treated
32/// by the code that's using it. e.g. in LSP (language server protocol) and
33/// DAP (debug adapter protocol) it's explicitly defined to use UTF-16 code units.
34#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]
35pub enum Encoding {
36 Utf8,
37 Utf16,
38}
39
40impl Position {
41 /// For a given string and utf-8 byte offset, returns a [`Position`]
42 /// that corresponds to that offset.
43 ///
44 /// The column information is expressed in code units, depending on the passed in `encoding`.
45 /// [`Encoding::Utf8`] will use byte offsets, and [`Encoding::Utf16`] will use
46 /// utf-16 code unit (word) offsets.
47 ///
48 /// If the given offset is past the end of the string, returns the position of
49 /// the end of the string (e.g. for "hello" the end of the string is line=0, column=5).
50 ///
51 /// Note that this function does not validate whether the passed in offset
52 /// is a valid utf8 char boundary. If an invalid offset is passed in,
53 /// the next char boundary will be returned.
54 #[must_use]
55 pub fn from_utf8_byte_offset(
56 encoding: Encoding,
57 contents: &str,
58 utf8_byte_offset: u32,
59 ) -> Self {
60 positions_from_utf8_byte_offsets(encoding, contents, [utf8_byte_offset])[0]
61 }
62
63 /// For a given string, returns the utf-8 byte offset that corresponds
64 /// to this [`Position`] in that string.
65 ///
66 /// The column information in the position is interpreted based on the
67 /// passed in `encoding`. [`Encoding::Utf8`] will treat the column information
68 /// as byte offsets, and [`Encoding::Utf16`] will use utf-16 code unit (word) offsets.
69 ///
70 /// If the position is past the end of the string, returns the offset of
71 /// the end of the string (e.g. for "hello" returns 5).
72 #[must_use]
73 pub fn to_utf8_byte_offset(&self, encoding: Encoding, contents: &str) -> u32 {
74 let mut column: u32 = 0;
75 let mut line: u32 = 0;
76
77 for (byte_offset, c) in contents.char_indices() {
78 if c == '\n' {
79 line += 1;
80 column = 0;
81 } else {
82 column += num_code_units(encoding, c);
83 }
84
85 if line > self.line || (line == self.line && column > self.column) {
86 // We moved past the target line+column
87 return u32(byte_offset);
88 }
89 }
90
91 // return eof if we move past the end of the string
92 u32(contents.len())
93 }
94}
95
96/// Same as a [`Span`], but represented with [`Position`]s instead of offsets.
97#[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
98pub struct Range {
99 pub start: Position,
100 pub end: Position,
101}
102
103impl Range {
104 /// For a given string and a span expressed in utf-8 byte offsets,
105 /// returns a [`Range`] that corresponds to that span.
106 ///
107 /// Any offsets that go past the end of the string are mapped to the end of the string.
108 /// (e.g. for "hello", the span (0-10) would map to (0,0-0,5)).
109 ///
110 /// This function does not validate span range. If `span.hi > span.lo`, the
111 /// end offset is simply mapped to the end of the string.
112 /// (e.g. for "hello", the invalid span (3-0) would map to (0,3-0,5)).
113 ///
114 /// This function also does not validate whether the passed in offset
115 /// is a valid utf8 char boundary. If an invalid offset is passed in,
116 /// the next char boundary will be returned.
117 #[must_use]
118 pub fn from_span(encoding: Encoding, contents: &str, span: &Span) -> Self {
119 let [start, end] = positions_from_utf8_byte_offsets(encoding, contents, [span.lo, span.hi]);
120 Self { start, end }
121 }
122
123 #[must_use]
124 pub fn empty(&self) -> bool {
125 self.start == self.end
126 }
127}
128
129/// For a given string and array of utf-8 byte offsets, returns the [`Position`]s
130/// corresponding to the byte offsets.
131///
132/// Since this function iterates over the string once, the array *MUST* be in sorted order.
133///
134/// This function takes a (const sized) array to avoid allocations.
135fn positions_from_utf8_byte_offsets<const N: usize>(
136 encoding: Encoding,
137 contents: &str,
138 sorted_utf8_byte_offsets: [u32; N],
139) -> [Position; N] {
140 // The below example contains characters that are encoded
141 // with different numbers of code units in utf-8 and utf-16,
142 // to demonstrate how code unit offset will differ depending
143 // on the chosen encoding. Characters can be encoded with:
144 // One code unit (byte) in utf-8, one code unit (word) in utf-16
145 // Multiple code units in utf-8, one code unit in utf-16
146 // Multiple code units in utf-8, surrogate pair in utf-16
147
148 // chars | 𝑓 ( 𝑥 ⃗ ) Σ <eof>
149 // unicode code point | 1d453 28 1d465 20d7 29 3a3
150 // utf-8 units (bytes) | f09d9193 28 f09d91a5 e28397 29 cea3
151 // utf-16 units | d835 dc53 0028 d835 dc65 20d7 0029 03a3
152 // char offset | 0 1 2 3 4 5 6
153 // utf-8 byte offset | 0 4 5 9 12 13 15
154 // utf-16 code unit offset | 0 2 3 5 6 7 8
155
156 let mut positions = [Position { line: 0, column: 0 }; N];
157 let mut i = 0;
158 let mut column: u32 = 0;
159 let mut line: u32 = 0;
160
161 for (char_index, c) in contents.char_indices() {
162 if i == N {
163 // We've run out of offsets to look for
164 break;
165 }
166
167 // We've moved past the next requested offset
168 while char_index >= sorted_utf8_byte_offsets[i] as usize {
169 positions[i] = Position { line, column };
170 i += 1;
171
172 if i == N {
173 // We've run out of offsets to look for
174 break;
175 }
176 }
177
178 // Windows (\r\n) line endings will be handled
179 // fine here, with the \r counting as an extra char
180 // at the end of the line.
181 if c == '\n' {
182 line += 1;
183 column = 0;
184 } else {
185 column += num_code_units(encoding, c);
186 }
187 }
188
189 // If any offsets couldn't be mapped, map them to <eof>
190 while i < N {
191 positions[i] = Position { line, column };
192 i += 1;
193 }
194
195 positions
196}
197
198fn num_code_units(encoding: Encoding, c: char) -> u32 {
199 match encoding {
200 Encoding::Utf8 => u32(c.len_utf8()),
201 Encoding::Utf16 => u32(c.len_utf16()),
202 }
203}
204
205fn u32(value: usize) -> u32 {
206 // Compiler uses u32 for offsets, so safe to assume all strings
207 // we deal with are shorter than u32::MAX.
208 u32::try_from(value).expect("value should fit in u32")
209}
210