microsoft/qdk

Public

mirrored fromhttps://github.com/microsoft/qdkAvailable

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

cesarzc/ssa-panic

Find a branch or tag

Branches

cesarzc/ssa-panic

Clone

HTTPS

Download ZIP

qdk/compiler/qsc_data_structures/src

compiler/qsc_data_structures/src/line_column.rs

209lines · modecode

Raw Download

Latest commit unavailable.

unknown

1	`// Copyright (c) Microsoft Corporation.`
2	`// Licensed under the MIT License.`
3
4	`#[cfg(test)]`
5	`mod tests;`
6
7	`use crate::span::Span;`
8
9	`/// A line and column pair that describes a position in a string.`
10	`#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]`
11	`pub struct Position {`
12	`/// Line offset.`
13	`pub line: u32,`
14	`/// Column offset.`
15	/// When created using [`Encoding::Utf8`], this is the byte offset.
16	/// When created using [`Encoding::Utf16`], this is the code unit (word) offset.
17	`pub column: u32,`
18	`}`
19
20	/// The encoding to use when creating a [`Position`] from a source string and offset.
21	`///`
22	`/// For reference, when indexing directly into a string:`
23	`/// Rust uses UTF-8 byte offsets (only available through conversion into a byte array)`
24	`/// JavaScript uses UTF-16 code unit offsets`
25	`/// Python uses character offsets`
26	`///`
27	`/// Additionally, all these languages provide alternate ways of iterating over`
28	`/// strings, which sometimes use code units and sometimes use characters,`
29	`/// confusing our understanding of what "nth character" means.`
30	`///`
31	`/// Therefore it's important to be aware of exactly how this column offset will be treated`
32	`/// by the code that's using it. e.g. in LSP (language server protocol) and`
33	`/// DAP (debug adapter protocol) it's explicitly defined to use UTF-16 code units.`
34	`#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]`
35	`pub enum Encoding {`
36	`Utf8,`
37	`Utf16,`
38	`}`
39
40	`impl Position {`
41	/// For a given string and utf-8 byte offset, returns a [`Position`]
42	`/// that corresponds to that offset.`
43	`///`
44	/// The column information is expressed in code units, depending on the passed in `encoding`.
45	/// [`Encoding::Utf8`] will use byte offsets, and [`Encoding::Utf16`] will use
46	`/// utf-16 code unit (word) offsets.`
47	`///`
48	`/// If the given offset is past the end of the string, returns the position of`
49	`/// the end of the string (e.g. for "hello" the end of the string is line=0, column=5).`
50	`///`
51	`/// Note that this function does not validate whether the passed in offset`
52	`/// is a valid utf8 char boundary. If an invalid offset is passed in,`
53	`/// the next char boundary will be returned.`
54	`#[must_use]`
55	`pub fn from_utf8_byte_offset(`
56	`encoding: Encoding,`
57	`contents: &str,`
58	`utf8_byte_offset: u32,`
59	`) -> Self {`
60	`positions_from_utf8_byte_offsets(encoding, contents, [utf8_byte_offset])[0]`
61	`}`
62
63	`/// For a given string, returns the utf-8 byte offset that corresponds`
64	/// to this [`Position`] in that string.
65	`///`
66	`/// The column information in the position is interpreted based on the`
67	/// passed in `encoding`. [`Encoding::Utf8`] will treat the column information
68	/// as byte offsets, and [`Encoding::Utf16`] will use utf-16 code unit (word) offsets.
69	`///`
70	`/// If the position is past the end of the string, returns the offset of`
71	`/// the end of the string (e.g. for "hello" returns 5).`
72	`#[must_use]`
73	`pub fn to_utf8_byte_offset(&self, encoding: Encoding, contents: &str) -> u32 {`
74	`let mut column: u32 = 0;`
75	`let mut line: u32 = 0;`
76
77	`for (byte_offset, c) in contents.char_indices() {`
78	`if c == '\n' {`
79	`line += 1;`
80	`column = 0;`
81	`} else {`
82	`column += num_code_units(encoding, c);`
83	`}`
84
85	`if line > self.line \|\| (line == self.line && column > self.column) {`
86	`// We moved past the target line+column`
87	`return u32(byte_offset);`
88	`}`
89	`}`
90
91	`// return eof if we move past the end of the string`
92	`u32(contents.len())`
93	`}`
94	`}`
95
96	/// Same as a [`Span`], but represented with [`Position`]s instead of offsets.
97	`#[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]`
98	`pub struct Range {`
99	`pub start: Position,`
100	`pub end: Position,`
101	`}`
102
103	`impl Range {`
104	`/// For a given string and a span expressed in utf-8 byte offsets,`
105	/// returns a [`Range`] that corresponds to that span.
106	`///`
107	`/// Any offsets that go past the end of the string are mapped to the end of the string.`
108	`/// (e.g. for "hello", the span (0-10) would map to (0,0-0,5)).`
109	`///`
110	/// This function does not validate span range. If `span.hi > span.lo`, the
111	`/// end offset is simply mapped to the end of the string.`
112	`/// (e.g. for "hello", the invalid span (3-0) would map to (0,3-0,5)).`
113	`///`
114	`/// This function also does not validate whether the passed in offset`
115	`/// is a valid utf8 char boundary. If an invalid offset is passed in,`
116	`/// the next char boundary will be returned.`
117	`#[must_use]`
118	`pub fn from_span(encoding: Encoding, contents: &str, span: &Span) -> Self {`
119	`let [start, end] = positions_from_utf8_byte_offsets(encoding, contents, [span.lo, span.hi]);`
120	`Self { start, end }`
121	`}`
122
123	`#[must_use]`
124	`pub fn empty(&self) -> bool {`
125	`self.start == self.end`
126	`}`
127	`}`
128
129	/// For a given string and array of utf-8 byte offsets, returns the [`Position`]s
130	`/// corresponding to the byte offsets.`
131	`///`
132	`/// Since this function iterates over the string once, the array MUST be in sorted order.`
133	`///`
134	`/// This function takes a (const sized) array to avoid allocations.`
135	`fn positions_from_utf8_byte_offsets<const N: usize>(`
136	`encoding: Encoding,`
137	`contents: &str,`
138	`sorted_utf8_byte_offsets: [u32; N],`
139	`) -> [Position; N] {`
140	`// The below example contains characters that are encoded`
141	`// with different numbers of code units in utf-8 and utf-16,`
142	`// to demonstrate how code unit offset will differ depending`
143	`// on the chosen encoding. Characters can be encoded with:`
144	`// One code unit (byte) in utf-8, one code unit (word) in utf-16`
145	`// Multiple code units in utf-8, one code unit in utf-16`
146	`// Multiple code units in utf-8, surrogate pair in utf-16`
147
148	`// chars \| 𝑓 ( 𝑥 ⃗ ) Σ <eof>`
149	`// unicode code point \| 1d453 28 1d465 20d7 29 3a3`
150	`// utf-8 units (bytes) \| f09d9193 28 f09d91a5 e28397 29 cea3`
151	`// utf-16 units \| d835 dc53 0028 d835 dc65 20d7 0029 03a3`
152	`// char offset \| 0 1 2 3 4 5 6`
153	`// utf-8 byte offset \| 0 4 5 9 12 13 15`
154	`// utf-16 code unit offset \| 0 2 3 5 6 7 8`
155
156	`let mut positions = [Position { line: 0, column: 0 }; N];`
157	`let mut i = 0;`
158	`let mut column: u32 = 0;`
159	`let mut line: u32 = 0;`
160
161	`for (char_index, c) in contents.char_indices() {`
162	`if i == N {`
163	`// We've run out of offsets to look for`
164	`break;`
165	`}`
166
167	`// We've moved past the next requested offset`
168	`while char_index >= sorted_utf8_byte_offsets[i] as usize {`
169	`positions[i] = Position { line, column };`
170	`i += 1;`
171
172	`if i == N {`
173	`// We've run out of offsets to look for`
174	`break;`
175	`}`
176	`}`
177
178	`// Windows (\r\n) line endings will be handled`
179	`// fine here, with the \r counting as an extra char`
180	`// at the end of the line.`
181	`if c == '\n' {`
182	`line += 1;`
183	`column = 0;`
184	`} else {`
185	`column += num_code_units(encoding, c);`
186	`}`
187	`}`
188
189	`// If any offsets couldn't be mapped, map them to <eof>`
190	`while i < N {`
191	`positions[i] = Position { line, column };`
192	`i += 1;`
193	`}`
194
195	`positions`
196	`}`
197
198	`fn num_code_units(encoding: Encoding, c: char) -> u32 {`
199	`match encoding {`
200	`Encoding::Utf8 => u32(c.len_utf8()),`
201	`Encoding::Utf16 => u32(c.len_utf16()),`
202	`}`
203	`}`
204
205	`fn u32(value: usize) -> u32 {`
206	`// Compiler uses u32 for offsets, so safe to assume all strings`
207	`// we deal with are shorter than u32::MAX.`
208	`u32::try_from(value).expect("value should fit in u32")`
209	`}`
210

microsoft/qdk

Branches

Tags

Clone