microsoft/qdk
Publicmirrored fromhttps://github.com/microsoft/qdkAvailable
compiler/qsc_data_structures/src/line_column.rs
209lines · modecode
| 1 | // Copyright (c) Microsoft Corporation. |
| 2 | // Licensed under the MIT License. |
| 3 | |
| 4 | #[cfg(test)] |
| 5 | mod tests; |
| 6 | |
| 7 | use crate::span::Span; |
| 8 | |
| 9 | /// A line and column pair that describes a position in a string. |
| 10 | #[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)] |
| 11 | pub struct Position { |
| 12 | /// Line offset. |
| 13 | pub line: u32, |
| 14 | /// Column offset. |
| 15 | /// When created using [`Encoding::Utf8`], this is the byte offset. |
| 16 | /// When created using [`Encoding::Utf16`], this is the code unit (word) offset. |
| 17 | pub column: u32, |
| 18 | } |
| 19 | |
| 20 | /// The encoding to use when creating a [`Position`] from a source string and offset. |
| 21 | /// |
| 22 | /// For reference, when **indexing directly** into a string: |
| 23 | /// Rust uses UTF-8 byte offsets (only available through conversion into a byte array) |
| 24 | /// JavaScript uses UTF-16 code unit offsets |
| 25 | /// Python uses character offsets |
| 26 | /// |
| 27 | /// Additionally, all these languages provide alternate ways of iterating over |
| 28 | /// strings, which sometimes use code units and sometimes use characters, |
| 29 | /// confusing our understanding of what "nth character" means. |
| 30 | /// |
| 31 | /// Therefore it's important to be aware of exactly how this column offset will be treated |
| 32 | /// by the code that's using it. e.g. in LSP (language server protocol) and |
| 33 | /// DAP (debug adapter protocol) it's explicitly defined to use UTF-16 code units. |
| 34 | #[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)] |
| 35 | pub enum Encoding { |
| 36 | Utf8, |
| 37 | Utf16, |
| 38 | } |
| 39 | |
| 40 | impl Position { |
| 41 | /// For a given string and utf-8 byte offset, returns a [`Position`] |
| 42 | /// that corresponds to that offset. |
| 43 | /// |
| 44 | /// The column information is expressed in code units, depending on the passed in `encoding`. |
| 45 | /// [`Encoding::Utf8`] will use byte offsets, and [`Encoding::Utf16`] will use |
| 46 | /// utf-16 code unit (word) offsets. |
| 47 | /// |
| 48 | /// If the given offset is past the end of the string, returns the position of |
| 49 | /// the end of the string (e.g. for "hello" the end of the string is line=0, column=5). |
| 50 | /// |
| 51 | /// Note that this function does not validate whether the passed in offset |
| 52 | /// is a valid utf8 char boundary. If an invalid offset is passed in, |
| 53 | /// the next char boundary will be returned. |
| 54 | #[must_use] |
| 55 | pub fn from_utf8_byte_offset( |
| 56 | encoding: Encoding, |
| 57 | contents: &str, |
| 58 | utf8_byte_offset: u32, |
| 59 | ) -> Self { |
| 60 | positions_from_utf8_byte_offsets(encoding, contents, [utf8_byte_offset])[0] |
| 61 | } |
| 62 | |
| 63 | /// For a given string, returns the utf-8 byte offset that corresponds |
| 64 | /// to this [`Position`] in that string. |
| 65 | /// |
| 66 | /// The column information in the position is interpreted based on the |
| 67 | /// passed in `encoding`. [`Encoding::Utf8`] will treat the column information |
| 68 | /// as byte offsets, and [`Encoding::Utf16`] will use utf-16 code unit (word) offsets. |
| 69 | /// |
| 70 | /// If the position is past the end of the string, returns the offset of |
| 71 | /// the end of the string (e.g. for "hello" returns 5). |
| 72 | #[must_use] |
| 73 | pub fn to_utf8_byte_offset(&self, encoding: Encoding, contents: &str) -> u32 { |
| 74 | let mut column: u32 = 0; |
| 75 | let mut line: u32 = 0; |
| 76 | |
| 77 | for (byte_offset, c) in contents.char_indices() { |
| 78 | if c == '\n' { |
| 79 | line += 1; |
| 80 | column = 0; |
| 81 | } else { |
| 82 | column += num_code_units(encoding, c); |
| 83 | } |
| 84 | |
| 85 | if line > self.line || (line == self.line && column > self.column) { |
| 86 | // We moved past the target line+column |
| 87 | return u32(byte_offset); |
| 88 | } |
| 89 | } |
| 90 | |
| 91 | // return eof if we move past the end of the string |
| 92 | u32(contents.len()) |
| 93 | } |
| 94 | } |
| 95 | |
| 96 | /// Same as a [`Span`], but represented with [`Position`]s instead of offsets. |
| 97 | #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)] |
| 98 | pub struct Range { |
| 99 | pub start: Position, |
| 100 | pub end: Position, |
| 101 | } |
| 102 | |
| 103 | impl Range { |
| 104 | /// For a given string and a span expressed in utf-8 byte offsets, |
| 105 | /// returns a [`Range`] that corresponds to that span. |
| 106 | /// |
| 107 | /// Any offsets that go past the end of the string are mapped to the end of the string. |
| 108 | /// (e.g. for "hello", the span (0-10) would map to (0,0-0,5)). |
| 109 | /// |
| 110 | /// This function does not validate span range. If `span.hi > span.lo`, the |
| 111 | /// end offset is simply mapped to the end of the string. |
| 112 | /// (e.g. for "hello", the invalid span (3-0) would map to (0,3-0,5)). |
| 113 | /// |
| 114 | /// This function also does not validate whether the passed in offset |
| 115 | /// is a valid utf8 char boundary. If an invalid offset is passed in, |
| 116 | /// the next char boundary will be returned. |
| 117 | #[must_use] |
| 118 | pub fn from_span(encoding: Encoding, contents: &str, span: &Span) -> Self { |
| 119 | let [start, end] = positions_from_utf8_byte_offsets(encoding, contents, [span.lo, span.hi]); |
| 120 | Self { start, end } |
| 121 | } |
| 122 | |
| 123 | #[must_use] |
| 124 | pub fn empty(&self) -> bool { |
| 125 | self.start == self.end |
| 126 | } |
| 127 | } |
| 128 | |
| 129 | /// For a given string and array of utf-8 byte offsets, returns the [`Position`]s |
| 130 | /// corresponding to the byte offsets. |
| 131 | /// |
| 132 | /// Since this function iterates over the string once, the array *MUST* be in sorted order. |
| 133 | /// |
| 134 | /// This function takes a (const sized) array to avoid allocations. |
| 135 | fn positions_from_utf8_byte_offsets<const N: usize>( |
| 136 | encoding: Encoding, |
| 137 | contents: &str, |
| 138 | sorted_utf8_byte_offsets: [u32; N], |
| 139 | ) -> [Position; N] { |
| 140 | // The below example contains characters that are encoded |
| 141 | // with different numbers of code units in utf-8 and utf-16, |
| 142 | // to demonstrate how code unit offset will differ depending |
| 143 | // on the chosen encoding. Characters can be encoded with: |
| 144 | // One code unit (byte) in utf-8, one code unit (word) in utf-16 |
| 145 | // Multiple code units in utf-8, one code unit in utf-16 |
| 146 | // Multiple code units in utf-8, surrogate pair in utf-16 |
| 147 | |
| 148 | // chars | 𝑓 ( 𝑥 ⃗ ) Σ <eof> |
| 149 | // unicode code point | 1d453 28 1d465 20d7 29 3a3 |
| 150 | // utf-8 units (bytes) | f09d9193 28 f09d91a5 e28397 29 cea3 |
| 151 | // utf-16 units | d835 dc53 0028 d835 dc65 20d7 0029 03a3 |
| 152 | // char offset | 0 1 2 3 4 5 6 |
| 153 | // utf-8 byte offset | 0 4 5 9 12 13 15 |
| 154 | // utf-16 code unit offset | 0 2 3 5 6 7 8 |
| 155 | |
| 156 | let mut positions = [Position { line: 0, column: 0 }; N]; |
| 157 | let mut i = 0; |
| 158 | let mut column: u32 = 0; |
| 159 | let mut line: u32 = 0; |
| 160 | |
| 161 | for (char_index, c) in contents.char_indices() { |
| 162 | if i == N { |
| 163 | // We've run out of offsets to look for |
| 164 | break; |
| 165 | } |
| 166 | |
| 167 | // We've moved past the next requested offset |
| 168 | while char_index >= sorted_utf8_byte_offsets[i] as usize { |
| 169 | positions[i] = Position { line, column }; |
| 170 | i += 1; |
| 171 | |
| 172 | if i == N { |
| 173 | // We've run out of offsets to look for |
| 174 | break; |
| 175 | } |
| 176 | } |
| 177 | |
| 178 | // Windows (\r\n) line endings will be handled |
| 179 | // fine here, with the \r counting as an extra char |
| 180 | // at the end of the line. |
| 181 | if c == '\n' { |
| 182 | line += 1; |
| 183 | column = 0; |
| 184 | } else { |
| 185 | column += num_code_units(encoding, c); |
| 186 | } |
| 187 | } |
| 188 | |
| 189 | // If any offsets couldn't be mapped, map them to <eof> |
| 190 | while i < N { |
| 191 | positions[i] = Position { line, column }; |
| 192 | i += 1; |
| 193 | } |
| 194 | |
| 195 | positions |
| 196 | } |
| 197 | |
| 198 | fn num_code_units(encoding: Encoding, c: char) -> u32 { |
| 199 | match encoding { |
| 200 | Encoding::Utf8 => u32(c.len_utf8()), |
| 201 | Encoding::Utf16 => u32(c.len_utf16()), |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | fn u32(value: usize) -> u32 { |
| 206 | // Compiler uses u32 for offsets, so safe to assume all strings |
| 207 | // we deal with are shorter than u32::MAX. |
| 208 | u32::try_from(value).expect("value should fit in u32") |
| 209 | } |
| 210 | |