// Copyright (c) Microsoft Corporation. // Licensed under the MIT License. #[cfg(test)] mod tests; use crate::span::Span; /// A line and column pair that describes a position in a string. #[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)] pub struct Position { /// Line offset. pub line: u32, /// Column offset. /// When created using [`Encoding::Utf8`], this is the byte offset. /// When created using [`Encoding::Utf16`], this is the code unit (word) offset. pub column: u32, } /// The encoding to use when creating a [`Position`] from a source string and offset. /// /// For reference, when **indexing directly** into a string: /// Rust uses UTF-8 byte offsets (only available through conversion into a byte array) /// JavaScript uses UTF-16 code unit offsets /// Python uses character offsets /// /// Additionally, all these languages provide alternate ways of iterating over /// strings, which sometimes use code units and sometimes use characters, /// confusing our understanding of what "nth character" means. /// /// Therefore it's important to be aware of exactly how this column offset will be treated /// by the code that's using it. e.g. in LSP (language server protocol) and /// DAP (debug adapter protocol) it's explicitly defined to use UTF-16 code units. #[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)] pub enum Encoding { Utf8, Utf16, } impl Position { /// For a given string and utf-8 byte offset, returns a [`Position`] /// that corresponds to that offset. /// /// The column information is expressed in code units, depending on the passed in `encoding`. /// [`Encoding::Utf8`] will use byte offsets, and [`Encoding::Utf16`] will use /// utf-16 code unit (word) offsets. /// /// If the given offset is past the end of the string, returns the position of /// the end of the string (e.g. for "hello" the end of the string is line=0, column=5). /// /// Note that this function does not validate whether the passed in offset /// is a valid utf8 char boundary. If an invalid offset is passed in, /// the next char boundary will be returned. #[must_use] pub fn from_utf8_byte_offset( encoding: Encoding, contents: &str, utf8_byte_offset: u32, ) -> Self { positions_from_utf8_byte_offsets(encoding, contents, [utf8_byte_offset])[0] } /// For a given string, returns the utf-8 byte offset that corresponds /// to this [`Position`] in that string. /// /// The column information in the position is interpreted based on the /// passed in `encoding`. [`Encoding::Utf8`] will treat the column information /// as byte offsets, and [`Encoding::Utf16`] will use utf-16 code unit (word) offsets. /// /// If the position is past the end of the string, returns the offset of /// the end of the string (e.g. for "hello" returns 5). #[must_use] pub fn to_utf8_byte_offset(&self, encoding: Encoding, contents: &str) -> u32 { let mut column: u32 = 0; let mut line: u32 = 0; for (byte_offset, c) in contents.char_indices() { if c == '\n' { line += 1; column = 0; } else { column += num_code_units(encoding, c); } if line > self.line || (line == self.line && column > self.column) { // We moved past the target line+column return u32(byte_offset); } } // return eof if we move past the end of the string u32(contents.len()) } } /// Same as a [`Span`], but represented with [`Position`]s instead of offsets. #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)] pub struct Range { pub start: Position, pub end: Position, } impl Range { /// For a given string and a span expressed in utf-8 byte offsets, /// returns a [`Range`] that corresponds to that span. /// /// Any offsets that go past the end of the string are mapped to the end of the string. /// (e.g. for "hello", the span (0-10) would map to (0,0-0,5)). /// /// This function does not validate span range. If `span.hi > span.lo`, the /// end offset is simply mapped to the end of the string. /// (e.g. for "hello", the invalid span (3-0) would map to (0,3-0,5)). /// /// This function also does not validate whether the passed in offset /// is a valid utf8 char boundary. If an invalid offset is passed in, /// the next char boundary will be returned. #[must_use] pub fn from_span(encoding: Encoding, contents: &str, span: &Span) -> Self { let [start, end] = positions_from_utf8_byte_offsets(encoding, contents, [span.lo, span.hi]); Self { start, end } } #[must_use] pub fn empty(&self) -> bool { self.start == self.end } } /// For a given string and array of utf-8 byte offsets, returns the [`Position`]s /// corresponding to the byte offsets. /// /// Since this function iterates over the string once, the array *MUST* be in sorted order. /// /// This function takes a (const sized) array to avoid allocations. fn positions_from_utf8_byte_offsets( encoding: Encoding, contents: &str, sorted_utf8_byte_offsets: [u32; N], ) -> [Position; N] { // The below example contains characters that are encoded // with different numbers of code units in utf-8 and utf-16, // to demonstrate how code unit offset will differ depending // on the chosen encoding. Characters can be encoded with: // One code unit (byte) in utf-8, one code unit (word) in utf-16 // Multiple code units in utf-8, one code unit in utf-16 // Multiple code units in utf-8, surrogate pair in utf-16 // chars | 𝑓 ( 𝑥 ⃗ ) Σ // unicode code point | 1d453 28 1d465 20d7 29 3a3 // utf-8 units (bytes) | f09d9193 28 f09d91a5 e28397 29 cea3 // utf-16 units | d835 dc53 0028 d835 dc65 20d7 0029 03a3 // char offset | 0 1 2 3 4 5 6 // utf-8 byte offset | 0 4 5 9 12 13 15 // utf-16 code unit offset | 0 2 3 5 6 7 8 let mut positions = [Position { line: 0, column: 0 }; N]; let mut i = 0; let mut column: u32 = 0; let mut line: u32 = 0; for (char_index, c) in contents.char_indices() { if i == N { // We've run out of offsets to look for break; } // We've moved past the next requested offset while char_index >= sorted_utf8_byte_offsets[i] as usize { positions[i] = Position { line, column }; i += 1; if i == N { // We've run out of offsets to look for break; } } // Windows (\r\n) line endings will be handled // fine here, with the \r counting as an extra char // at the end of the line. if c == '\n' { line += 1; column = 0; } else { column += num_code_units(encoding, c); } } // If any offsets couldn't be mapped, map them to while i < N { positions[i] = Position { line, column }; i += 1; } positions } fn num_code_units(encoding: Encoding, c: char) -> u32 { match encoding { Encoding::Utf8 => u32(c.len_utf8()), Encoding::Utf16 => u32(c.len_utf16()), } } fn u32(value: usize) -> u32 { // Compiler uses u32 for offsets, so safe to assume all strings // we deal with are shorter than u32::MAX. u32::try_from(value).expect("value should fit in u32") }