microsoft/qdk
Public · mirrored from https://github.com/microsoft/qdk · Available
compiler/qsc_data_structures/src/line_column/tests.rs
398 lines · mode: code
| 1 | // Copyright (c) Microsoft Corporation. |
| 2 | // Licensed under the MIT License. |
| 3 | |
| 4 | use crate::span::Span; |
| 5 | |
| 6 | use super::{Encoding, Position, Range}; |
| 7 | use expect_test::expect; |
| 8 | use std::fmt::Write; |
| 9 | |
// Empty contents: the only offset, 0, is expected to map to line 0, column 0.
| 10 | #[test] |
| 11 | fn empty_string() { |
| 12 | let contents = ""; |
| 13 | let pos = Position::from_utf8_byte_offset(Encoding::Utf8, contents, 0); |
| 14 | expect![[r" |
| 15 | Position { |
| 16 | line: 0, |
| 17 | column: 0, |
| 18 | } |
| 19 | "]] |
| 20 | .assert_debug_eq(&pos); |
| 21 | } |
| 22 | |
// A byte offset past the end of the contents (10 into a 5-byte string) is
// expected to map to the end-of-file position rather than fail.
| 23 | #[test] |
| 24 | fn offset_out_of_bounds() { |
| 25 | let contents = "hello"; |
| 26 | let pos = Position::from_utf8_byte_offset(Encoding::Utf8, contents, 10); |
| 27 | // Should return the <eof> position |
| 28 | expect![[r" |
| 29 | Position { |
| 30 | line: 0, |
| 31 | column: 5, |
| 32 | } |
| 33 | "]] |
| 34 | .assert_debug_eq(&pos); |
| 35 | } |
| 36 | |
// A position whose line and column both lie beyond the contents is expected
// to map to the end-of-file byte offset.
#[test]
fn position_out_of_bounds() {
    let contents = "hello";
    // A position that is out of range for the given string
    let pos = Position {
        line: 10,
        column: 10,
    };
    let offset = pos.to_utf8_byte_offset(Encoding::Utf8, contents);
    // Checked conversion instead of a truncating `as` cast, so no lint
    // suppression is needed.
    let eof = u32::try_from(contents.len()).expect("length should fit in u32");
    // Should return the <eof> offset; `assert_eq!` reports both values on failure.
    assert_eq!(offset, eof);
}
| 50 | |
// Single line of ASCII text: every offset is expected on line 0, with the
// column equal to the byte offset in both encodings.
| 51 | #[test] |
| 52 | fn one_line() { |
| 53 | let contents = "Hello, world!"; |
| 54 | check_all_offsets( |
| 55 | contents, |
| 56 | &expect![[r" |
| 57 | byte | utf-8 | utf-16 | char |
| 58 | 0 | 0, 0 | 0, 0 | 'H' |
| 59 | 1 | 0, 1 | 0, 1 | 'e' |
| 60 | 2 | 0, 2 | 0, 2 | 'l' |
| 61 | 3 | 0, 3 | 0, 3 | 'l' |
| 62 | 4 | 0, 4 | 0, 4 | 'o' |
| 63 | 5 | 0, 5 | 0, 5 | ',' |
| 64 | 6 | 0, 6 | 0, 6 | ' ' |
| 65 | 7 | 0, 7 | 0, 7 | 'w' |
| 66 | 8 | 0, 8 | 0, 8 | 'o' |
| 67 | 9 | 0, 9 | 0, 9 | 'r' |
| 68 | 10 | 0,10 | 0,10 | 'l' |
| 69 | 11 | 0,11 | 0,11 | 'd' |
| 70 | 12 | 0,12 | 0,12 | '!' |
| 71 | 13 | 0,13 | 0,13 | <eof> |
| 72 | "]], |
| 73 | ); |
| 74 | } |
| 75 | |
// Three lines separated by line feeds: each line feed is expected to be
// reported at the end of the line it terminates, and the column restarts at 0
// on the character that follows it.
| 76 | #[test] |
| 77 | fn lines() { |
| 78 | let contents = "line1\nline2\nline3"; |
| 79 | check_all_offsets( |
| 80 | contents, |
| 81 | &expect![[r" |
| 82 | byte | utf-8 | utf-16 | char |
| 83 | 0 | 0, 0 | 0, 0 | 'l' |
| 84 | 1 | 0, 1 | 0, 1 | 'i' |
| 85 | 2 | 0, 2 | 0, 2 | 'n' |
| 86 | 3 | 0, 3 | 0, 3 | 'e' |
| 87 | 4 | 0, 4 | 0, 4 | '1' |
| 88 | 5 | 0, 5 | 0, 5 | '\n' |
| 89 | 6 | 1, 0 | 1, 0 | 'l' |
| 90 | 7 | 1, 1 | 1, 1 | 'i' |
| 91 | 8 | 1, 2 | 1, 2 | 'n' |
| 92 | 9 | 1, 3 | 1, 3 | 'e' |
| 93 | 10 | 1, 4 | 1, 4 | '2' |
| 94 | 11 | 1, 5 | 1, 5 | '\n' |
| 95 | 12 | 2, 0 | 2, 0 | 'l' |
| 96 | 13 | 2, 1 | 2, 1 | 'i' |
| 97 | 14 | 2, 2 | 2, 2 | 'n' |
| 98 | 15 | 2, 3 | 2, 3 | 'e' |
| 99 | 16 | 2, 4 | 2, 4 | '3' |
| 100 | 17 | 2, 5 | 2, 5 | <eof> |
| 101 | "]], |
| 102 | ); |
| 103 | } |
| 104 | |
// Contents ending in a line feed: the <eof> offset is expected on a fresh
// line 1 at column 0.
| 105 | #[test] |
| 106 | fn newline_at_end() { |
| 107 | let contents = "Hello, world!\n"; |
| 108 | check_all_offsets( |
| 109 | contents, |
| 110 | &expect![[r" |
| 111 | byte | utf-8 | utf-16 | char |
| 112 | 0 | 0, 0 | 0, 0 | 'H' |
| 113 | 1 | 0, 1 | 0, 1 | 'e' |
| 114 | 2 | 0, 2 | 0, 2 | 'l' |
| 115 | 3 | 0, 3 | 0, 3 | 'l' |
| 116 | 4 | 0, 4 | 0, 4 | 'o' |
| 117 | 5 | 0, 5 | 0, 5 | ',' |
| 118 | 6 | 0, 6 | 0, 6 | ' ' |
| 119 | 7 | 0, 7 | 0, 7 | 'w' |
| 120 | 8 | 0, 8 | 0, 8 | 'o' |
| 121 | 9 | 0, 9 | 0, 9 | 'r' |
| 122 | 10 | 0,10 | 0,10 | 'l' |
| 123 | 11 | 0,11 | 0,11 | 'd' |
| 124 | 12 | 0,12 | 0,12 | '!' |
| 125 | 13 | 0,13 | 0,13 | '\n' |
| 126 | 14 | 1, 0 | 1, 0 | <eof> |
| 127 | "]], |
| 128 | ); |
| 129 | } |
| 130 | |
// CRLF line endings: the carriage return and the line feed are each expected
// to occupy their own column on the line they end, and the next line starts
// only after the line feed.
| 131 | #[test] |
| 132 | fn windows_crlf_line_breaks() { |
| 133 | let contents = "line1\r\nline2\r\n"; |
| 134 | check_all_offsets( |
| 135 | contents, |
| 136 | &expect![[r" |
| 137 | byte | utf-8 | utf-16 | char |
| 138 | 0 | 0, 0 | 0, 0 | 'l' |
| 139 | 1 | 0, 1 | 0, 1 | 'i' |
| 140 | 2 | 0, 2 | 0, 2 | 'n' |
| 141 | 3 | 0, 3 | 0, 3 | 'e' |
| 142 | 4 | 0, 4 | 0, 4 | '1' |
| 143 | 5 | 0, 5 | 0, 5 | '\r' |
| 144 | 6 | 0, 6 | 0, 6 | '\n' |
| 145 | 7 | 1, 0 | 1, 0 | 'l' |
| 146 | 8 | 1, 1 | 1, 1 | 'i' |
| 147 | 9 | 1, 2 | 1, 2 | 'n' |
| 148 | 10 | 1, 3 | 1, 3 | 'e' |
| 149 | 11 | 1, 4 | 1, 4 | '2' |
| 150 | 12 | 1, 5 | 1, 5 | '\r' |
| 151 | 13 | 1, 6 | 1, 6 | '\n' |
| 152 | 14 | 2, 0 | 2, 0 | <eof> |
| 153 | "]], |
| 154 | ); |
| 155 | } |
| 156 | |
// Two-byte utf-8 characters: the utf-8 column is expected to advance by 2 per
// character and the utf-16 column by 1. A byte offset in the middle of a
// character is expected to report the same position as the next character
// boundary.
| 157 | #[test] |
| 158 | fn utf_8_multibyte() { |
| 159 | // utf-8 encoding has multi-unit characters, utf-16 does not |
| 160 | // string | ççç |
| 161 | // chars | ç ç ç |
| 162 | // code points | e7 e7 e7 |
| 163 | // utf-8 units | c3a7 c3a7 c3a7 |
| 164 | // utf-16 units | 00e7 00e7 00e7 |
| 165 | let contents = "ççç\nççç"; |
| 166 | check_all_offsets( |
| 167 | contents, |
| 168 | &expect![[r" |
| 169 | byte | utf-8 | utf-16 | char |
| 170 | 0 | 0, 0 | 0, 0 | 'ç' |
| 171 | 1 | 0, 2 | 0, 1 | |
| 172 | 2 | 0, 2 | 0, 1 | 'ç' |
| 173 | 3 | 0, 4 | 0, 2 | |
| 174 | 4 | 0, 4 | 0, 2 | 'ç' |
| 175 | 5 | 0, 6 | 0, 3 | |
| 176 | 6 | 0, 6 | 0, 3 | '\n' |
| 177 | 7 | 1, 0 | 1, 0 | 'ç' |
| 178 | 8 | 1, 2 | 1, 1 | |
| 179 | 9 | 1, 2 | 1, 1 | 'ç' |
| 180 | 10 | 1, 4 | 1, 2 | |
| 181 | 11 | 1, 4 | 1, 2 | 'ç' |
| 182 | 12 | 1, 6 | 1, 3 | |
| 183 | 13 | 1, 6 | 1, 3 | <eof> |
| 184 | "]], |
| 185 | ); |
| 186 | } |
| 187 | |
// Characters outside the basic multilingual plane: each one takes four utf-8
// units and a utf-16 surrogate pair, so the columns are expected to advance by
// 4 and 2 respectively.
| 188 | #[test] |
| 189 | fn utf_8_multibyte_utf_16_surrogate() { |
| 190 | // both encodings have multi-unit characters |
| 191 | // string | 𝑓𝑓 |
| 192 | // chars | 𝑓 𝑓 |
| 193 | // code points | 1d453 1d453 |
| 194 | // utf-8 units | f09d9193 f09d9193 |
| 195 | // utf-16 units | d835 dc53 d835 dc53 |
| 196 | |
| 197 | let contents = "𝑓𝑓\n𝑓𝑓"; |
| 198 | check_all_offsets( |
| 199 | contents, |
| 200 | &expect![[r" |
| 201 | byte | utf-8 | utf-16 | char |
| 202 | 0 | 0, 0 | 0, 0 | '𝑓' |
| 203 | 1 | 0, 4 | 0, 2 | |
| 204 | 2 | 0, 4 | 0, 2 | |
| 205 | 3 | 0, 4 | 0, 2 | |
| 206 | 4 | 0, 4 | 0, 2 | '𝑓' |
| 207 | 5 | 0, 8 | 0, 4 | |
| 208 | 6 | 0, 8 | 0, 4 | |
| 209 | 7 | 0, 8 | 0, 4 | |
| 210 | 8 | 0, 8 | 0, 4 | '\n' |
| 211 | 9 | 1, 0 | 1, 0 | '𝑓' |
| 212 | 10 | 1, 4 | 1, 2 | |
| 213 | 11 | 1, 4 | 1, 2 | |
| 214 | 12 | 1, 4 | 1, 2 | |
| 215 | 13 | 1, 4 | 1, 2 | '𝑓' |
| 216 | 14 | 1, 8 | 1, 4 | |
| 217 | 15 | 1, 8 | 1, 4 | |
| 218 | 16 | 1, 8 | 1, 4 | |
| 219 | 17 | 1, 8 | 1, 4 | <eof> |
| 220 | "]], |
| 221 | ); |
| 222 | } |
| 223 | |
// Columns are expected to count encoding units per char, not per grapheme
// cluster: the combining mark U+20D7 gets its own row and column instead of
// being merged with the character it modifies.
| 224 | #[test] |
| 225 | fn grapheme_clusters() { |
| 226 | // grapheme clusters, both encodings have multi-unit characters |
| 227 | // string | 𝑓(𝑥⃗) ≔ Σᵢ 𝑥ᵢ 𝑟ᵢ |
| 228 | // chars | 𝑓 ( 𝑥 ⃗ ) ≔ Σ ᵢ 𝑥 ᵢ 𝑟 ᵢ |
| 229 | // code points | 1d453 28 1d465 20d7 29 20 2254 20 3a3 1d62 20 1d465 1d62 20 1d45f 1d62 |
| 230 | // utf-8 units | f09d9193 28 f09d91a5 e28397 29 20 e28994 20 cea3 e1b5a2 20 f09d91a5 e1b5a2 20 f09d919f e1b5a2 |
| 231 | // utf-16 units | d835 dc53 0028 d835 dc65 20d7 0029 0020 2254 0020 03a3 1d62 0020 d835 dc65 1d62 0020 d835 dc5f 1d62 |
| 232 | |
| 233 | let contents = "𝑓(𝑥⃗) ≔ Σᵢ 𝑥ᵢ 𝑟ᵢ"; |
| 234 | check_all_offsets( |
| 235 | contents, |
| 236 | &expect![[r" |
| 237 | byte | utf-8 | utf-16 | char |
| 238 | 0 | 0, 0 | 0, 0 | '𝑓' |
| 239 | 1 | 0, 4 | 0, 2 | |
| 240 | 2 | 0, 4 | 0, 2 | |
| 241 | 3 | 0, 4 | 0, 2 | |
| 242 | 4 | 0, 4 | 0, 2 | '(' |
| 243 | 5 | 0, 5 | 0, 3 | '𝑥' |
| 244 | 6 | 0, 9 | 0, 5 | |
| 245 | 7 | 0, 9 | 0, 5 | |
| 246 | 8 | 0, 9 | 0, 5 | |
| 247 | 9 | 0, 9 | 0, 5 | '\u{20d7}' |
| 248 | 10 | 0,12 | 0, 6 | |
| 249 | 11 | 0,12 | 0, 6 | |
| 250 | 12 | 0,12 | 0, 6 | ')' |
| 251 | 13 | 0,13 | 0, 7 | ' ' |
| 252 | 14 | 0,14 | 0, 8 | '≔' |
| 253 | 15 | 0,17 | 0, 9 | |
| 254 | 16 | 0,17 | 0, 9 | |
| 255 | 17 | 0,17 | 0, 9 | ' ' |
| 256 | 18 | 0,18 | 0,10 | 'Σ' |
| 257 | 19 | 0,20 | 0,11 | |
| 258 | 20 | 0,20 | 0,11 | 'ᵢ' |
| 259 | 21 | 0,23 | 0,12 | |
| 260 | 22 | 0,23 | 0,12 | |
| 261 | 23 | 0,23 | 0,12 | ' ' |
| 262 | 24 | 0,24 | 0,13 | '𝑥' |
| 263 | 25 | 0,28 | 0,15 | |
| 264 | 26 | 0,28 | 0,15 | |
| 265 | 27 | 0,28 | 0,15 | |
| 266 | 28 | 0,28 | 0,15 | 'ᵢ' |
| 267 | 29 | 0,31 | 0,16 | |
| 268 | 30 | 0,31 | 0,16 | |
| 269 | 31 | 0,31 | 0,16 | ' ' |
| 270 | 32 | 0,32 | 0,17 | '𝑟' |
| 271 | 33 | 0,36 | 0,19 | |
| 272 | 34 | 0,36 | 0,19 | |
| 273 | 35 | 0,36 | 0,19 | |
| 274 | 36 | 0,36 | 0,19 | 'ᵢ' |
| 275 | 37 | 0,39 | 0,20 | |
| 276 | 38 | 0,39 | 0,20 | |
| 277 | 39 | 0,39 | 0,20 | <eof> |
| 278 | "]], |
| 279 | ); |
| 280 | } |
| 281 | |
// A zero-length span (lo equal to hi) is expected to produce a range whose
// start and end are the same position.
| 282 | #[test] |
| 283 | fn empty_range() { |
| 284 | let contents = "hello"; |
| 285 | let span = Span { lo: 1, hi: 1 }; |
| 286 | let range = Range::from_span(Encoding::Utf8, contents, &span); |
| 287 | expect![[r" |
| 288 | Range { |
| 289 | start: Position { |
| 290 | line: 0, |
| 291 | column: 1, |
| 292 | }, |
| 293 | end: Position { |
| 294 | line: 0, |
| 295 | column: 1, |
| 296 | }, |
| 297 | } |
| 298 | "]] |
| 299 | .assert_debug_eq(&range); |
| 300 | } |
| 301 | |
// A span that starts on the first line and ends on the second is expected to
// yield a range from line 0, column 0 to line 1, column 4.
| 302 | #[test] |
| 303 | fn range_across_lines() { |
| 304 | let contents = "line1\nline2"; |
| 305 | let span = Span { lo: 0, hi: 10 }; |
| 306 | let range = Range::from_span(Encoding::Utf8, contents, &span); |
| 307 | expect![[r" |
| 308 | Range { |
| 309 | start: Position { |
| 310 | line: 0, |
| 311 | column: 0, |
| 312 | }, |
| 313 | end: Position { |
| 314 | line: 1, |
| 315 | column: 4, |
| 316 | }, |
| 317 | } |
| 318 | "]] |
| 319 | .assert_debug_eq(&range); |
| 320 | } |
| 321 | |
// A span lying entirely past the end of the contents is expected to produce a
// range whose start and end both sit at the end-of-file position
// (line 0, column 5).
| 322 | #[test] |
| 323 | fn range_out_of_bounds() { |
| 324 | let contents = "hello"; |
| 325 | let span = Span { lo: 6, hi: 10 }; |
| 326 | let range = Range::from_span(Encoding::Utf8, contents, &span); |
| 327 | expect![[r" |
| 328 | Range { |
| 329 | start: Position { |
| 330 | line: 0, |
| 331 | column: 5, |
| 332 | }, |
| 333 | end: Position { |
| 334 | line: 0, |
| 335 | column: 5, |
| 336 | }, |
| 337 | } |
| 338 | "]] |
| 339 | .assert_debug_eq(&range); |
| 340 | } |
| 341 | |
| 342 | #[allow(clippy::cast_possible_truncation)] |
| 343 | fn check_all_offsets(contents: &str, expected: &expect_test::Expect) { |
| 344 | let byte_offsets = 0..=contents.len(); |
| 345 | let positions = byte_offsets |
| 346 | .map(|offset| { |
| 347 | ( |
| 348 | offset, |
| 349 | Position::from_utf8_byte_offset( |
| 350 | Encoding::Utf8, |
| 351 | contents, |
| 352 | u32::try_from(offset).expect("offset should fit in u32"), |
| 353 | ), |
| 354 | Position::from_utf8_byte_offset( |
| 355 | Encoding::Utf16, |
| 356 | contents, |
| 357 | u32::try_from(offset).expect("offset should fit in u32"), |
| 358 | ), |
| 359 | ) |
| 360 | }) |
| 361 | .collect::<Vec<_>>(); |
| 362 | |
| 363 | // Generate a table for visual validation |
| 364 | let mut string = String::new(); |
| 365 | let _ = writeln!(string, "byte | utf-8 | utf-16 | char"); |
| 366 | for (offset, utf8pos, utf16pos) in &positions { |
| 367 | let char = if *offset == contents.len() { |
| 368 | " <eof>".to_string() |
| 369 | } else { |
| 370 | contents |
| 371 | .char_indices() |
| 372 | .find_map(|(i, c)| { |
| 373 | if i == *offset { |
| 374 | Some(format!(" {c:?}")) |
| 375 | } else { |
| 376 | None |
| 377 | } |
| 378 | }) |
| 379 | .unwrap_or(String::new()) |
| 380 | }; |
| 381 | |
| 382 | let _ = writeln!( |
| 383 | string, |
| 384 | "{offset: >4} | {: >2},{: >2} | {: >2},{: >2} |{}", |
| 385 | utf8pos.line, utf8pos.column, utf16pos.line, utf16pos.column, char |
| 386 | ); |
| 387 | } |
| 388 | |
| 389 | expected.assert_eq(&string); |
| 390 | |
| 391 | // also validate that we correctly map back to the original utf-8 byte offset |
| 392 | for (offset, utf8pos, utf16pos) in positions { |
| 393 | if contents.is_char_boundary(offset) { |
| 394 | assert!(utf8pos.to_utf8_byte_offset(Encoding::Utf8, contents) == offset as u32); |
| 395 | assert!(utf16pos.to_utf8_byte_offset(Encoding::Utf16, contents) == offset as u32); |
| 396 | } |
| 397 | } |
| 398 | } |
| 399 | |