microsoft/qdk
Public · mirrored from https://github.com/microsoft/qdk · Available
compiler/qsc_parse/src/lex/raw.rs
432 lines · mode: code
| 1 | // Copyright (c) Microsoft Corporation. |
| 2 | // Licensed under the MIT License. |
| 3 | |
| 4 | //! The first lexing phase transforms an input string into literals, single-character operators, |
| 5 | //! whitespace, and comments. Keywords are treated as identifiers. The raw token stream is |
| 6 | //! contiguous: there are no gaps between tokens. |
| 7 | //! |
| 8 | //! These are "raw" tokens because single-character operators don't always correspond to Q# |
| 9 | //! operators, and whitespace and comments will later be discarded. Raw tokens are the ingredients |
| 10 | //! that are "cooked" into compound tokens before they can be consumed by the parser. |
| 11 | //! |
| 12 | //! Tokens never contain substrings from the original input, but are simply labels that refer back |
| 13 | //! to offsets in the input. Lexing never fails, but may produce unknown tokens. |
| 14 | |
| 15 | #[cfg(test)] |
| 16 | mod tests; |
| 17 | |
| 18 | use super::{Delim, InterpolatedEnding, InterpolatedStart, Radix}; |
| 19 | use enum_iterator::Sequence; |
| 20 | use std::{ |
| 21 | fmt::{self, Display, Formatter, Write}, |
| 22 | iter::Peekable, |
| 23 | str::CharIndices, |
| 24 | }; |
| 25 | |
/// A raw token: a token kind paired with the position where the token starts.
///
/// The token does not store any text or its own length; it only records the offset of its first
/// character in the input that was lexed.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(super) struct Token {
    /// The token kind.
    pub(super) kind: TokenKind,
    /// The byte offset of the token starting character.
    pub(super) offset: u32,
}
| 34 | |
/// The kind of a raw token.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
pub(crate) enum TokenKind {
    /// A line comment, either normal or documentation.
    Comment(CommentKind),
    /// An identifier. Keywords are not distinguished from identifiers at this stage.
    Ident,
    /// A numeric literal.
    Number(Number),
    /// A single-character operator or delimiter.
    Single(Single),
    /// A string literal, or one segment of an interpolated string.
    String(StringToken),
    /// A character that does not start any other kind of token.
    Unknown,
    /// A run of whitespace characters.
    Whitespace,
}
| 45 | |
| 46 | impl Display for TokenKind { |
| 47 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { |
| 48 | match self { |
| 49 | TokenKind::Comment(CommentKind::Normal) => f.write_str("comment"), |
| 50 | TokenKind::Comment(CommentKind::Doc) => f.write_str("doc comment"), |
| 51 | TokenKind::Ident => f.write_str("identifier"), |
| 52 | TokenKind::Number(Number::BigInt(_)) => f.write_str("big integer"), |
| 53 | TokenKind::Number(Number::Float) => f.write_str("float"), |
| 54 | TokenKind::Number(Number::Int(_)) => f.write_str("integer"), |
| 55 | TokenKind::Single(single) => write!(f, "`{single}`"), |
| 56 | TokenKind::String(_) => f.write_str("string"), |
| 57 | TokenKind::Unknown => f.write_str("unknown"), |
| 58 | TokenKind::Whitespace => f.write_str("whitespace"), |
| 59 | } |
| 60 | } |
| 61 | } |
| 62 | |
/// A single-character operator token.
///
/// Each variant stands for exactly one input character. The three bracket shapes are folded into
/// the [`Single::Open`] and [`Single::Close`] variants, which carry the delimiter kind.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
pub(crate) enum Single {
    /// `&`
    Amp,
    /// `'`
    Apos,
    /// `@`
    At,
    /// `!`
    Bang,
    /// `|`
    Bar,
    /// `^`
    Caret,
    /// A closing delimiter: `}`, `]`, or `)`.
    Close(Delim),
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `=`
    Eq,
    /// `>`
    Gt,
    /// `<`
    Lt,
    /// `-`
    Minus,
    /// An opening delimiter: `{`, `[`, or `(`.
    Open(Delim),
    /// `%`
    Percent,
    /// `+`
    Plus,
    /// `?`
    Question,
    /// `;`
    Semi,
    /// `/`
    Slash,
    /// `*`
    Star,
    /// `~`
    Tilde,
}
| 111 | |
| 112 | impl Display for Single { |
| 113 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { |
| 114 | f.write_char(match self { |
| 115 | Single::Amp => '&', |
| 116 | Single::Apos => '\'', |
| 117 | Single::At => '@', |
| 118 | Single::Bang => '!', |
| 119 | Single::Bar => '|', |
| 120 | Single::Caret => '^', |
| 121 | Single::Close(Delim::Brace) => '}', |
| 122 | Single::Close(Delim::Bracket) => ']', |
| 123 | Single::Close(Delim::Paren) => ')', |
| 124 | Single::Colon => ':', |
| 125 | Single::Comma => ',', |
| 126 | Single::Dot => '.', |
| 127 | Single::Eq => '=', |
| 128 | Single::Gt => '>', |
| 129 | Single::Lt => '<', |
| 130 | Single::Minus => '-', |
| 131 | Single::Open(Delim::Brace) => '{', |
| 132 | Single::Open(Delim::Bracket) => '[', |
| 133 | Single::Open(Delim::Paren) => '(', |
| 134 | Single::Percent => '%', |
| 135 | Single::Plus => '+', |
| 136 | Single::Question => '?', |
| 137 | Single::Semi => ';', |
| 138 | Single::Slash => '/', |
| 139 | Single::Star => '*', |
| 140 | Single::Tilde => '~', |
| 141 | }) |
| 142 | } |
| 143 | } |
| 144 | |
/// The kind of a numeric literal.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
pub(crate) enum Number {
    /// An integer literal with the `L` suffix, in the given radix.
    BigInt(Radix),
    /// A literal with a fractional part and/or an exponent.
    Float,
    /// An integer literal without a suffix, in the given radix.
    Int(Radix),
}
| 151 | |
/// The kind of a string token.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
pub(crate) enum StringToken {
    /// A plain string literal. `terminated` is `false` when the closing `"` was not found.
    Normal { terminated: bool },
    /// One segment of an interpolated string, described by how the segment starts and how it
    /// ends. The ending is `None` when the segment is unterminated.
    Interpolated(InterpolatedStart, Option<InterpolatedEnding>),
}
| 157 | |
/// The flavor of string currently being lexed. This is internal to the lexer and decides which
/// characters end the string segment.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum StringKind {
    /// A plain string, started by `"`.
    Normal,
    /// An interpolated string segment, started by `$"` or resumed by `}` inside an interpolation.
    Interpolated,
}
| 163 | |
/// The kind of a comment.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
pub(crate) enum CommentKind {
    /// An ordinary comment: starts with `//`, or with four or more slashes.
    Normal,
    /// A documentation comment: starts with exactly three slashes (`///`).
    Doc,
}
| 169 | |
/// The raw lexer. It is an iterator that yields one raw token at a time from the input string.
#[derive(Clone)]
pub(super) struct Lexer<'a> {
    /// The remaining input as `(byte offset, character)` pairs, with one-character lookahead.
    chars: Peekable<CharIndices<'a>>,
    /// The number of string interpolations currently open: raised when an interpolated string
    /// segment ends with `{`, and lowered when a `}` resumes the string.
    interpolation: u8,
}
| 175 | |
| 176 | impl<'a> Lexer<'a> { |
| 177 | pub(super) fn new(input: &'a str) -> Self { |
| 178 | Self { |
| 179 | chars: input.char_indices().peekable(), |
| 180 | interpolation: 0, |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | fn next_if_eq(&mut self, c: char) -> bool { |
| 185 | self.chars.next_if(|i| i.1 == c).is_some() |
| 186 | } |
| 187 | |
| 188 | fn eat_while(&mut self, mut f: impl FnMut(char) -> bool) { |
| 189 | while self.chars.next_if(|i| f(i.1)).is_some() {} |
| 190 | } |
| 191 | |
| 192 | /// Returns the first character ahead of the cursor without consuming it. This operation is fast, |
| 193 | /// but if you know you want to consume the character if it matches, use [`next_if_eq`] instead. |
| 194 | fn first(&mut self) -> Option<char> { |
| 195 | self.chars.peek().map(|i| i.1) |
| 196 | } |
| 197 | |
| 198 | /// Returns the second character ahead of the cursor without consuming it. This is slower |
| 199 | /// than [`first`] and should be avoided when possible. |
| 200 | fn second(&self) -> Option<char> { |
| 201 | let mut chars = self.chars.clone(); |
| 202 | chars.next(); |
| 203 | chars.next().map(|i| i.1) |
| 204 | } |
| 205 | |
| 206 | fn whitespace(&mut self, c: char) -> bool { |
| 207 | if c.is_whitespace() { |
| 208 | self.eat_while(char::is_whitespace); |
| 209 | true |
| 210 | } else { |
| 211 | false |
| 212 | } |
| 213 | } |
| 214 | |
| 215 | fn comment(&mut self, c: char) -> Option<CommentKind> { |
| 216 | if c == '/' && self.next_if_eq('/') { |
| 217 | let kind = if self.first() == Some('/') && self.second() != Some('/') { |
| 218 | self.chars.next(); |
| 219 | CommentKind::Doc |
| 220 | } else { |
| 221 | CommentKind::Normal |
| 222 | }; |
| 223 | |
| 224 | self.eat_while(|c| c != '\n'); |
| 225 | Some(kind) |
| 226 | } else { |
| 227 | None |
| 228 | } |
| 229 | } |
| 230 | |
| 231 | fn ident(&mut self, c: char) -> bool { |
| 232 | if c == '_' || c.is_alphabetic() { |
| 233 | self.eat_while(|c| c == '_' || c.is_alphanumeric()); |
| 234 | true |
| 235 | } else { |
| 236 | false |
| 237 | } |
| 238 | } |
| 239 | |
| 240 | fn number(&mut self, c: char) -> Option<Number> { |
| 241 | self.leading_zero(c).or_else(|| self.decimal(c)) |
| 242 | } |
| 243 | |
| 244 | fn leading_zero(&mut self, c: char) -> Option<Number> { |
| 245 | if c != '0' { |
| 246 | return None; |
| 247 | } |
| 248 | |
| 249 | let radix = if self.next_if_eq('b') { |
| 250 | Radix::Binary |
| 251 | } else if self.next_if_eq('o') { |
| 252 | Radix::Octal |
| 253 | } else if self.next_if_eq('x') { |
| 254 | Radix::Hexadecimal |
| 255 | } else { |
| 256 | Radix::Decimal |
| 257 | }; |
| 258 | |
| 259 | self.eat_while(|c| c == '_' || c.is_digit(radix.into())); |
| 260 | if self.next_if_eq('L') { |
| 261 | Some(Number::BigInt(radix)) |
| 262 | } else if radix == Radix::Decimal && self.float() { |
| 263 | Some(Number::Float) |
| 264 | } else { |
| 265 | Some(Number::Int(radix)) |
| 266 | } |
| 267 | } |
| 268 | |
| 269 | fn decimal(&mut self, c: char) -> Option<Number> { |
| 270 | if !c.is_ascii_digit() { |
| 271 | return None; |
| 272 | } |
| 273 | |
| 274 | self.eat_while(|c| c == '_' || c.is_ascii_digit()); |
| 275 | |
| 276 | if self.float() { |
| 277 | Some(Number::Float) |
| 278 | } else if self.next_if_eq('L') { |
| 279 | Some(Number::BigInt(Radix::Decimal)) |
| 280 | } else { |
| 281 | Some(Number::Int(Radix::Decimal)) |
| 282 | } |
| 283 | } |
| 284 | |
| 285 | fn float(&mut self) -> bool { |
| 286 | // Watch out for ranges: `0..` should be an integer followed by two dots. |
| 287 | if self.first() == Some('.') && self.second() != Some('.') { |
| 288 | self.chars.next(); |
| 289 | self.eat_while(|c| c == '_' || c.is_ascii_digit()); |
| 290 | self.exp(); |
| 291 | true |
| 292 | } else { |
| 293 | self.exp() |
| 294 | } |
| 295 | } |
| 296 | |
| 297 | fn exp(&mut self) -> bool { |
| 298 | if self.next_if_eq('e') { |
| 299 | self.chars.next_if(|i| i.1 == '+' || i.1 == '-'); |
| 300 | self.eat_while(|c| c.is_ascii_digit()); |
| 301 | true |
| 302 | } else { |
| 303 | false |
| 304 | } |
| 305 | } |
| 306 | |
| 307 | fn string(&mut self, c: char) -> Option<TokenKind> { |
| 308 | let kind = self.start_string(c)?; |
| 309 | |
| 310 | while self |
| 311 | .first() |
| 312 | .map_or(false, |c| !is_string_terminator(kind, c)) |
| 313 | { |
| 314 | self.eat_while(|c| c != '\\' && !is_string_terminator(kind, c)); |
| 315 | if self.next_if_eq('\\') { |
| 316 | self.chars.next(); |
| 317 | } |
| 318 | } |
| 319 | |
| 320 | Some(TokenKind::String(self.finish_string(c, kind))) |
| 321 | } |
| 322 | |
| 323 | fn start_string(&mut self, c: char) -> Option<StringKind> { |
| 324 | if c == '$' { |
| 325 | if self.next_if_eq('"') { |
| 326 | Some(StringKind::Interpolated) |
| 327 | } else { |
| 328 | None |
| 329 | } |
| 330 | } else if c == '"' { |
| 331 | Some(StringKind::Normal) |
| 332 | } else if self.interpolation > 0 && c == '}' { |
| 333 | self.interpolation = self |
| 334 | .interpolation |
| 335 | .checked_sub(1) |
| 336 | .expect("interpolation level should have been incremented at left brace"); |
| 337 | Some(StringKind::Interpolated) |
| 338 | } else { |
| 339 | None |
| 340 | } |
| 341 | } |
| 342 | |
| 343 | fn finish_string(&mut self, start: char, kind: StringKind) -> StringToken { |
| 344 | match kind { |
| 345 | StringKind::Normal => StringToken::Normal { |
| 346 | terminated: self.next_if_eq('"'), |
| 347 | }, |
| 348 | StringKind::Interpolated => { |
| 349 | let start = if start == '$' { |
| 350 | InterpolatedStart::DollarQuote |
| 351 | } else { |
| 352 | InterpolatedStart::RBrace |
| 353 | }; |
| 354 | |
| 355 | let end = if self.next_if_eq('{') { |
| 356 | self.interpolation = self |
| 357 | .interpolation |
| 358 | .checked_add(1) |
| 359 | .expect("interpolation should not exceed maximum depth"); |
| 360 | Some(InterpolatedEnding::LBrace) |
| 361 | } else if self.next_if_eq('"') { |
| 362 | Some(InterpolatedEnding::Quote) |
| 363 | } else { |
| 364 | None // Unterminated string. |
| 365 | }; |
| 366 | |
| 367 | StringToken::Interpolated(start, end) |
| 368 | } |
| 369 | } |
| 370 | } |
| 371 | } |
| 372 | |
| 373 | impl Iterator for Lexer<'_> { |
| 374 | type Item = Token; |
| 375 | |
| 376 | fn next(&mut self) -> Option<Self::Item> { |
| 377 | let (offset, c) = self.chars.next()?; |
| 378 | let kind = if let Some(kind) = self.comment(c) { |
| 379 | TokenKind::Comment(kind) |
| 380 | } else if self.whitespace(c) { |
| 381 | TokenKind::Whitespace |
| 382 | } else if self.ident(c) { |
| 383 | TokenKind::Ident |
| 384 | } else { |
| 385 | self.number(c) |
| 386 | .map(TokenKind::Number) |
| 387 | .or_else(|| self.string(c)) |
| 388 | .or_else(|| single(c).map(TokenKind::Single)) |
| 389 | .unwrap_or(TokenKind::Unknown) |
| 390 | }; |
| 391 | Some(Token { |
| 392 | kind, |
| 393 | offset: offset.try_into().expect("offset should fit into u32"), |
| 394 | }) |
| 395 | } |
| 396 | } |
| 397 | |
| 398 | fn single(c: char) -> Option<Single> { |
| 399 | match c { |
| 400 | '-' => Some(Single::Minus), |
| 401 | ',' => Some(Single::Comma), |
| 402 | ';' => Some(Single::Semi), |
| 403 | ':' => Some(Single::Colon), |
| 404 | '!' => Some(Single::Bang), |
| 405 | '?' => Some(Single::Question), |
| 406 | '.' => Some(Single::Dot), |
| 407 | '\'' => Some(Single::Apos), |
| 408 | '(' => Some(Single::Open(Delim::Paren)), |
| 409 | ')' => Some(Single::Close(Delim::Paren)), |
| 410 | '[' => Some(Single::Open(Delim::Bracket)), |
| 411 | ']' => Some(Single::Close(Delim::Bracket)), |
| 412 | '{' => Some(Single::Open(Delim::Brace)), |
| 413 | '}' => Some(Single::Close(Delim::Brace)), |
| 414 | '@' => Some(Single::At), |
| 415 | '*' => Some(Single::Star), |
| 416 | '/' => Some(Single::Slash), |
| 417 | '&' => Some(Single::Amp), |
| 418 | '%' => Some(Single::Percent), |
| 419 | '^' => Some(Single::Caret), |
| 420 | '+' => Some(Single::Plus), |
| 421 | '<' => Some(Single::Lt), |
| 422 | '=' => Some(Single::Eq), |
| 423 | '>' => Some(Single::Gt), |
| 424 | '|' => Some(Single::Bar), |
| 425 | '~' => Some(Single::Tilde), |
| 426 | _ => None, |
| 427 | } |
| 428 | } |
| 429 | |
| 430 | fn is_string_terminator(kind: StringKind, c: char) -> bool { |
| 431 | c == '"' || kind == StringKind::Interpolated && c == '{' |
| 432 | } |