microsoft/qdk
Publicmirrored fromhttps://github.com/microsoft/qdkAvailable
compiler/qsc_parse/src/lex/cooked.rs
522lines · modecode
| 1 | // Copyright (c) Microsoft Corporation. |
| 2 | // Licensed under the MIT License. |
| 3 | |
| 4 | //! The second lexing phase "cooks" a raw token stream, transforming them into tokens that directly |
| 5 | //! correspond to components in the Q# grammar. Keywords are treated as identifiers, except `and` |
| 6 | //! and `or`, which are cooked into [`ClosedBinOp`] so that `and=` and `or=` are lexed correctly. |
| 7 | //! |
| 8 | //! Whitespace and comment tokens are discarded; this means that cooked tokens are not necessarily |
| 9 | //! contiguous, so they include both a starting and ending byte offset. |
| 10 | //! |
| 11 | //! Tokens never contain substrings from the original input, but are simply labels that refer back |
| 12 | //! to regions in the input. Lexing never fails, but may produce error tokens. |
| 13 | |
| 14 | #[cfg(test)] |
| 15 | mod tests; |
| 16 | |
| 17 | use super::{ |
| 18 | raw::{self, Number, Single}, |
| 19 | Delim, InterpolatedEnding, InterpolatedStart, Radix, |
| 20 | }; |
| 21 | use crate::keyword::Keyword; |
| 22 | use enum_iterator::Sequence; |
| 23 | use miette::Diagnostic; |
| 24 | use qsc_data_structures::span::Span; |
| 25 | use std::{ |
| 26 | fmt::{self, Display, Formatter}, |
| 27 | iter::Peekable, |
| 28 | }; |
| 29 | use thiserror::Error; |
| 30 | |
| 31 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
| 32 | pub(crate) struct Token { |
| 33 | pub(crate) kind: TokenKind, |
| 34 | pub(crate) span: Span, |
| 35 | } |
| 36 | |
| 37 | #[derive(Clone, Copy, Debug, Diagnostic, Eq, Error, PartialEq)] |
| 38 | pub(crate) enum Error { |
| 39 | #[error("expected {0} to complete {1}, found {2}")] |
| 40 | #[diagnostic(code("Qsc.Lex.Incomplete"))] |
| 41 | Incomplete(raw::TokenKind, TokenKind, raw::TokenKind, #[label] Span), |
| 42 | |
| 43 | #[error("expected {0} to complete {1}, found EOF")] |
| 44 | #[diagnostic(code("Qsc.Lex.IncompleteEof"))] |
| 45 | IncompleteEof(raw::TokenKind, TokenKind, #[label] Span), |
| 46 | |
| 47 | #[error("unterminated string literal")] |
| 48 | #[diagnostic(code("Qsc.Lex.UnterminatedString"))] |
| 49 | UnterminatedString(#[label] Span), |
| 50 | |
| 51 | #[error("unrecognized character `{0}`")] |
| 52 | #[diagnostic(code("Qsc.Lex.UnknownChar"))] |
| 53 | Unknown(char, #[label] Span), |
| 54 | } |
| 55 | |
| 56 | impl Error { |
| 57 | pub(crate) fn with_offset(self, offset: u32) -> Self { |
| 58 | match self { |
| 59 | Self::Incomplete(expected, token, actual, span) => { |
| 60 | Self::Incomplete(expected, token, actual, span + offset) |
| 61 | } |
| 62 | Self::IncompleteEof(expected, token, span) => { |
| 63 | Self::IncompleteEof(expected, token, span + offset) |
| 64 | } |
| 65 | Self::UnterminatedString(span) => Self::UnterminatedString(span + offset), |
| 66 | Self::Unknown(c, span) => Self::Unknown(c, span + offset), |
| 67 | } |
| 68 | } |
| 69 | } |
| 70 | |
| 71 | /// A token kind. |
| 72 | #[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)] |
| 73 | pub(crate) enum TokenKind { |
| 74 | /// `'T` |
| 75 | /// used for generic parameters -- an apostrophe followed by an ident. |
| 76 | AposIdent, |
| 77 | /// `@` |
| 78 | At, |
| 79 | /// `!` |
| 80 | Bang, |
| 81 | /// `|` |
| 82 | Bar, |
| 83 | /// A big integer literal. |
| 84 | BigInt(Radix), |
| 85 | /// A closed binary operator followed by an equals token. |
| 86 | BinOpEq(ClosedBinOp), |
| 87 | /// A closing delimiter. |
| 88 | Close(Delim), |
| 89 | /// A closed binary operator not followed by an equals token. |
| 90 | ClosedBinOp(ClosedBinOp), |
| 91 | /// `:` |
| 92 | Colon, |
| 93 | /// `::` |
| 94 | ColonColon, |
| 95 | /// `,` |
| 96 | Comma, |
| 97 | /// A doc comment. |
| 98 | DocComment, |
| 99 | /// `.` |
| 100 | Dot, |
| 101 | /// `..` |
| 102 | DotDot, |
| 103 | /// `...` |
| 104 | DotDotDot, |
| 105 | /// End of file. |
| 106 | Eof, |
| 107 | /// `=` |
| 108 | Eq, |
| 109 | /// `==` |
| 110 | EqEq, |
| 111 | /// `=>` |
| 112 | FatArrow, |
| 113 | /// A floating-point literal. |
| 114 | Float, |
| 115 | /// `>` |
| 116 | Gt, |
| 117 | /// `>=` |
| 118 | Gte, |
| 119 | /// An identifier. |
| 120 | Ident, |
| 121 | /// An integer literal. |
| 122 | Int(Radix), |
| 123 | /// A keyword. |
| 124 | Keyword(Keyword), |
| 125 | /// `<-` |
| 126 | LArrow, |
| 127 | /// `<` |
| 128 | Lt, |
| 129 | /// `<=` |
| 130 | Lte, |
| 131 | /// `!=` |
| 132 | Ne, |
| 133 | /// An opening delimiter. |
| 134 | Open(Delim), |
| 135 | /// `?` |
| 136 | Question, |
| 137 | /// `->` |
| 138 | RArrow, |
| 139 | /// `;` |
| 140 | Semi, |
| 141 | /// A string literal. |
| 142 | String(StringToken), |
| 143 | /// `~~~` |
| 144 | TildeTildeTilde, |
| 145 | /// `w/` |
| 146 | WSlash, |
| 147 | /// `w/=` |
| 148 | WSlashEq, |
| 149 | } |
| 150 | |
| 151 | impl Display for TokenKind { |
| 152 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { |
| 153 | match self { |
| 154 | TokenKind::AposIdent => f.write_str("apostrophe identifier"), |
| 155 | TokenKind::At => f.write_str("`@`"), |
| 156 | TokenKind::Bang => f.write_str("`!`"), |
| 157 | TokenKind::Bar => f.write_str("`|`"), |
| 158 | TokenKind::BigInt(_) => f.write_str("big integer"), |
| 159 | TokenKind::BinOpEq(op) => write!(f, "`{op}=`"), |
| 160 | TokenKind::Close(Delim::Brace) => f.write_str("`}`"), |
| 161 | TokenKind::Close(Delim::Bracket) => f.write_str("`]`"), |
| 162 | TokenKind::Close(Delim::Paren) => f.write_str("`)`"), |
| 163 | TokenKind::ClosedBinOp(op) => write!(f, "`{op}`"), |
| 164 | TokenKind::Colon => f.write_str("`:`"), |
| 165 | TokenKind::ColonColon => f.write_str("`::`"), |
| 166 | TokenKind::Comma => f.write_str("`,`"), |
| 167 | TokenKind::DocComment => f.write_str("doc comment"), |
| 168 | TokenKind::Dot => f.write_str("`.`"), |
| 169 | TokenKind::DotDot => f.write_str("`..`"), |
| 170 | TokenKind::DotDotDot => f.write_str("`...`"), |
| 171 | TokenKind::Eof => f.write_str("EOF"), |
| 172 | TokenKind::Eq => f.write_str("`=`"), |
| 173 | TokenKind::EqEq => f.write_str("`==`"), |
| 174 | TokenKind::FatArrow => f.write_str("`=>`"), |
| 175 | TokenKind::Float => f.write_str("float"), |
| 176 | TokenKind::Gt => f.write_str("`>`"), |
| 177 | TokenKind::Gte => f.write_str("`>=`"), |
| 178 | TokenKind::Ident => f.write_str("identifier"), |
| 179 | TokenKind::Int(_) => f.write_str("integer"), |
| 180 | TokenKind::Keyword(keyword) => write!(f, "keyword `{keyword}`"), |
| 181 | TokenKind::LArrow => f.write_str("`<-`"), |
| 182 | TokenKind::Lt => f.write_str("`<`"), |
| 183 | TokenKind::Lte => f.write_str("`<=`"), |
| 184 | TokenKind::Ne => f.write_str("`!=`"), |
| 185 | TokenKind::Open(Delim::Brace) => f.write_str("`{`"), |
| 186 | TokenKind::Open(Delim::Bracket) => f.write_str("`[`"), |
| 187 | TokenKind::Open(Delim::Paren) => f.write_str("`(`"), |
| 188 | TokenKind::Question => f.write_str("`?`"), |
| 189 | TokenKind::RArrow => f.write_str("`->`"), |
| 190 | TokenKind::Semi => f.write_str("`;`"), |
| 191 | TokenKind::String(_) => f.write_str("string"), |
| 192 | TokenKind::TildeTildeTilde => f.write_str("`~~~`"), |
| 193 | TokenKind::WSlash => f.write_str("`w/`"), |
| 194 | TokenKind::WSlashEq => f.write_str("`w/=`"), |
| 195 | } |
| 196 | } |
| 197 | } |
| 198 | |
| 199 | impl From<Number> for TokenKind { |
| 200 | fn from(value: Number) -> Self { |
| 201 | match value { |
| 202 | Number::BigInt(radix) => Self::BigInt(radix), |
| 203 | Number::Float => Self::Float, |
| 204 | Number::Int(radix) => Self::Int(radix), |
| 205 | } |
| 206 | } |
| 207 | } |
| 208 | |
| 209 | /// A binary operator that returns the same type as the type of its first operand; in other words, |
| 210 | /// the domain of the first operand is closed under this operation. These are candidates for |
| 211 | /// compound assignment operators, like `+=`. |
| 212 | #[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)] |
| 213 | pub(crate) enum ClosedBinOp { |
| 214 | /// `&&&` |
| 215 | AmpAmpAmp, |
| 216 | /// `and` |
| 217 | And, |
| 218 | /// `|||` |
| 219 | BarBarBar, |
| 220 | /// `^` |
| 221 | Caret, |
| 222 | /// `^^^` |
| 223 | CaretCaretCaret, |
| 224 | /// `>>>` |
| 225 | GtGtGt, |
| 226 | /// `<<<` |
| 227 | LtLtLt, |
| 228 | /// `-` |
| 229 | Minus, |
| 230 | /// `or` |
| 231 | Or, |
| 232 | /// `%` |
| 233 | Percent, |
| 234 | /// `+` |
| 235 | Plus, |
| 236 | /// `/` |
| 237 | Slash, |
| 238 | /// `*` |
| 239 | Star, |
| 240 | } |
| 241 | |
| 242 | impl Display for ClosedBinOp { |
| 243 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { |
| 244 | f.write_str(match self { |
| 245 | ClosedBinOp::AmpAmpAmp => "&&&", |
| 246 | ClosedBinOp::And => "and", |
| 247 | ClosedBinOp::BarBarBar => "|||", |
| 248 | ClosedBinOp::Caret => "^", |
| 249 | ClosedBinOp::CaretCaretCaret => "^^^", |
| 250 | ClosedBinOp::GtGtGt => ">>>", |
| 251 | ClosedBinOp::LtLtLt => "<<<", |
| 252 | ClosedBinOp::Minus => "-", |
| 253 | ClosedBinOp::Or => "or", |
| 254 | ClosedBinOp::Percent => "%", |
| 255 | ClosedBinOp::Plus => "+", |
| 256 | ClosedBinOp::Slash => "/", |
| 257 | ClosedBinOp::Star => "*", |
| 258 | }) |
| 259 | } |
| 260 | } |
| 261 | |
| 262 | #[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)] |
| 263 | pub(crate) enum StringToken { |
| 264 | Normal, |
| 265 | Interpolated(InterpolatedStart, InterpolatedEnding), |
| 266 | } |
| 267 | |
| 268 | pub(crate) struct Lexer<'a> { |
| 269 | input: &'a str, |
| 270 | len: u32, |
| 271 | |
| 272 | // This uses a `Peekable` iterator over the raw lexer, which allows for one token lookahead. |
| 273 | tokens: Peekable<raw::Lexer<'a>>, |
| 274 | } |
| 275 | |
| 276 | impl<'a> Lexer<'a> { |
| 277 | pub(crate) fn new(input: &'a str) -> Self { |
| 278 | Self { |
| 279 | input, |
| 280 | len: input |
| 281 | .len() |
| 282 | .try_into() |
| 283 | .expect("input length should fit into u32"), |
| 284 | tokens: raw::Lexer::new(input).peekable(), |
| 285 | } |
| 286 | } |
| 287 | |
| 288 | fn offset(&mut self) -> u32 { |
| 289 | self.tokens.peek().map_or_else(|| self.len, |t| t.offset) |
| 290 | } |
| 291 | |
| 292 | fn next_if_eq_single(&mut self, single: Single) -> bool { |
| 293 | self.next_if_eq(raw::TokenKind::Single(single)) |
| 294 | } |
| 295 | |
| 296 | fn next_if_eq(&mut self, tok: raw::TokenKind) -> bool { |
| 297 | self.tokens.next_if(|t| t.kind == tok).is_some() |
| 298 | } |
| 299 | |
| 300 | fn expect_single(&mut self, single: Single, complete: TokenKind) -> Result<(), Error> { |
| 301 | self.expect(raw::TokenKind::Single(single), complete) |
| 302 | } |
| 303 | |
| 304 | fn expect(&mut self, tok: raw::TokenKind, complete: TokenKind) -> Result<(), Error> { |
| 305 | if self.next_if_eq(tok) { |
| 306 | Ok(()) |
| 307 | } else if let Some(&raw::Token { kind, offset }) = self.tokens.peek() { |
| 308 | let mut tokens = self.tokens.clone(); |
| 309 | let hi = tokens.nth(1).map_or_else(|| self.len, |t| t.offset); |
| 310 | let span = Span { lo: offset, hi }; |
| 311 | Err(Error::Incomplete(tok, complete, kind, span)) |
| 312 | } else { |
| 313 | let lo = self.len; |
| 314 | let span = Span { lo, hi: lo }; |
| 315 | Err(Error::IncompleteEof(tok, complete, span)) |
| 316 | } |
| 317 | } |
| 318 | |
| 319 | fn cook(&mut self, token: &raw::Token) -> Result<Option<Token>, Error> { |
| 320 | let kind = match token.kind { |
| 321 | raw::TokenKind::Comment(raw::CommentKind::Normal) | raw::TokenKind::Whitespace => { |
| 322 | Ok(None) |
| 323 | } |
| 324 | raw::TokenKind::Comment(raw::CommentKind::Doc) => Ok(Some(TokenKind::DocComment)), |
| 325 | raw::TokenKind::Ident => { |
| 326 | let ident = &self.input[(token.offset as usize)..(self.offset() as usize)]; |
| 327 | Ok(Some(self.ident(ident))) |
| 328 | } |
| 329 | raw::TokenKind::Number(number) => Ok(Some(number.into())), |
| 330 | raw::TokenKind::Single(single) => self.single(single).map(Some), |
| 331 | raw::TokenKind::String(raw::StringToken::Normal { terminated: true }) => { |
| 332 | Ok(Some(TokenKind::String(StringToken::Normal))) |
| 333 | } |
| 334 | raw::TokenKind::String(raw::StringToken::Interpolated(start, Some(ending))) => Ok( |
| 335 | Some(TokenKind::String(StringToken::Interpolated(start, ending))), |
| 336 | ), |
| 337 | raw::TokenKind::String( |
| 338 | raw::StringToken::Normal { terminated: false } |
| 339 | | raw::StringToken::Interpolated(_, None), |
| 340 | ) => Err(Error::UnterminatedString(Span { |
| 341 | lo: token.offset, |
| 342 | hi: token.offset, |
| 343 | })), |
| 344 | raw::TokenKind::Unknown => { |
| 345 | let c = self.input[(token.offset as usize)..] |
| 346 | .chars() |
| 347 | .next() |
| 348 | .expect("token offset should be the start of a character"); |
| 349 | let span = Span { |
| 350 | lo: token.offset, |
| 351 | hi: self.offset(), |
| 352 | }; |
| 353 | Err(Error::Unknown(c, span)) |
| 354 | } |
| 355 | }?; |
| 356 | |
| 357 | Ok(kind.map(|kind| { |
| 358 | let span = Span { |
| 359 | lo: token.offset, |
| 360 | hi: self.offset(), |
| 361 | }; |
| 362 | Token { kind, span } |
| 363 | })) |
| 364 | } |
| 365 | |
| 366 | #[allow(clippy::too_many_lines)] |
| 367 | fn single(&mut self, single: Single) -> Result<TokenKind, Error> { |
| 368 | match single { |
| 369 | Single::Amp => { |
| 370 | let op = ClosedBinOp::AmpAmpAmp; |
| 371 | self.expect_single(Single::Amp, TokenKind::ClosedBinOp(op))?; |
| 372 | self.expect_single(Single::Amp, TokenKind::ClosedBinOp(op))?; |
| 373 | Ok(self.closed_bin_op(op)) |
| 374 | } |
| 375 | Single::Apos => { |
| 376 | self.expect(raw::TokenKind::Ident, TokenKind::AposIdent)?; |
| 377 | Ok(TokenKind::AposIdent) |
| 378 | } |
| 379 | Single::At => Ok(TokenKind::At), |
| 380 | Single::Bang => { |
| 381 | if self.next_if_eq_single(Single::Eq) { |
| 382 | Ok(TokenKind::Ne) |
| 383 | } else { |
| 384 | Ok(TokenKind::Bang) |
| 385 | } |
| 386 | } |
| 387 | Single::Bar => { |
| 388 | if self.next_if_eq_single(Single::Bar) { |
| 389 | let op = ClosedBinOp::BarBarBar; |
| 390 | self.expect_single(Single::Bar, TokenKind::ClosedBinOp(op))?; |
| 391 | Ok(self.closed_bin_op(op)) |
| 392 | } else { |
| 393 | Ok(TokenKind::Bar) |
| 394 | } |
| 395 | } |
| 396 | Single::Caret => { |
| 397 | if self.next_if_eq_single(Single::Caret) { |
| 398 | let op = ClosedBinOp::CaretCaretCaret; |
| 399 | self.expect_single(Single::Caret, TokenKind::ClosedBinOp(op))?; |
| 400 | Ok(self.closed_bin_op(op)) |
| 401 | } else { |
| 402 | Ok(self.closed_bin_op(ClosedBinOp::Caret)) |
| 403 | } |
| 404 | } |
| 405 | Single::Close(delim) => Ok(TokenKind::Close(delim)), |
| 406 | Single::Colon => { |
| 407 | if self.next_if_eq_single(Single::Colon) { |
| 408 | Ok(TokenKind::ColonColon) |
| 409 | } else { |
| 410 | Ok(TokenKind::Colon) |
| 411 | } |
| 412 | } |
| 413 | Single::Comma => Ok(TokenKind::Comma), |
| 414 | Single::Dot => { |
| 415 | if self.next_if_eq_single(Single::Dot) { |
| 416 | if self.next_if_eq_single(Single::Dot) { |
| 417 | Ok(TokenKind::DotDotDot) |
| 418 | } else { |
| 419 | Ok(TokenKind::DotDot) |
| 420 | } |
| 421 | } else { |
| 422 | Ok(TokenKind::Dot) |
| 423 | } |
| 424 | } |
| 425 | Single::Eq => { |
| 426 | if self.next_if_eq_single(Single::Eq) { |
| 427 | Ok(TokenKind::EqEq) |
| 428 | } else if self.next_if_eq_single(Single::Gt) { |
| 429 | Ok(TokenKind::FatArrow) |
| 430 | } else { |
| 431 | Ok(TokenKind::Eq) |
| 432 | } |
| 433 | } |
| 434 | Single::Gt => { |
| 435 | if self.next_if_eq_single(Single::Eq) { |
| 436 | Ok(TokenKind::Gte) |
| 437 | } else if self.next_if_eq_single(Single::Gt) { |
| 438 | let op = ClosedBinOp::GtGtGt; |
| 439 | self.expect_single(Single::Gt, TokenKind::ClosedBinOp(op))?; |
| 440 | Ok(self.closed_bin_op(op)) |
| 441 | } else { |
| 442 | Ok(TokenKind::Gt) |
| 443 | } |
| 444 | } |
| 445 | Single::Lt => { |
| 446 | if self.next_if_eq_single(Single::Eq) { |
| 447 | Ok(TokenKind::Lte) |
| 448 | } else if self.next_if_eq_single(Single::Minus) { |
| 449 | Ok(TokenKind::LArrow) |
| 450 | } else if self.next_if_eq_single(Single::Lt) { |
| 451 | let op = ClosedBinOp::LtLtLt; |
| 452 | self.expect_single(Single::Lt, TokenKind::ClosedBinOp(op))?; |
| 453 | Ok(self.closed_bin_op(op)) |
| 454 | } else { |
| 455 | Ok(TokenKind::Lt) |
| 456 | } |
| 457 | } |
| 458 | Single::Minus => { |
| 459 | if self.next_if_eq_single(Single::Gt) { |
| 460 | Ok(TokenKind::RArrow) |
| 461 | } else { |
| 462 | Ok(self.closed_bin_op(ClosedBinOp::Minus)) |
| 463 | } |
| 464 | } |
| 465 | Single::Open(delim) => Ok(TokenKind::Open(delim)), |
| 466 | Single::Percent => Ok(self.closed_bin_op(ClosedBinOp::Percent)), |
| 467 | Single::Plus => Ok(self.closed_bin_op(ClosedBinOp::Plus)), |
| 468 | Single::Question => Ok(TokenKind::Question), |
| 469 | Single::Semi => Ok(TokenKind::Semi), |
| 470 | Single::Slash => Ok(self.closed_bin_op(ClosedBinOp::Slash)), |
| 471 | Single::Star => Ok(self.closed_bin_op(ClosedBinOp::Star)), |
| 472 | Single::Tilde => { |
| 473 | let complete = TokenKind::TildeTildeTilde; |
| 474 | self.expect_single(Single::Tilde, complete)?; |
| 475 | self.expect_single(Single::Tilde, complete)?; |
| 476 | Ok(complete) |
| 477 | } |
| 478 | } |
| 479 | } |
| 480 | |
| 481 | fn closed_bin_op(&mut self, op: ClosedBinOp) -> TokenKind { |
| 482 | if self.next_if_eq_single(Single::Eq) { |
| 483 | TokenKind::BinOpEq(op) |
| 484 | } else { |
| 485 | TokenKind::ClosedBinOp(op) |
| 486 | } |
| 487 | } |
| 488 | |
| 489 | fn ident(&mut self, ident: &str) -> TokenKind { |
| 490 | match ident { |
| 491 | "and" => self.closed_bin_op(ClosedBinOp::And), |
| 492 | "or" => self.closed_bin_op(ClosedBinOp::Or), |
| 493 | "w" if self.next_if_eq_single(Single::Slash) => { |
| 494 | if self.next_if_eq_single(Single::Eq) { |
| 495 | TokenKind::WSlashEq |
| 496 | } else { |
| 497 | TokenKind::WSlash |
| 498 | } |
| 499 | } |
| 500 | ident => ident |
| 501 | .parse() |
| 502 | .map(TokenKind::Keyword) |
| 503 | .unwrap_or(TokenKind::Ident), |
| 504 | } |
| 505 | } |
| 506 | } |
| 507 | |
| 508 | impl Iterator for Lexer<'_> { |
| 509 | type Item = Result<Token, Error>; |
| 510 | |
| 511 | fn next(&mut self) -> Option<Self::Item> { |
| 512 | while let Some(token) = self.tokens.next() { |
| 513 | match self.cook(&token) { |
| 514 | Ok(None) => {} |
| 515 | Ok(Some(token)) => return Some(Ok(token)), |
| 516 | Err(err) => return Some(Err(err)), |
| 517 | } |
| 518 | } |
| 519 | |
| 520 | None |
| 521 | } |
| 522 | } |
| 523 | |