microsoft/qdk

Public

mirrored fromhttps://github.com/microsoft/qdkAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
v1.1.3

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

compiler/qsc_parse/src/lex/cooked.rs

522lines · modecode

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The second lexing phase "cooks" a raw token stream, transforming them into tokens that directly
5//! correspond to components in the Q# grammar. Keywords are treated as identifiers, except `and`
6//! and `or`, which are cooked into [`ClosedBinOp`] so that `and=` and `or=` are lexed correctly.
7//!
8//! Whitespace and comment tokens are discarded; this means that cooked tokens are not necessarily
9//! contiguous, so they include both a starting and ending byte offset.
10//!
11//! Tokens never contain substrings from the original input, but are simply labels that refer back
12//! to regions in the input. Lexing never fails, but may produce error tokens.
13
14#[cfg(test)]
15mod tests;
16
17use super::{
18 raw::{self, Number, Single},
19 Delim, InterpolatedEnding, InterpolatedStart, Radix,
20};
21use crate::keyword::Keyword;
22use enum_iterator::Sequence;
23use miette::Diagnostic;
24use qsc_data_structures::span::Span;
25use std::{
26 fmt::{self, Display, Formatter},
27 iter::Peekable,
28};
29use thiserror::Error;
30
31#[derive(Clone, Copy, Debug, Eq, PartialEq)]
32pub(crate) struct Token {
33 pub(crate) kind: TokenKind,
34 pub(crate) span: Span,
35}
36
37#[derive(Clone, Copy, Debug, Diagnostic, Eq, Error, PartialEq)]
38pub(crate) enum Error {
39 #[error("expected {0} to complete {1}, found {2}")]
40 #[diagnostic(code("Qsc.Lex.Incomplete"))]
41 Incomplete(raw::TokenKind, TokenKind, raw::TokenKind, #[label] Span),
42
43 #[error("expected {0} to complete {1}, found EOF")]
44 #[diagnostic(code("Qsc.Lex.IncompleteEof"))]
45 IncompleteEof(raw::TokenKind, TokenKind, #[label] Span),
46
47 #[error("unterminated string literal")]
48 #[diagnostic(code("Qsc.Lex.UnterminatedString"))]
49 UnterminatedString(#[label] Span),
50
51 #[error("unrecognized character `{0}`")]
52 #[diagnostic(code("Qsc.Lex.UnknownChar"))]
53 Unknown(char, #[label] Span),
54}
55
56impl Error {
57 pub(crate) fn with_offset(self, offset: u32) -> Self {
58 match self {
59 Self::Incomplete(expected, token, actual, span) => {
60 Self::Incomplete(expected, token, actual, span + offset)
61 }
62 Self::IncompleteEof(expected, token, span) => {
63 Self::IncompleteEof(expected, token, span + offset)
64 }
65 Self::UnterminatedString(span) => Self::UnterminatedString(span + offset),
66 Self::Unknown(c, span) => Self::Unknown(c, span + offset),
67 }
68 }
69}
70
71/// A token kind.
72#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
73pub(crate) enum TokenKind {
74 /// `'T`
75 /// used for generic parameters -- an apostrophe followed by an ident.
76 AposIdent,
77 /// `@`
78 At,
79 /// `!`
80 Bang,
81 /// `|`
82 Bar,
83 /// A big integer literal.
84 BigInt(Radix),
85 /// A closed binary operator followed by an equals token.
86 BinOpEq(ClosedBinOp),
87 /// A closing delimiter.
88 Close(Delim),
89 /// A closed binary operator not followed by an equals token.
90 ClosedBinOp(ClosedBinOp),
91 /// `:`
92 Colon,
93 /// `::`
94 ColonColon,
95 /// `,`
96 Comma,
97 /// A doc comment.
98 DocComment,
99 /// `.`
100 Dot,
101 /// `..`
102 DotDot,
103 /// `...`
104 DotDotDot,
105 /// End of file.
106 Eof,
107 /// `=`
108 Eq,
109 /// `==`
110 EqEq,
111 /// `=>`
112 FatArrow,
113 /// A floating-point literal.
114 Float,
115 /// `>`
116 Gt,
117 /// `>=`
118 Gte,
119 /// An identifier.
120 Ident,
121 /// An integer literal.
122 Int(Radix),
123 /// A keyword.
124 Keyword(Keyword),
125 /// `<-`
126 LArrow,
127 /// `<`
128 Lt,
129 /// `<=`
130 Lte,
131 /// `!=`
132 Ne,
133 /// An opening delimiter.
134 Open(Delim),
135 /// `?`
136 Question,
137 /// `->`
138 RArrow,
139 /// `;`
140 Semi,
141 /// A string literal.
142 String(StringToken),
143 /// `~~~`
144 TildeTildeTilde,
145 /// `w/`
146 WSlash,
147 /// `w/=`
148 WSlashEq,
149}
150
151impl Display for TokenKind {
152 fn fmt(&self, f: &mut Formatter) -> fmt::Result {
153 match self {
154 TokenKind::AposIdent => f.write_str("apostrophe identifier"),
155 TokenKind::At => f.write_str("`@`"),
156 TokenKind::Bang => f.write_str("`!`"),
157 TokenKind::Bar => f.write_str("`|`"),
158 TokenKind::BigInt(_) => f.write_str("big integer"),
159 TokenKind::BinOpEq(op) => write!(f, "`{op}=`"),
160 TokenKind::Close(Delim::Brace) => f.write_str("`}`"),
161 TokenKind::Close(Delim::Bracket) => f.write_str("`]`"),
162 TokenKind::Close(Delim::Paren) => f.write_str("`)`"),
163 TokenKind::ClosedBinOp(op) => write!(f, "`{op}`"),
164 TokenKind::Colon => f.write_str("`:`"),
165 TokenKind::ColonColon => f.write_str("`::`"),
166 TokenKind::Comma => f.write_str("`,`"),
167 TokenKind::DocComment => f.write_str("doc comment"),
168 TokenKind::Dot => f.write_str("`.`"),
169 TokenKind::DotDot => f.write_str("`..`"),
170 TokenKind::DotDotDot => f.write_str("`...`"),
171 TokenKind::Eof => f.write_str("EOF"),
172 TokenKind::Eq => f.write_str("`=`"),
173 TokenKind::EqEq => f.write_str("`==`"),
174 TokenKind::FatArrow => f.write_str("`=>`"),
175 TokenKind::Float => f.write_str("float"),
176 TokenKind::Gt => f.write_str("`>`"),
177 TokenKind::Gte => f.write_str("`>=`"),
178 TokenKind::Ident => f.write_str("identifier"),
179 TokenKind::Int(_) => f.write_str("integer"),
180 TokenKind::Keyword(keyword) => write!(f, "keyword `{keyword}`"),
181 TokenKind::LArrow => f.write_str("`<-`"),
182 TokenKind::Lt => f.write_str("`<`"),
183 TokenKind::Lte => f.write_str("`<=`"),
184 TokenKind::Ne => f.write_str("`!=`"),
185 TokenKind::Open(Delim::Brace) => f.write_str("`{`"),
186 TokenKind::Open(Delim::Bracket) => f.write_str("`[`"),
187 TokenKind::Open(Delim::Paren) => f.write_str("`(`"),
188 TokenKind::Question => f.write_str("`?`"),
189 TokenKind::RArrow => f.write_str("`->`"),
190 TokenKind::Semi => f.write_str("`;`"),
191 TokenKind::String(_) => f.write_str("string"),
192 TokenKind::TildeTildeTilde => f.write_str("`~~~`"),
193 TokenKind::WSlash => f.write_str("`w/`"),
194 TokenKind::WSlashEq => f.write_str("`w/=`"),
195 }
196 }
197}
198
199impl From<Number> for TokenKind {
200 fn from(value: Number) -> Self {
201 match value {
202 Number::BigInt(radix) => Self::BigInt(radix),
203 Number::Float => Self::Float,
204 Number::Int(radix) => Self::Int(radix),
205 }
206 }
207}
208
209/// A binary operator that returns the same type as the type of its first operand; in other words,
210/// the domain of the first operand is closed under this operation. These are candidates for
211/// compound assignment operators, like `+=`.
212#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
213pub(crate) enum ClosedBinOp {
214 /// `&&&`
215 AmpAmpAmp,
216 /// `and`
217 And,
218 /// `|||`
219 BarBarBar,
220 /// `^`
221 Caret,
222 /// `^^^`
223 CaretCaretCaret,
224 /// `>>>`
225 GtGtGt,
226 /// `<<<`
227 LtLtLt,
228 /// `-`
229 Minus,
230 /// `or`
231 Or,
232 /// `%`
233 Percent,
234 /// `+`
235 Plus,
236 /// `/`
237 Slash,
238 /// `*`
239 Star,
240}
241
242impl Display for ClosedBinOp {
243 fn fmt(&self, f: &mut Formatter) -> fmt::Result {
244 f.write_str(match self {
245 ClosedBinOp::AmpAmpAmp => "&&&",
246 ClosedBinOp::And => "and",
247 ClosedBinOp::BarBarBar => "|||",
248 ClosedBinOp::Caret => "^",
249 ClosedBinOp::CaretCaretCaret => "^^^",
250 ClosedBinOp::GtGtGt => ">>>",
251 ClosedBinOp::LtLtLt => "<<<",
252 ClosedBinOp::Minus => "-",
253 ClosedBinOp::Or => "or",
254 ClosedBinOp::Percent => "%",
255 ClosedBinOp::Plus => "+",
256 ClosedBinOp::Slash => "/",
257 ClosedBinOp::Star => "*",
258 })
259 }
260}
261
262#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
263pub(crate) enum StringToken {
264 Normal,
265 Interpolated(InterpolatedStart, InterpolatedEnding),
266}
267
268pub(crate) struct Lexer<'a> {
269 input: &'a str,
270 len: u32,
271
272 // This uses a `Peekable` iterator over the raw lexer, which allows for one token lookahead.
273 tokens: Peekable<raw::Lexer<'a>>,
274}
275
276impl<'a> Lexer<'a> {
277 pub(crate) fn new(input: &'a str) -> Self {
278 Self {
279 input,
280 len: input
281 .len()
282 .try_into()
283 .expect("input length should fit into u32"),
284 tokens: raw::Lexer::new(input).peekable(),
285 }
286 }
287
288 fn offset(&mut self) -> u32 {
289 self.tokens.peek().map_or_else(|| self.len, |t| t.offset)
290 }
291
292 fn next_if_eq_single(&mut self, single: Single) -> bool {
293 self.next_if_eq(raw::TokenKind::Single(single))
294 }
295
296 fn next_if_eq(&mut self, tok: raw::TokenKind) -> bool {
297 self.tokens.next_if(|t| t.kind == tok).is_some()
298 }
299
300 fn expect_single(&mut self, single: Single, complete: TokenKind) -> Result<(), Error> {
301 self.expect(raw::TokenKind::Single(single), complete)
302 }
303
304 fn expect(&mut self, tok: raw::TokenKind, complete: TokenKind) -> Result<(), Error> {
305 if self.next_if_eq(tok) {
306 Ok(())
307 } else if let Some(&raw::Token { kind, offset }) = self.tokens.peek() {
308 let mut tokens = self.tokens.clone();
309 let hi = tokens.nth(1).map_or_else(|| self.len, |t| t.offset);
310 let span = Span { lo: offset, hi };
311 Err(Error::Incomplete(tok, complete, kind, span))
312 } else {
313 let lo = self.len;
314 let span = Span { lo, hi: lo };
315 Err(Error::IncompleteEof(tok, complete, span))
316 }
317 }
318
319 fn cook(&mut self, token: &raw::Token) -> Result<Option<Token>, Error> {
320 let kind = match token.kind {
321 raw::TokenKind::Comment(raw::CommentKind::Normal) | raw::TokenKind::Whitespace => {
322 Ok(None)
323 }
324 raw::TokenKind::Comment(raw::CommentKind::Doc) => Ok(Some(TokenKind::DocComment)),
325 raw::TokenKind::Ident => {
326 let ident = &self.input[(token.offset as usize)..(self.offset() as usize)];
327 Ok(Some(self.ident(ident)))
328 }
329 raw::TokenKind::Number(number) => Ok(Some(number.into())),
330 raw::TokenKind::Single(single) => self.single(single).map(Some),
331 raw::TokenKind::String(raw::StringToken::Normal { terminated: true }) => {
332 Ok(Some(TokenKind::String(StringToken::Normal)))
333 }
334 raw::TokenKind::String(raw::StringToken::Interpolated(start, Some(ending))) => Ok(
335 Some(TokenKind::String(StringToken::Interpolated(start, ending))),
336 ),
337 raw::TokenKind::String(
338 raw::StringToken::Normal { terminated: false }
339 | raw::StringToken::Interpolated(_, None),
340 ) => Err(Error::UnterminatedString(Span {
341 lo: token.offset,
342 hi: token.offset,
343 })),
344 raw::TokenKind::Unknown => {
345 let c = self.input[(token.offset as usize)..]
346 .chars()
347 .next()
348 .expect("token offset should be the start of a character");
349 let span = Span {
350 lo: token.offset,
351 hi: self.offset(),
352 };
353 Err(Error::Unknown(c, span))
354 }
355 }?;
356
357 Ok(kind.map(|kind| {
358 let span = Span {
359 lo: token.offset,
360 hi: self.offset(),
361 };
362 Token { kind, span }
363 }))
364 }
365
366 #[allow(clippy::too_many_lines)]
367 fn single(&mut self, single: Single) -> Result<TokenKind, Error> {
368 match single {
369 Single::Amp => {
370 let op = ClosedBinOp::AmpAmpAmp;
371 self.expect_single(Single::Amp, TokenKind::ClosedBinOp(op))?;
372 self.expect_single(Single::Amp, TokenKind::ClosedBinOp(op))?;
373 Ok(self.closed_bin_op(op))
374 }
375 Single::Apos => {
376 self.expect(raw::TokenKind::Ident, TokenKind::AposIdent)?;
377 Ok(TokenKind::AposIdent)
378 }
379 Single::At => Ok(TokenKind::At),
380 Single::Bang => {
381 if self.next_if_eq_single(Single::Eq) {
382 Ok(TokenKind::Ne)
383 } else {
384 Ok(TokenKind::Bang)
385 }
386 }
387 Single::Bar => {
388 if self.next_if_eq_single(Single::Bar) {
389 let op = ClosedBinOp::BarBarBar;
390 self.expect_single(Single::Bar, TokenKind::ClosedBinOp(op))?;
391 Ok(self.closed_bin_op(op))
392 } else {
393 Ok(TokenKind::Bar)
394 }
395 }
396 Single::Caret => {
397 if self.next_if_eq_single(Single::Caret) {
398 let op = ClosedBinOp::CaretCaretCaret;
399 self.expect_single(Single::Caret, TokenKind::ClosedBinOp(op))?;
400 Ok(self.closed_bin_op(op))
401 } else {
402 Ok(self.closed_bin_op(ClosedBinOp::Caret))
403 }
404 }
405 Single::Close(delim) => Ok(TokenKind::Close(delim)),
406 Single::Colon => {
407 if self.next_if_eq_single(Single::Colon) {
408 Ok(TokenKind::ColonColon)
409 } else {
410 Ok(TokenKind::Colon)
411 }
412 }
413 Single::Comma => Ok(TokenKind::Comma),
414 Single::Dot => {
415 if self.next_if_eq_single(Single::Dot) {
416 if self.next_if_eq_single(Single::Dot) {
417 Ok(TokenKind::DotDotDot)
418 } else {
419 Ok(TokenKind::DotDot)
420 }
421 } else {
422 Ok(TokenKind::Dot)
423 }
424 }
425 Single::Eq => {
426 if self.next_if_eq_single(Single::Eq) {
427 Ok(TokenKind::EqEq)
428 } else if self.next_if_eq_single(Single::Gt) {
429 Ok(TokenKind::FatArrow)
430 } else {
431 Ok(TokenKind::Eq)
432 }
433 }
434 Single::Gt => {
435 if self.next_if_eq_single(Single::Eq) {
436 Ok(TokenKind::Gte)
437 } else if self.next_if_eq_single(Single::Gt) {
438 let op = ClosedBinOp::GtGtGt;
439 self.expect_single(Single::Gt, TokenKind::ClosedBinOp(op))?;
440 Ok(self.closed_bin_op(op))
441 } else {
442 Ok(TokenKind::Gt)
443 }
444 }
445 Single::Lt => {
446 if self.next_if_eq_single(Single::Eq) {
447 Ok(TokenKind::Lte)
448 } else if self.next_if_eq_single(Single::Minus) {
449 Ok(TokenKind::LArrow)
450 } else if self.next_if_eq_single(Single::Lt) {
451 let op = ClosedBinOp::LtLtLt;
452 self.expect_single(Single::Lt, TokenKind::ClosedBinOp(op))?;
453 Ok(self.closed_bin_op(op))
454 } else {
455 Ok(TokenKind::Lt)
456 }
457 }
458 Single::Minus => {
459 if self.next_if_eq_single(Single::Gt) {
460 Ok(TokenKind::RArrow)
461 } else {
462 Ok(self.closed_bin_op(ClosedBinOp::Minus))
463 }
464 }
465 Single::Open(delim) => Ok(TokenKind::Open(delim)),
466 Single::Percent => Ok(self.closed_bin_op(ClosedBinOp::Percent)),
467 Single::Plus => Ok(self.closed_bin_op(ClosedBinOp::Plus)),
468 Single::Question => Ok(TokenKind::Question),
469 Single::Semi => Ok(TokenKind::Semi),
470 Single::Slash => Ok(self.closed_bin_op(ClosedBinOp::Slash)),
471 Single::Star => Ok(self.closed_bin_op(ClosedBinOp::Star)),
472 Single::Tilde => {
473 let complete = TokenKind::TildeTildeTilde;
474 self.expect_single(Single::Tilde, complete)?;
475 self.expect_single(Single::Tilde, complete)?;
476 Ok(complete)
477 }
478 }
479 }
480
481 fn closed_bin_op(&mut self, op: ClosedBinOp) -> TokenKind {
482 if self.next_if_eq_single(Single::Eq) {
483 TokenKind::BinOpEq(op)
484 } else {
485 TokenKind::ClosedBinOp(op)
486 }
487 }
488
489 fn ident(&mut self, ident: &str) -> TokenKind {
490 match ident {
491 "and" => self.closed_bin_op(ClosedBinOp::And),
492 "or" => self.closed_bin_op(ClosedBinOp::Or),
493 "w" if self.next_if_eq_single(Single::Slash) => {
494 if self.next_if_eq_single(Single::Eq) {
495 TokenKind::WSlashEq
496 } else {
497 TokenKind::WSlash
498 }
499 }
500 ident => ident
501 .parse()
502 .map(TokenKind::Keyword)
503 .unwrap_or(TokenKind::Ident),
504 }
505 }
506}
507
508impl Iterator for Lexer<'_> {
509 type Item = Result<Token, Error>;
510
511 fn next(&mut self) -> Option<Self::Item> {
512 while let Some(token) = self.tokens.next() {
513 match self.cook(&token) {
514 Ok(None) => {}
515 Ok(Some(token)) => return Some(Ok(token)),
516 Err(err) => return Some(Err(err)),
517 }
518 }
519
520 None
521 }
522}
523