microsoft/qdk

Public

mirrored from https://github.com/microsoft/qdkAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
v1.1.3

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

compiler/qsc_parse/src/lex/raw.rs

432lines · modecode

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! The first lexing phase transforms an input string into literals, single-character operators,
//! whitespace, and comments. Keywords are treated as identifiers. The raw token stream is
//! contiguous: there are no gaps between tokens.
//!
//! These are "raw" tokens because single-character operators don't always correspond to Q#
//! operators, and whitespace and comments will later be discarded. Raw tokens are the ingredients
//! that are "cooked" into compound tokens before they can be consumed by the parser.
//!
//! Tokens never contain substrings from the original input, but are simply labels that refer back
//! to offsets in the input. Lexing never fails, but may produce unknown tokens.

15#[cfg(test)]
16mod tests;
17
18use super::{Delim, InterpolatedEnding, InterpolatedStart, Radix};
19use enum_iterator::Sequence;
20use std::{
21 fmt::{self, Display, Formatter, Write},
22 iter::Peekable,
23 str::CharIndices,
24};
25
26/// A raw token.
27#[derive(Clone, Debug, Eq, PartialEq)]
28pub(super) struct Token {
29 /// The token kind.
30 pub(super) kind: TokenKind,
31 /// The byte offset of the token starting character.
32 pub(super) offset: u32,
33}
34
35#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
36pub(crate) enum TokenKind {
37 Comment(CommentKind),
38 Ident,
39 Number(Number),
40 Single(Single),
41 String(StringToken),
42 Unknown,
43 Whitespace,
44}
45
46impl Display for TokenKind {
47 fn fmt(&self, f: &mut Formatter) -> fmt::Result {
48 match self {
49 TokenKind::Comment(CommentKind::Normal) => f.write_str("comment"),
50 TokenKind::Comment(CommentKind::Doc) => f.write_str("doc comment"),
51 TokenKind::Ident => f.write_str("identifier"),
52 TokenKind::Number(Number::BigInt(_)) => f.write_str("big integer"),
53 TokenKind::Number(Number::Float) => f.write_str("float"),
54 TokenKind::Number(Number::Int(_)) => f.write_str("integer"),
55 TokenKind::Single(single) => write!(f, "`{single}`"),
56 TokenKind::String(_) => f.write_str("string"),
57 TokenKind::Unknown => f.write_str("unknown"),
58 TokenKind::Whitespace => f.write_str("whitespace"),
59 }
60 }
61}
62
63/// A single-character operator token.
64#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
65pub(crate) enum Single {
66 /// `&`
67 Amp,
68 /// `'`
69 Apos,
70 /// `@`
71 At,
72 /// `!`
73 Bang,
74 /// `|`
75 Bar,
76 /// `^`
77 Caret,
78 /// A closing delimiter.
79 Close(Delim),
80 /// `:`
81 Colon,
82 /// `,`
83 Comma,
84 /// `.`
85 Dot,
86 /// `=`
87 Eq,
88 /// `>`
89 Gt,
90 /// `<`
91 Lt,
92 /// `-`
93 Minus,
94 /// An opening delimiter.
95 Open(Delim),
96 /// `%`
97 Percent,
98 /// `+`
99 Plus,
100 /// `?`
101 Question,
102 /// `;`
103 Semi,
104 /// `/`
105 Slash,
106 /// `*`
107 Star,
108 /// `~`
109 Tilde,
110}
111
112impl Display for Single {
113 fn fmt(&self, f: &mut Formatter) -> fmt::Result {
114 f.write_char(match self {
115 Single::Amp => '&',
116 Single::Apos => '\'',
117 Single::At => '@',
118 Single::Bang => '!',
119 Single::Bar => '|',
120 Single::Caret => '^',
121 Single::Close(Delim::Brace) => '}',
122 Single::Close(Delim::Bracket) => ']',
123 Single::Close(Delim::Paren) => ')',
124 Single::Colon => ':',
125 Single::Comma => ',',
126 Single::Dot => '.',
127 Single::Eq => '=',
128 Single::Gt => '>',
129 Single::Lt => '<',
130 Single::Minus => '-',
131 Single::Open(Delim::Brace) => '{',
132 Single::Open(Delim::Bracket) => '[',
133 Single::Open(Delim::Paren) => '(',
134 Single::Percent => '%',
135 Single::Plus => '+',
136 Single::Question => '?',
137 Single::Semi => ';',
138 Single::Slash => '/',
139 Single::Star => '*',
140 Single::Tilde => '~',
141 })
142 }
143}
144
145#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
146pub(crate) enum Number {
147 BigInt(Radix),
148 Float,
149 Int(Radix),
150}
151
152#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
153pub(crate) enum StringToken {
154 Normal { terminated: bool },
155 Interpolated(InterpolatedStart, Option<InterpolatedEnding>),
156}
157
/// Which family of string is currently being lexed; decides which characters end it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum StringKind {
    /// A plain string, ended only by a double quote.
    Normal,
    /// An interpolated string segment, ended by a double quote or an opening brace.
    Interpolated,
}
163
164#[derive(Clone, Copy, Debug, Eq, PartialEq, Sequence)]
165pub(crate) enum CommentKind {
166 Normal,
167 Doc,
168}
169
170#[derive(Clone)]
171pub(super) struct Lexer<'a> {
172 chars: Peekable<CharIndices<'a>>,
173 interpolation: u8,
174}
175
176impl<'a> Lexer<'a> {
177 pub(super) fn new(input: &'a str) -> Self {
178 Self {
179 chars: input.char_indices().peekable(),
180 interpolation: 0,
181 }
182 }
183
184 fn next_if_eq(&mut self, c: char) -> bool {
185 self.chars.next_if(|i| i.1 == c).is_some()
186 }
187
188 fn eat_while(&mut self, mut f: impl FnMut(char) -> bool) {
189 while self.chars.next_if(|i| f(i.1)).is_some() {}
190 }
191
192 /// Returns the first character ahead of the cursor without consuming it. This operation is fast,
193 /// but if you know you want to consume the character if it matches, use [`next_if_eq`] instead.
194 fn first(&mut self) -> Option<char> {
195 self.chars.peek().map(|i| i.1)
196 }
197
198 /// Returns the second character ahead of the cursor without consuming it. This is slower
199 /// than [`first`] and should be avoided when possible.
200 fn second(&self) -> Option<char> {
201 let mut chars = self.chars.clone();
202 chars.next();
203 chars.next().map(|i| i.1)
204 }
205
206 fn whitespace(&mut self, c: char) -> bool {
207 if c.is_whitespace() {
208 self.eat_while(char::is_whitespace);
209 true
210 } else {
211 false
212 }
213 }
214
215 fn comment(&mut self, c: char) -> Option<CommentKind> {
216 if c == '/' && self.next_if_eq('/') {
217 let kind = if self.first() == Some('/') && self.second() != Some('/') {
218 self.chars.next();
219 CommentKind::Doc
220 } else {
221 CommentKind::Normal
222 };
223
224 self.eat_while(|c| c != '\n');
225 Some(kind)
226 } else {
227 None
228 }
229 }
230
231 fn ident(&mut self, c: char) -> bool {
232 if c == '_' || c.is_alphabetic() {
233 self.eat_while(|c| c == '_' || c.is_alphanumeric());
234 true
235 } else {
236 false
237 }
238 }
239
240 fn number(&mut self, c: char) -> Option<Number> {
241 self.leading_zero(c).or_else(|| self.decimal(c))
242 }
243
244 fn leading_zero(&mut self, c: char) -> Option<Number> {
245 if c != '0' {
246 return None;
247 }
248
249 let radix = if self.next_if_eq('b') {
250 Radix::Binary
251 } else if self.next_if_eq('o') {
252 Radix::Octal
253 } else if self.next_if_eq('x') {
254 Radix::Hexadecimal
255 } else {
256 Radix::Decimal
257 };
258
259 self.eat_while(|c| c == '_' || c.is_digit(radix.into()));
260 if self.next_if_eq('L') {
261 Some(Number::BigInt(radix))
262 } else if radix == Radix::Decimal && self.float() {
263 Some(Number::Float)
264 } else {
265 Some(Number::Int(radix))
266 }
267 }
268
269 fn decimal(&mut self, c: char) -> Option<Number> {
270 if !c.is_ascii_digit() {
271 return None;
272 }
273
274 self.eat_while(|c| c == '_' || c.is_ascii_digit());
275
276 if self.float() {
277 Some(Number::Float)
278 } else if self.next_if_eq('L') {
279 Some(Number::BigInt(Radix::Decimal))
280 } else {
281 Some(Number::Int(Radix::Decimal))
282 }
283 }
284
285 fn float(&mut self) -> bool {
286 // Watch out for ranges: `0..` should be an integer followed by two dots.
287 if self.first() == Some('.') && self.second() != Some('.') {
288 self.chars.next();
289 self.eat_while(|c| c == '_' || c.is_ascii_digit());
290 self.exp();
291 true
292 } else {
293 self.exp()
294 }
295 }
296
297 fn exp(&mut self) -> bool {
298 if self.next_if_eq('e') {
299 self.chars.next_if(|i| i.1 == '+' || i.1 == '-');
300 self.eat_while(|c| c.is_ascii_digit());
301 true
302 } else {
303 false
304 }
305 }
306
307 fn string(&mut self, c: char) -> Option<TokenKind> {
308 let kind = self.start_string(c)?;
309
310 while self
311 .first()
312 .map_or(false, |c| !is_string_terminator(kind, c))
313 {
314 self.eat_while(|c| c != '\\' && !is_string_terminator(kind, c));
315 if self.next_if_eq('\\') {
316 self.chars.next();
317 }
318 }
319
320 Some(TokenKind::String(self.finish_string(c, kind)))
321 }
322
323 fn start_string(&mut self, c: char) -> Option<StringKind> {
324 if c == '$' {
325 if self.next_if_eq('"') {
326 Some(StringKind::Interpolated)
327 } else {
328 None
329 }
330 } else if c == '"' {
331 Some(StringKind::Normal)
332 } else if self.interpolation > 0 && c == '}' {
333 self.interpolation = self
334 .interpolation
335 .checked_sub(1)
336 .expect("interpolation level should have been incremented at left brace");
337 Some(StringKind::Interpolated)
338 } else {
339 None
340 }
341 }
342
343 fn finish_string(&mut self, start: char, kind: StringKind) -> StringToken {
344 match kind {
345 StringKind::Normal => StringToken::Normal {
346 terminated: self.next_if_eq('"'),
347 },
348 StringKind::Interpolated => {
349 let start = if start == '$' {
350 InterpolatedStart::DollarQuote
351 } else {
352 InterpolatedStart::RBrace
353 };
354
355 let end = if self.next_if_eq('{') {
356 self.interpolation = self
357 .interpolation
358 .checked_add(1)
359 .expect("interpolation should not exceed maximum depth");
360 Some(InterpolatedEnding::LBrace)
361 } else if self.next_if_eq('"') {
362 Some(InterpolatedEnding::Quote)
363 } else {
364 None // Unterminated string.
365 };
366
367 StringToken::Interpolated(start, end)
368 }
369 }
370 }
371}
372
373impl Iterator for Lexer<'_> {
374 type Item = Token;
375
376 fn next(&mut self) -> Option<Self::Item> {
377 let (offset, c) = self.chars.next()?;
378 let kind = if let Some(kind) = self.comment(c) {
379 TokenKind::Comment(kind)
380 } else if self.whitespace(c) {
381 TokenKind::Whitespace
382 } else if self.ident(c) {
383 TokenKind::Ident
384 } else {
385 self.number(c)
386 .map(TokenKind::Number)
387 .or_else(|| self.string(c))
388 .or_else(|| single(c).map(TokenKind::Single))
389 .unwrap_or(TokenKind::Unknown)
390 };
391 Some(Token {
392 kind,
393 offset: offset.try_into().expect("offset should fit into u32"),
394 })
395 }
396}
397
398fn single(c: char) -> Option<Single> {
399 match c {
400 '-' => Some(Single::Minus),
401 ',' => Some(Single::Comma),
402 ';' => Some(Single::Semi),
403 ':' => Some(Single::Colon),
404 '!' => Some(Single::Bang),
405 '?' => Some(Single::Question),
406 '.' => Some(Single::Dot),
407 '\'' => Some(Single::Apos),
408 '(' => Some(Single::Open(Delim::Paren)),
409 ')' => Some(Single::Close(Delim::Paren)),
410 '[' => Some(Single::Open(Delim::Bracket)),
411 ']' => Some(Single::Close(Delim::Bracket)),
412 '{' => Some(Single::Open(Delim::Brace)),
413 '}' => Some(Single::Close(Delim::Brace)),
414 '@' => Some(Single::At),
415 '*' => Some(Single::Star),
416 '/' => Some(Single::Slash),
417 '&' => Some(Single::Amp),
418 '%' => Some(Single::Percent),
419 '^' => Some(Single::Caret),
420 '+' => Some(Single::Plus),
421 '<' => Some(Single::Lt),
422 '=' => Some(Single::Eq),
423 '>' => Some(Single::Gt),
424 '|' => Some(Single::Bar),
425 '~' => Some(Single::Tilde),
426 _ => None,
427 }
428}
429
430fn is_string_terminator(kind: StringKind, c: char) -> bool {
431 c == '"' || kind == StringKind::Interpolated && c == '{'
432}