risingwave_sqlparser/
tokenizer.rs

1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4//
5//     http://www.apache.org/licenses/LICENSE-2.0
6//
7// Unless required by applicable law or agreed to in writing, software
8// distributed under the License is distributed on an "AS IS" BASIS,
9// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10// See the License for the specific language governing permissions and
11// limitations under the License.
12
13//! SQL Tokenizer
14//!
15//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
16//!
17//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
18
19#[cfg(not(feature = "std"))]
20use alloc::{
21    borrow::ToOwned,
22    format,
23    string::{String, ToString},
24    vec,
25    vec::Vec,
26};
27use core::fmt;
28use core::fmt::Debug;
29use core::iter::Peekable;
30use core::str::Chars;
31
32#[cfg(feature = "serde")]
33use serde::{Deserialize, Serialize};
34
35use crate::ast::{CstyleEscapedString, DollarQuotedString};
36use crate::keywords::{ALL_KEYWORDS, ALL_KEYWORDS_INDEX, Keyword};
37
38/// SQL Token enumeration
39#[derive(Debug, Clone, PartialEq, Eq, Hash)]
40#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
41pub enum Token {
42    /// An end-of-file marker, not a real token
43    EOF,
44    /// A keyword (like SELECT) or an optionally quoted SQL identifier
45    Word(Word),
46    /// An unsigned numeric literal
47    Number(String),
48    /// A character that could not be tokenized
49    Char(char),
50    /// Single quoted string: i.e: 'string'
51    SingleQuotedString(String),
52    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
53    DollarQuotedString(DollarQuotedString),
54    /// Single quoted string with c-style escapes: i.e: E'string'
55    CstyleEscapesString(CstyleEscapedString),
56    /// "National" string literal: i.e: N'string'
57    NationalStringLiteral(String),
58    /// Hexadecimal string literal: i.e.: X'deadbeef'
59    HexStringLiteral(String),
60    /// Parameter symbols: i.e:  $1, $2
61    Parameter(String),
62    /// Comma
63    Comma,
64    /// Whitespace (space, tab, etc)
65    Whitespace(Whitespace),
66    /// Custom Operator
67    Op(String),
68    /// Equality operator `=`
69    Eq,
70    /// Not Equals operator `<>` (or `!=` in some dialects)
71    Neq,
72    /// Less Than operator `<`
73    Lt,
74    /// Greater Than operator `>`
75    Gt,
76    /// Less Than Or Equals operator `<=`
77    LtEq,
78    /// Greater Than Or Equals operator `>=`
79    GtEq,
80    /// Plus operator `+`
81    Plus,
82    /// Minus operator `-`
83    Minus,
84    /// Multiplication operator `*`
85    Mul,
86    /// Division operator `/`
87    Div,
88    /// Modulo Operator `%`
89    Mod,
90    /// Left parenthesis `(`
91    LParen,
92    /// Right parenthesis `)`
93    RParen,
94    /// Period (used for compound identifiers or projections into nested types)
95    Period,
96    /// Colon `:`
97    Colon,
98    /// DoubleColon `::` (used for casting in postgresql)
99    DoubleColon,
100    /// SemiColon `;` used as separator for COPY and payload
101    SemiColon,
102    /// Backslash `\` used in terminating the COPY payload with `\.`
103    Backslash,
104    /// Left bracket `[`
105    LBracket,
106    /// Right bracket `]`
107    RBracket,
108    /// Pipe `|`
109    Pipe,
110    /// Caret `^`
111    Caret,
112    /// Left brace `{`
113    LBrace,
114    /// Right brace `}`
115    RBrace,
116    /// Right Arrow `=>`
117    RArrow,
118}
119
120impl fmt::Display for Token {
121    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
122        match self {
123            Token::EOF => f.write_str("EOF"),
124            Token::Word(w) => write!(f, "{}", w),
125            Token::Number(n) => write!(f, "{}", n),
126            Token::Char(c) => write!(f, "{}", c),
127            Token::SingleQuotedString(s) => write!(f, "'{}'", s),
128            Token::DollarQuotedString(s) => write!(f, "{}", s),
129            Token::NationalStringLiteral(s) => write!(f, "N'{}'", s),
130            Token::HexStringLiteral(s) => write!(f, "X'{}'", s),
131            Token::CstyleEscapesString(s) => write!(f, "E'{}'", s),
132            Token::Parameter(s) => write!(f, "${}", s),
133            Token::Comma => f.write_str(","),
134            Token::Whitespace(ws) => write!(f, "{}", ws),
135            Token::Op(op) => write!(f, "{}", op),
136            Token::Eq => f.write_str("="),
137            Token::Neq => f.write_str("<>"),
138            Token::Lt => f.write_str("<"),
139            Token::Gt => f.write_str(">"),
140            Token::LtEq => f.write_str("<="),
141            Token::GtEq => f.write_str(">="),
142            Token::Plus => f.write_str("+"),
143            Token::Minus => f.write_str("-"),
144            Token::Mul => f.write_str("*"),
145            Token::Div => f.write_str("/"),
146            Token::Mod => f.write_str("%"),
147            Token::LParen => f.write_str("("),
148            Token::RParen => f.write_str(")"),
149            Token::Period => f.write_str("."),
150            Token::Colon => f.write_str(":"),
151            Token::DoubleColon => f.write_str("::"),
152            Token::SemiColon => f.write_str(";"),
153            Token::Backslash => f.write_str("\\"),
154            Token::LBracket => f.write_str("["),
155            Token::RBracket => f.write_str("]"),
156            Token::Caret => f.write_str("^"),
157            Token::Pipe => f.write_str("|"),
158            Token::LBrace => f.write_str("{"),
159            Token::RBrace => f.write_str("}"),
160            Token::RArrow => f.write_str("=>"),
161        }
162    }
163}
164
165impl Token {
166    pub fn make_keyword(keyword: &str) -> Self {
167        Token::make_word(keyword, None)
168    }
169
170    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
171        let word_uppercase = word.to_uppercase();
172        Token::Word(Word {
173            value: word.to_owned(),
174            quote_style,
175            keyword: if quote_style.is_none() {
176                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
177                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
178            } else {
179                Keyword::NoKeyword
180            },
181        })
182    }
183
184    pub fn with_location(self, location: Location) -> TokenWithLocation {
185        TokenWithLocation::new(self, location.line, location.column)
186    }
187}
188
189/// A keyword (like SELECT) or an optionally quoted SQL identifier
190#[derive(Debug, Clone, PartialEq, Eq, Hash)]
191#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
192pub struct Word {
193    /// The value of the token, without the enclosing quotes, and with the
194    /// escape sequences (if any) processed (TODO: escapes are not handled)
195    pub value: String,
196    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
197    /// The standard and most implementations allow using double quotes for this,
198    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
199    pub quote_style: Option<char>,
200    /// If the word was not quoted and it matched one of the known keywords,
201    /// this will have one of the values from dialect::keywords, otherwise empty
202    pub keyword: Keyword,
203}
204
205impl fmt::Display for Word {
206    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
207        match self.quote_style {
208            Some(s) if s == '"' || s == '[' || s == '`' => {
209                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
210            }
211            None => f.write_str(&self.value),
212            _ => panic!("Unexpected quote_style!"),
213        }
214    }
215}
216
217impl Word {
218    fn matching_end_quote(ch: char) -> char {
219        match ch {
220            '"' => '"', // ANSI and most dialects
221            '[' => ']', // MS SQL
222            '`' => '`', // MySQL
223            _ => panic!("unexpected quoting style!"),
224        }
225    }
226}
227
228#[derive(Debug, Clone, PartialEq, Eq, Hash)]
229#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
230pub enum Whitespace {
231    Space,
232    Newline,
233    Tab,
234    SingleLineComment { comment: String, prefix: String },
235    MultiLineComment(String),
236}
237
238impl fmt::Display for Whitespace {
239    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
240        match self {
241            Whitespace::Space => f.write_str(" "),
242            Whitespace::Newline => f.write_str("\n"),
243            Whitespace::Tab => f.write_str("\t"),
244            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
245            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
246        }
247    }
248}
249
250/// Location in input string
251#[derive(Debug, Eq, PartialEq, Clone)]
252pub struct Location {
253    /// Line number, starting from 1
254    pub line: u64,
255    /// Line column, starting from 1
256    pub column: u64,
257}
258
259/// A [Token] with [Location] attached to it
260#[derive(Debug, Eq, PartialEq, Clone)]
261pub struct TokenWithLocation {
262    pub token: Token,
263    pub location: Location,
264}
265
266impl TokenWithLocation {
267    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
268        TokenWithLocation {
269            token,
270            location: Location { line, column },
271        }
272    }
273
274    pub fn eof() -> TokenWithLocation {
275        TokenWithLocation::new(Token::EOF, 0, 0)
276    }
277}
278
279impl PartialEq<Token> for TokenWithLocation {
280    fn eq(&self, other: &Token) -> bool {
281        &self.token == other
282    }
283}
284
285impl PartialEq<TokenWithLocation> for Token {
286    fn eq(&self, other: &TokenWithLocation) -> bool {
287        self == &other.token
288    }
289}
290
291impl fmt::Display for TokenWithLocation {
292    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
293        if self.token == Token::EOF {
294            write!(f, "end of input")
295        } else {
296            write!(
297                f,
298                "{} at line {}, column {}",
299                self.token, self.location.line, self.location.column
300            )
301        }
302    }
303}
304
305/// Tokenizer error
306#[derive(Debug, PartialEq)]
307pub struct TokenizerError {
308    pub message: String,
309    pub line: u64,
310    pub col: u64,
311    pub context: String,
312}
313
314impl fmt::Display for TokenizerError {
315    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
316        write!(
317            f,
318            "{} at line {}, column {}\n{}",
319            self.message, self.line, self.col, self.context
320        )
321    }
322}
323
324#[cfg(feature = "std")]
325impl std::error::Error for TokenizerError {}
326
327/// SQL Tokenizer
328#[derive(Clone)]
329pub struct Tokenizer<'a> {
330    sql: &'a str,
331    chars: Peekable<Chars<'a>>,
332    line: u64,
333    col: u64,
334}
335
336impl<'a> Tokenizer<'a> {
337    /// Create a new SQL tokenizer for the specified SQL statement
338    pub fn new(query: &'a str) -> Self {
339        Self {
340            sql: query,
341            chars: query.chars().peekable(),
342            line: 1,
343            col: 1,
344        }
345    }
346
347    /// Consume the next character.
348    fn next(&mut self) -> Option<char> {
349        let ch = self.chars.next();
350        if let Some(ch) = ch {
351            match ch {
352                '\n' => {
353                    self.line += 1;
354                    self.col = 1;
355                }
356                '\t' => self.col += 4,
357                _ => self.col += 1,
358            }
359        }
360        ch
361    }
362
363    /// Return the next character without consuming it.
364    fn peek(&mut self) -> Option<char> {
365        self.chars.peek().cloned()
366    }
367
368    /// Tokenize the statement and produce a vector of tokens with locations.
369    ///
370    /// Whitespaces are skipped.
371    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
372        let tokens = self.tokenize()?;
373        Ok(tokens
374            .into_iter()
375            .filter(|token| !matches!(&token.token, Token::Whitespace(_)))
376            .collect())
377    }
378
379    /// Tokenize the statement and produce a vector of tokens.
380    ///
381    /// Whitespaces are included.
382    #[allow(dead_code)]
383    fn tokenize_with_whitespace(&mut self) -> Result<Vec<Token>, TokenizerError> {
384        let tokens = self.tokenize()?;
385        Ok(tokens.into_iter().map(|t| t.token).collect())
386    }
387
388    /// Tokenize the statement and produce a vector of tokens.
389    ///
390    /// Whitespaces are included.
391    fn tokenize(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
392        let mut tokens = Vec::new();
393        while let Some(token) = self.next_token_with_location()? {
394            tokens.push(token);
395        }
396        Ok(tokens)
397    }
398
399    /// Get the next token or return None
400    fn next_token_with_location(&mut self) -> Result<Option<TokenWithLocation>, TokenizerError> {
401        let loc = Location {
402            line: self.line,
403            column: self.col,
404        };
405        self.next_token()
406            .map(|t| t.map(|token| token.with_location(loc)))
407    }
408
409    /// Get the next token or return None
410    fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
411        macro_rules! op_chars {
412            // https://www.postgresql.org/docs/17/sql-syntax-lexical.html#SQL-SYNTAX-OPERATORS
413            (all as_pat) => {
414                '+' | '-' | '*' | '/' | '<' | '>' | '=' | op_chars!(ext as_pat)
415            };
416            (ext $m:ident) => {
417                op_chars!($m '~' '!' '@' '#' '%' '^' '&' '|' '`' '?')
418            };
419            (as_arr $($c:literal)+) => {
420                [ $($c),+ ]
421            };
422            (as_pat $($c:literal)+) => {
423                $($c)|+
424            };
425        }
426
427        match self.peek() {
428            Some(ch) => match ch {
429                ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
430                '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
431                '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
432                '\r' => {
433                    // Emit a single Whitespace::Newline token for \r and \r\n
434                    self.next();
435                    if let Some('\n') = self.peek() {
436                        self.next();
437                    }
438                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
439                }
440                'N' => {
441                    self.next(); // consume, to check the next char
442                    match self.peek() {
443                        Some('\'') => {
444                            // N'...' - a <national character string literal>
445                            let s = self.tokenize_single_quoted_string()?;
446                            Ok(Some(Token::NationalStringLiteral(s)))
447                        }
448                        _ => {
449                            // regular identifier starting with an "N"
450                            let s = self.tokenize_word('N');
451                            Ok(Some(Token::make_word(&s, None)))
452                        }
453                    }
454                }
455                x @ 'e' | x @ 'E' => {
456                    self.next(); // consume, to check the next char
457                    match self.peek() {
458                        Some('\'') => {
459                            // E'...' - a <character string literal>
460                            let s = self.tokenize_single_quoted_string_with_escape()?;
461                            Ok(Some(Token::CstyleEscapesString(s)))
462                        }
463                        _ => {
464                            // regular identifier starting with an "E"
465                            let s = self.tokenize_word(x);
466                            Ok(Some(Token::make_word(&s, None)))
467                        }
468                    }
469                }
470                // The spec only allows an uppercase 'X' to introduce a hex
471                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
472                x @ 'x' | x @ 'X' => {
473                    self.next(); // consume, to check the next char
474                    match self.peek() {
475                        Some('\'') => {
476                            // X'...' - a <binary string literal>
477                            let s = self.tokenize_single_quoted_string()?;
478                            Ok(Some(Token::HexStringLiteral(s)))
479                        }
480                        _ => {
481                            // regular identifier starting with an "X"
482                            let s = self.tokenize_word(x);
483                            Ok(Some(Token::make_word(&s, None)))
484                        }
485                    }
486                }
487                // identifier or keyword
488                ch if is_identifier_start(ch) => {
489                    self.next(); // consume the first char
490                    let s = self.tokenize_word(ch);
491
492                    Ok(Some(Token::make_word(&s, None)))
493                }
494                // string
495                '\'' => {
496                    let s = self.tokenize_single_quoted_string()?;
497
498                    Ok(Some(Token::SingleQuotedString(s)))
499                }
500                // delimited (quoted) identifier
501                quote_start if is_delimited_identifier_start(quote_start) => {
502                    self.next(); // consume the opening quote
503                    let quote_end = Word::matching_end_quote(quote_start);
504                    let s = self.peeking_take_while(|ch| ch != quote_end);
505                    if self.next() == Some(quote_end) {
506                        Ok(Some(Token::make_word(&s, Some(quote_start))))
507                    } else {
508                        self.error(format!(
509                            "Expected close delimiter '{}' before EOF.",
510                            quote_end
511                        ))
512                    }
513                }
514                // numbers and period
515                '0'..='9' | '.' => {
516                    let mut s = self.peeking_take_while(|ch| ch.is_ascii_digit());
517
518                    // match binary literal that starts with 0x
519                    if s == "0"
520                        && let Some(radix) = self.peek()
521                        && "xob".contains(radix.to_ascii_lowercase())
522                    {
523                        self.next();
524                        let radix = radix.to_ascii_lowercase();
525                        let base = match radix {
526                            'x' => 16,
527                            'o' => 8,
528                            'b' => 2,
529                            _ => unreachable!(),
530                        };
531                        let s2 = self.peeking_take_while(|ch| ch.is_digit(base));
532                        if s2.is_empty() {
533                            return self.error("incomplete integer literal");
534                        }
535                        self.reject_number_junk()?;
536                        return Ok(Some(Token::Number(format!("0{radix}{s2}"))));
537                    }
538
539                    // match one period
540                    if let Some('.') = self.peek() {
541                        s.push('.');
542                        self.next();
543                    }
544                    s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
545
546                    // No number -> Token::Period
547                    if s == "." {
548                        return Ok(Some(Token::Period));
549                    }
550
551                    match self.peek() {
552                        // Number is a scientific number (1e6)
553                        Some('e') | Some('E') => {
554                            s.push('e');
555                            self.next();
556
557                            if let Some('-') = self.peek() {
558                                s.push('-');
559                                self.next();
560                            }
561                            s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
562                            self.reject_number_junk()?;
563                            return Ok(Some(Token::Number(s)));
564                        }
565                        // Not a scientific number
566                        _ => {}
567                    };
568                    self.reject_number_junk()?;
569                    Ok(Some(Token::Number(s)))
570                }
571                // punctuation
572                '(' => self.consume_and_return(Token::LParen),
573                ')' => self.consume_and_return(Token::RParen),
574                ',' => self.consume_and_return(Token::Comma),
575                ':' => {
576                    self.next();
577                    match self.peek() {
578                        Some(':') => self.consume_and_return(Token::DoubleColon),
579                        _ => Ok(Some(Token::Colon)),
580                    }
581                }
582                '$' => Ok(Some(self.tokenize_dollar_preceded_value()?)),
583                ';' => self.consume_and_return(Token::SemiColon),
584                '\\' => self.consume_and_return(Token::Backslash),
585                '[' => self.consume_and_return(Token::LBracket),
586                ']' => self.consume_and_return(Token::RBracket),
587                '{' => self.consume_and_return(Token::LBrace),
588                '}' => self.consume_and_return(Token::RBrace),
589                // operators
590                op_chars!(all as_pat) => {
591                    let mut trial = self.clone();
592                    let op_taken = trial.peeking_take_while(|c| matches!(c, op_chars!(all as_pat)));
593                    // It is safe to assume byte index is char index in `op_token` below.
594
595                    // https://www.postgresql.org/docs/17/sql-syntax-lexical.html#SQL-SYNTAX-OPERATORS
596                    // https://github.com/postgres/postgres/blob/REL_17_4/src/backend/parser/scan.l#L900-L1006
597                    let slash_star = op_taken.find("/*");
598                    let dash_dash = op_taken.find("--");
599                    let pos = match (slash_star, dash_dash) {
600                        (Some(s), Some(d)) => s.min(d),
601                        (Some(s), None) => s,
602                        (None, Some(d)) => d,
603                        (None, None) => op_taken.len(),
604                    };
605                    let mut op = &op_taken[..pos];
606                    if op.is_empty() {
607                        match self.next() {
608                            Some('-') => {
609                                self.next(); // consume the second '-', starting a single-line comment
610                                let comment = self.tokenize_single_line_comment();
611
612                                return Ok(Some(Token::Whitespace(
613                                    Whitespace::SingleLineComment {
614                                        prefix: "--".to_owned(),
615                                        comment,
616                                    },
617                                )));
618                            }
619                            Some('/') => {
620                                self.next(); // consume the '*', starting a multi-line comment
621                                return self.tokenize_multiline_comment();
622                            }
623                            _ => unreachable!(),
624                        }
625                    };
626                    if op.len() > 1
627                        && op.ends_with(['+', '-'])
628                        && !op.contains(op_chars!(ext as_arr))
629                    {
630                        op = op.trim_end_matches(['+', '-']);
631                        if op.is_empty() {
632                            op = &op_taken[..1];
633                        }
634                    }
635                    if op.len() == op_taken.len() {
636                        *self = trial;
637                    } else {
638                        for _ in op.chars() {
639                            self.next();
640                        }
641                    }
642                    match op {
643                        // https://github.com/postgres/postgres/blob/REL_17_4/src/backend/parser/scan.l#L965-L973
644                        "+" => Ok(Some(Token::Plus)),
645                        "-" => Ok(Some(Token::Minus)),
646                        "*" => Ok(Some(Token::Mul)),
647                        "/" => Ok(Some(Token::Div)),
648                        "%" => Ok(Some(Token::Mod)),
649                        "^" => Ok(Some(Token::Caret)),
650                        "<" => Ok(Some(Token::Lt)),
651                        ">" => Ok(Some(Token::Gt)),
652                        "=" => Ok(Some(Token::Eq)),
653                        // https://github.com/postgres/postgres/blob/REL_17_4/src/backend/parser/scan.l#L974-L992
654                        "=>" => Ok(Some(Token::RArrow)),
655                        "<=" => Ok(Some(Token::LtEq)),
656                        ">=" => Ok(Some(Token::GtEq)),
657                        "<>" => Ok(Some(Token::Neq)),
658                        "!=" => Ok(Some(Token::Neq)),
659                        // Our support of `Expr::LambdaFunction` makes us PostgreSQL-incompatible here.
660                        //     foo(bar, | x | x)
661                        // In PostgreSQL, this is unary operator `|` applied to `x`, then bitwise-or `x`.
662                        // In our dialect, this is a lambda function - the identity function.
663                        "|" => Ok(Some(Token::Pipe)),
664                        _ => Ok(Some(Token::Op(op.to_owned()))),
665                    }
666                }
667                other => self.consume_and_return(Token::Char(other)),
668            },
669            None => Ok(None),
670        }
671    }
672
673    /// Tokenize dollar preceded value (i.e: a string/placeholder)
674    fn tokenize_dollar_preceded_value(&mut self) -> Result<Token, TokenizerError> {
675        let mut s = String::new();
676        let mut value = String::new();
677
678        self.next();
679
680        if let Some('$') = self.peek() {
681            self.next();
682
683            let mut is_terminated = false;
684            let mut prev: Option<char> = None;
685
686            while let Some(ch) = self.peek() {
687                if prev == Some('$') {
688                    if ch == '$' {
689                        self.next();
690                        is_terminated = true;
691                        break;
692                    } else {
693                        s.push('$');
694                        s.push(ch);
695                    }
696                } else if ch != '$' {
697                    s.push(ch);
698                }
699
700                prev = Some(ch);
701                self.next();
702            }
703
704            return if self.peek().is_none() && !is_terminated {
705                self.error("Unterminated dollar-quoted string")
706            } else {
707                Ok(Token::DollarQuotedString(DollarQuotedString {
708                    value: s,
709                    tag: None,
710                }))
711            };
712        } else {
713            value.push_str(&self.peeking_take_while(|ch| ch.is_alphanumeric() || ch == '_'));
714
715            if let Some('$') = self.peek() {
716                self.next();
717                s.push_str(&self.peeking_take_while(|ch| ch != '$'));
718
719                match self.peek() {
720                    Some('$') => {
721                        self.next();
722                        for c in value.chars() {
723                            let next_char = self.next();
724                            if Some(c) != next_char {
725                                return self.error(format!(
726                                    "Unterminated dollar-quoted string at or near \"{}\"",
727                                    value
728                                ));
729                            }
730                        }
731
732                        if let Some('$') = self.peek() {
733                            self.next();
734                        } else {
735                            return self.error("Unterminated dollar-quoted string, expected $");
736                        }
737                    }
738                    _ => {
739                        return self.error("Unterminated dollar-quoted, expected $");
740                    }
741                }
742            } else {
743                return Ok(Token::Parameter(value));
744            }
745        }
746
747        Ok(Token::DollarQuotedString(DollarQuotedString {
748            value: s,
749            tag: if value.is_empty() { None } else { Some(value) },
750        }))
751    }
752
753    fn error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
754        let prefix = format!("LINE {}: ", self.line);
755        let sql_line = self.sql.split('\n').nth(self.line as usize - 1).unwrap();
756        let cursor = " ".repeat(prefix.len() + self.col as usize - 1);
757        let context = format!("{}{}\n{}^", prefix, sql_line, cursor);
758        Err(TokenizerError {
759            message: message.into(),
760            col: self.col,
761            line: self.line,
762            context,
763        })
764    }
765
766    fn reject_number_junk(&mut self) -> Result<(), TokenizerError> {
767        if let Some(ch) = self.peek()
768            && is_identifier_start(ch)
769        {
770            return self.error("trailing junk after numeric literal");
771        }
772        Ok(())
773    }
774
775    // Consume characters until newline
776    fn tokenize_single_line_comment(&mut self) -> String {
777        let mut comment = self.peeking_take_while(|ch| ch != '\n');
778        if let Some(ch) = self.next() {
779            assert_eq!(ch, '\n');
780            comment.push(ch);
781        }
782        comment
783    }
784
785    /// Tokenize an identifier or keyword, after the first char is already consumed.
786    fn tokenize_word(&mut self, first_char: char) -> String {
787        let mut s = first_char.to_string();
788        s.push_str(&self.peeking_take_while(is_identifier_part));
789        s
790    }
791
792    /// Read a single quoted string, starting with the opening quote.
793    fn tokenize_single_quoted_string(&mut self) -> Result<String, TokenizerError> {
794        let mut s = String::new();
795        self.next(); // consume the opening quote
796
797        // slash escaping is specific to MySQL dialect
798        let mut is_escaped = false;
799        while let Some(ch) = self.peek() {
800            match ch {
801                '\'' => {
802                    self.next(); // consume
803                    if is_escaped {
804                        s.push(ch);
805                        is_escaped = false;
806                    } else if self.peek().map(|c| c == '\'').unwrap_or(false) {
807                        s.push(ch);
808                        self.next();
809                    } else {
810                        return Ok(s);
811                    }
812                }
813                '\\' => {
814                    s.push(ch);
815                    self.next();
816                }
817                _ => {
818                    self.next(); // consume
819                    s.push(ch);
820                }
821            }
822        }
823        self.error("Unterminated string literal")
824    }
825
826    /// Read a single qutoed string with escape
827    fn tokenize_single_quoted_string_with_escape(
828        &mut self,
829    ) -> Result<CstyleEscapedString, TokenizerError> {
830        let mut terminated = false;
831        let mut s = String::new();
832        self.next(); // consume the opening quote
833
834        while let Some(ch) = self.peek() {
835            match ch {
836                '\'' => {
837                    self.next(); // consume
838                    if self.peek().map(|c| c == '\'').unwrap_or(false) {
839                        s.push('\\');
840                        s.push(ch);
841                        self.next();
842                    } else {
843                        terminated = true;
844                        break;
845                    }
846                }
847                '\\' => {
848                    s.push(ch);
849                    self.next();
850                    if self.peek().map(|c| c == '\'' || c == '\\').unwrap_or(false) {
851                        s.push(self.next().unwrap());
852                    }
853                }
854                _ => {
855                    self.next(); // consume
856                    s.push(ch);
857                }
858            }
859        }
860
861        if !terminated {
862            return self.error("Unterminated string literal");
863        }
864
865        let unescaped = match Self::unescape_c_style(&s) {
866            Ok(unescaped) => unescaped,
867            Err(e) => return self.error(e),
868        };
869
870        Ok(CstyleEscapedString {
871            value: unescaped,
872            raw: s,
873        })
874    }
875
876    /// Helper function used to convert string with c-style escapes into a normal string
877    /// e.g. 'hello\x3fworld' -> 'hello?world'
878    ///
879    /// Detail of c-style escapes refer from:
880    /// <https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE:~:text=4.1.2.2.%C2%A0String%20Constants%20With%20C%2DStyle%20Escapes>
881    fn unescape_c_style(s: &str) -> Result<String, String> {
882        fn hex_byte_process(
883            chars: &mut Peekable<Chars<'_>>,
884            res: &mut String,
885            len: usize,
886            default_char: char,
887        ) -> Result<(), String> {
888            let mut unicode_seq: String = String::with_capacity(len);
889            for _ in 0..len {
890                if let Some(c) = chars.peek()
891                    && c.is_ascii_hexdigit()
892                {
893                    unicode_seq.push(chars.next().unwrap());
894                } else {
895                    break;
896                }
897            }
898
899            if unicode_seq.is_empty() && len == 2 {
900                res.push(default_char);
901                return Ok(());
902            } else if unicode_seq.len() < len && len != 2 {
903                return Err("invalid unicode sequence: must be \\uXXXX or \\UXXXXXXXX".to_owned());
904            }
905
906            if len == 2 {
907                let number = [u8::from_str_radix(&unicode_seq, 16)
908                    .map_err(|e| format!("invalid unicode sequence: {}", e))?];
909
910                res.push(
911                    std::str::from_utf8(&number)
912                        .map_err(|err| format!("invalid unicode sequence: {}", err))?
913                        .chars()
914                        .next()
915                        .unwrap(),
916                );
917            } else {
918                let number = u32::from_str_radix(&unicode_seq, 16)
919                    .map_err(|e| format!("invalid unicode sequence: {}", e))?;
920                res.push(
921                    char::from_u32(number)
922                        .ok_or_else(|| format!("invalid unicode sequence: {}", unicode_seq))?,
923                );
924            }
925            Ok(())
926        }
927
928        fn octal_byte_process(
929            chars: &mut Peekable<Chars<'_>>,
930            res: &mut String,
931            digit: char,
932        ) -> Result<(), String> {
933            let mut unicode_seq: String = String::with_capacity(3);
934            unicode_seq.push(digit);
935            for _ in 0..2 {
936                if let Some(c) = chars.peek()
937                    && matches!(*c, '0'..='7')
938                {
939                    unicode_seq.push(chars.next().unwrap());
940                } else {
941                    break;
942                }
943            }
944
945            let number = [u8::from_str_radix(&unicode_seq, 8)
946                .map_err(|e| format!("invalid unicode sequence: {}", e))?];
947
948            res.push(
949                std::str::from_utf8(&number)
950                    .map_err(|err| format!("invalid unicode sequence: {}", err))?
951                    .chars()
952                    .next()
953                    .unwrap(),
954            );
955            Ok(())
956        }
957
958        let mut chars = s.chars().peekable();
959        let mut res = String::with_capacity(s.len());
960
961        while let Some(c) = chars.next() {
962            if c == '\\' {
963                match chars.next() {
964                    None => {
965                        return Err("unterminated escape sequence".to_owned());
966                    }
967                    Some(next_c) => match next_c {
968                        'b' => res.push('\u{08}'),
969                        'f' => res.push('\u{0C}'),
970                        'n' => res.push('\n'),
971                        'r' => res.push('\r'),
972                        't' => res.push('\t'),
973                        'x' => hex_byte_process(&mut chars, &mut res, 2, 'x')?,
974                        'u' => hex_byte_process(&mut chars, &mut res, 4, 'u')?,
975                        'U' => hex_byte_process(&mut chars, &mut res, 8, 'U')?,
976                        digit @ '0'..='7' => octal_byte_process(&mut chars, &mut res, digit)?,
977                        _ => res.push(next_c),
978                    },
979                }
980            } else {
981                res.push(c);
982            }
983        }
984
985        Ok(res)
986    }
987
988    fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
989        let mut s = String::new();
990
991        let mut nested = 1;
992        let mut last_ch = ' ';
993
994        loop {
995            match self.next() {
996                Some(ch) => {
997                    if last_ch == '/' && ch == '*' {
998                        nested += 1;
999                    } else if last_ch == '*' && ch == '/' {
1000                        nested -= 1;
1001                        if nested == 0 {
1002                            s.pop();
1003                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
1004                        }
1005                    }
1006                    s.push(ch);
1007                    last_ch = ch;
1008                }
1009                None => break self.error("Unexpected EOF while in a multi-line comment"),
1010            }
1011        }
1012    }
1013
1014    #[allow(clippy::unnecessary_wraps)]
1015    fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
1016        self.next();
1017        Ok(Some(t))
1018    }
1019
1020    /// Read from `self` until `predicate` returns `false` or EOF is hit.
1021    /// Return the characters read as String, and keep the first non-matching
1022    /// char available as `self.next()`.
1023    fn peeking_take_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> String {
1024        let mut s = String::new();
1025        while let Some(ch) = self.peek() {
1026            if predicate(ch) {
1027                self.next(); // consume
1028                s.push(ch);
1029            } else {
1030                break;
1031            }
1032        }
1033        s
1034    }
1035}
1036
1037/// Determine if a character starts a quoted identifier. The default
1038/// implementation, accepting "double quoted" ids is both ANSI-compliant
1039/// and appropriate for most dialects (with the notable exception of
1040/// MySQL, MS SQL, and sqlite). You can accept one of characters listed
1041/// in `Word::matching_end_quote` here
1042fn is_delimited_identifier_start(ch: char) -> bool {
1043    ch == '"'
1044}
1045
1046/// Determine if a character is a valid start character for an unquoted identifier
1047fn is_identifier_start(ch: char) -> bool {
1048    // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
1049    // We don't yet support identifiers beginning with "letters with
1050    // diacritical marks and non-Latin letters"
1051    ch.is_ascii_alphabetic() || ch == '_'
1052}
1053
1054/// Determine if a character is a valid unquoted identifier character
1055fn is_identifier_part(ch: char) -> bool {
1056    ch.is_ascii_alphanumeric() || ch == '$' || ch == '_'
1057}
1058
1059#[cfg(test)]
1060mod tests {
1061    use super::*;
1062
1063    #[test]
1064    fn tokenizer_error_impl() {
1065        let err = TokenizerError {
1066            message: "test".into(),
1067            line: 1,
1068            col: 1,
1069            context: "LINE 1:".to_owned(),
1070        };
1071        #[cfg(feature = "std")]
1072        {
1073            use std::error::Error;
1074            assert!(err.source().is_none());
1075        }
1076        assert_eq!(err.to_string(), "test at line 1, column 1\nLINE 1:");
1077    }
1078
1079    #[test]
1080    fn tokenize_select_1() {
1081        let sql = String::from("SELECT 1");
1082        let mut tokenizer = Tokenizer::new(&sql);
1083        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1084
1085        let expected = vec![
1086            Token::make_keyword("SELECT"),
1087            Token::Whitespace(Whitespace::Space),
1088            Token::Number(String::from("1")),
1089        ];
1090
1091        compare(expected, tokens);
1092    }
1093
1094    #[test]
1095    fn tokenize_select_float() {
1096        let sql = String::from("SELECT .1");
1097        let mut tokenizer = Tokenizer::new(&sql);
1098        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1099
1100        let expected = vec![
1101            Token::make_keyword("SELECT"),
1102            Token::Whitespace(Whitespace::Space),
1103            Token::Number(String::from(".1")),
1104        ];
1105
1106        compare(expected, tokens);
1107    }
1108
1109    #[test]
1110    fn tokenize_scalar_function() {
1111        let sql = String::from("SELECT sqrt(1)");
1112        let mut tokenizer = Tokenizer::new(&sql);
1113        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1114
1115        let expected = vec![
1116            Token::make_keyword("SELECT"),
1117            Token::Whitespace(Whitespace::Space),
1118            Token::make_word("sqrt", None),
1119            Token::LParen,
1120            Token::Number(String::from("1")),
1121            Token::RParen,
1122        ];
1123
1124        compare(expected, tokens);
1125    }
1126
1127    #[test]
1128    fn tokenize_string_string_concat() {
1129        let sql = String::from("SELECT 'a' || 'b'");
1130        let mut tokenizer = Tokenizer::new(&sql);
1131        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1132
1133        let expected = vec![
1134            Token::make_keyword("SELECT"),
1135            Token::Whitespace(Whitespace::Space),
1136            Token::SingleQuotedString(String::from("a")),
1137            Token::Whitespace(Whitespace::Space),
1138            Token::Op("||".to_owned()),
1139            Token::Whitespace(Whitespace::Space),
1140            Token::SingleQuotedString(String::from("b")),
1141        ];
1142
1143        compare(expected, tokens);
1144    }
1145
1146    #[test]
1147    fn tokenize_bitwise_op() {
1148        let sql = String::from("SELECT one | two ^ three");
1149        let mut tokenizer = Tokenizer::new(&sql);
1150        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1151
1152        let expected = vec![
1153            Token::make_keyword("SELECT"),
1154            Token::Whitespace(Whitespace::Space),
1155            Token::make_word("one", None),
1156            Token::Whitespace(Whitespace::Space),
1157            Token::Pipe,
1158            Token::Whitespace(Whitespace::Space),
1159            Token::make_word("two", None),
1160            Token::Whitespace(Whitespace::Space),
1161            Token::Caret,
1162            Token::Whitespace(Whitespace::Space),
1163            Token::make_word("three", None),
1164        ];
1165        compare(expected, tokens);
1166    }
1167
1168    #[test]
1169    fn tokenize_logical_xor() {
1170        let sql =
1171            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
1172        let mut tokenizer = Tokenizer::new(&sql);
1173        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1174
1175        let expected = vec![
1176            Token::make_keyword("SELECT"),
1177            Token::Whitespace(Whitespace::Space),
1178            Token::make_keyword("true"),
1179            Token::Whitespace(Whitespace::Space),
1180            Token::make_keyword("XOR"),
1181            Token::Whitespace(Whitespace::Space),
1182            Token::make_keyword("true"),
1183            Token::Comma,
1184            Token::Whitespace(Whitespace::Space),
1185            Token::make_keyword("false"),
1186            Token::Whitespace(Whitespace::Space),
1187            Token::make_keyword("XOR"),
1188            Token::Whitespace(Whitespace::Space),
1189            Token::make_keyword("false"),
1190            Token::Comma,
1191            Token::Whitespace(Whitespace::Space),
1192            Token::make_keyword("true"),
1193            Token::Whitespace(Whitespace::Space),
1194            Token::make_keyword("XOR"),
1195            Token::Whitespace(Whitespace::Space),
1196            Token::make_keyword("false"),
1197            Token::Comma,
1198            Token::Whitespace(Whitespace::Space),
1199            Token::make_keyword("false"),
1200            Token::Whitespace(Whitespace::Space),
1201            Token::make_keyword("XOR"),
1202            Token::Whitespace(Whitespace::Space),
1203            Token::make_keyword("true"),
1204        ];
1205        compare(expected, tokens);
1206    }
1207
1208    #[test]
1209    fn tokenize_simple_select() {
1210        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
1211        let mut tokenizer = Tokenizer::new(&sql);
1212        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1213
1214        let expected = vec![
1215            Token::make_keyword("SELECT"),
1216            Token::Whitespace(Whitespace::Space),
1217            Token::Mul,
1218            Token::Whitespace(Whitespace::Space),
1219            Token::make_keyword("FROM"),
1220            Token::Whitespace(Whitespace::Space),
1221            Token::make_word("customer", None),
1222            Token::Whitespace(Whitespace::Space),
1223            Token::make_keyword("WHERE"),
1224            Token::Whitespace(Whitespace::Space),
1225            Token::make_word("id", None),
1226            Token::Whitespace(Whitespace::Space),
1227            Token::Eq,
1228            Token::Whitespace(Whitespace::Space),
1229            Token::Number(String::from("1")),
1230            Token::Whitespace(Whitespace::Space),
1231            Token::make_keyword("LIMIT"),
1232            Token::Whitespace(Whitespace::Space),
1233            Token::Number(String::from("5")),
1234        ];
1235
1236        compare(expected, tokens);
1237    }
1238
1239    #[test]
1240    fn tokenize_explain_select() {
1241        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
1242        let mut tokenizer = Tokenizer::new(&sql);
1243        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1244
1245        let expected = vec![
1246            Token::make_keyword("EXPLAIN"),
1247            Token::Whitespace(Whitespace::Space),
1248            Token::make_keyword("SELECT"),
1249            Token::Whitespace(Whitespace::Space),
1250            Token::Mul,
1251            Token::Whitespace(Whitespace::Space),
1252            Token::make_keyword("FROM"),
1253            Token::Whitespace(Whitespace::Space),
1254            Token::make_word("customer", None),
1255            Token::Whitespace(Whitespace::Space),
1256            Token::make_keyword("WHERE"),
1257            Token::Whitespace(Whitespace::Space),
1258            Token::make_word("id", None),
1259            Token::Whitespace(Whitespace::Space),
1260            Token::Eq,
1261            Token::Whitespace(Whitespace::Space),
1262            Token::Number(String::from("1")),
1263        ];
1264
1265        compare(expected, tokens);
1266    }
1267
1268    #[test]
1269    fn tokenize_explain_analyze_select() {
1270        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
1271        let mut tokenizer = Tokenizer::new(&sql);
1272        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1273
1274        let expected = vec![
1275            Token::make_keyword("EXPLAIN"),
1276            Token::Whitespace(Whitespace::Space),
1277            Token::make_keyword("ANALYZE"),
1278            Token::Whitespace(Whitespace::Space),
1279            Token::make_keyword("SELECT"),
1280            Token::Whitespace(Whitespace::Space),
1281            Token::Mul,
1282            Token::Whitespace(Whitespace::Space),
1283            Token::make_keyword("FROM"),
1284            Token::Whitespace(Whitespace::Space),
1285            Token::make_word("customer", None),
1286            Token::Whitespace(Whitespace::Space),
1287            Token::make_keyword("WHERE"),
1288            Token::Whitespace(Whitespace::Space),
1289            Token::make_word("id", None),
1290            Token::Whitespace(Whitespace::Space),
1291            Token::Eq,
1292            Token::Whitespace(Whitespace::Space),
1293            Token::Number(String::from("1")),
1294        ];
1295
1296        compare(expected, tokens);
1297    }
1298
1299    #[test]
1300    fn tokenize_string_predicate() {
1301        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
1302        let mut tokenizer = Tokenizer::new(&sql);
1303        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1304
1305        let expected = vec![
1306            Token::make_keyword("SELECT"),
1307            Token::Whitespace(Whitespace::Space),
1308            Token::Mul,
1309            Token::Whitespace(Whitespace::Space),
1310            Token::make_keyword("FROM"),
1311            Token::Whitespace(Whitespace::Space),
1312            Token::make_word("customer", None),
1313            Token::Whitespace(Whitespace::Space),
1314            Token::make_keyword("WHERE"),
1315            Token::Whitespace(Whitespace::Space),
1316            Token::make_word("salary", None),
1317            Token::Whitespace(Whitespace::Space),
1318            Token::Neq,
1319            Token::Whitespace(Whitespace::Space),
1320            Token::SingleQuotedString(String::from("Not Provided")),
1321        ];
1322
1323        compare(expected, tokens);
1324    }
1325
1326    #[test]
1327    fn tokenize_invalid_string() {
1328        let sql = String::from("\nمصطفىh");
1329        let mut tokenizer = Tokenizer::new(&sql);
1330        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1331        // println!("tokens: {:#?}", tokens);
1332        let expected = vec![
1333            Token::Whitespace(Whitespace::Newline),
1334            Token::Char('م'),
1335            Token::Char('ص'),
1336            Token::Char('ط'),
1337            Token::Char('ف'),
1338            Token::Char('ى'),
1339            Token::make_word("h", None),
1340        ];
1341        compare(expected, tokens);
1342    }
1343
1344    #[test]
1345    fn tokenize_newline_in_string_literal() {
1346        let sql = String::from("'foo\r\nbar\nbaz'");
1347        let mut tokenizer = Tokenizer::new(&sql);
1348        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1349        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_owned())];
1350        compare(expected, tokens);
1351    }
1352
1353    #[test]
1354    fn tokenize_unterminated_string_literal() {
1355        let sql = String::from("select 'foo");
1356        let mut tokenizer = Tokenizer::new(&sql);
1357        assert_eq!(
1358            tokenizer.tokenize_with_whitespace(),
1359            Err(TokenizerError {
1360                message: "Unterminated string literal".to_owned(),
1361                line: 1,
1362                col: 12,
1363                context: "LINE 1: select 'foo\n                   ^".to_owned(),
1364            })
1365        );
1366    }
1367
1368    #[test]
1369    fn tokenize_invalid_string_cols() {
1370        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
1371        let mut tokenizer = Tokenizer::new(&sql);
1372        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1373        // println!("tokens: {:#?}", tokens);
1374        let expected = vec![
1375            Token::Whitespace(Whitespace::Newline),
1376            Token::Whitespace(Whitespace::Newline),
1377            Token::make_keyword("SELECT"),
1378            Token::Whitespace(Whitespace::Space),
1379            Token::Mul,
1380            Token::Whitespace(Whitespace::Space),
1381            Token::make_keyword("FROM"),
1382            Token::Whitespace(Whitespace::Space),
1383            Token::make_keyword("table"),
1384            Token::Whitespace(Whitespace::Tab),
1385            Token::Char('م'),
1386            Token::Char('ص'),
1387            Token::Char('ط'),
1388            Token::Char('ف'),
1389            Token::Char('ى'),
1390            Token::make_word("h", None),
1391        ];
1392        compare(expected, tokens);
1393    }
1394
1395    #[test]
1396    fn tokenize_right_arrow() {
1397        let sql = String::from("FUNCTION(key=>value)");
1398        let mut tokenizer = Tokenizer::new(&sql);
1399        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1400        let expected = vec![
1401            Token::make_word("FUNCTION", None),
1402            Token::LParen,
1403            Token::make_word("key", None),
1404            Token::RArrow,
1405            Token::make_word("value", None),
1406            Token::RParen,
1407        ];
1408        compare(expected, tokens);
1409    }
1410
1411    #[test]
1412    fn tokenize_is_null() {
1413        let sql = String::from("a IS NULL");
1414        let mut tokenizer = Tokenizer::new(&sql);
1415        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1416
1417        let expected = vec![
1418            Token::make_word("a", None),
1419            Token::Whitespace(Whitespace::Space),
1420            Token::make_keyword("IS"),
1421            Token::Whitespace(Whitespace::Space),
1422            Token::make_keyword("NULL"),
1423        ];
1424
1425        compare(expected, tokens);
1426    }
1427
1428    #[test]
1429    fn tokenize_comment() {
1430        let sql = String::from("0--this is a comment\n1");
1431        let mut tokenizer = Tokenizer::new(&sql);
1432        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1433        let expected = vec![
1434            Token::Number("0".to_owned()),
1435            Token::Whitespace(Whitespace::SingleLineComment {
1436                prefix: "--".to_owned(),
1437                comment: "this is a comment\n".to_owned(),
1438            }),
1439            Token::Number("1".to_owned()),
1440        ];
1441        compare(expected, tokens);
1442    }
1443
1444    #[test]
1445    fn tokenize_comment_at_eof() {
1446        let sql = String::from("--this is a comment");
1447        let mut tokenizer = Tokenizer::new(&sql);
1448        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1449        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
1450            prefix: "--".to_owned(),
1451            comment: "this is a comment".to_owned(),
1452        })];
1453        compare(expected, tokens);
1454    }
1455
1456    #[test]
1457    fn tokenize_multiline_comment() {
1458        let sql = String::from("0/*multi-line\n* /comment*/1");
1459        let mut tokenizer = Tokenizer::new(&sql);
1460        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1461        let expected = vec![
1462            Token::Number("0".to_owned()),
1463            Token::Whitespace(Whitespace::MultiLineComment(
1464                "multi-line\n* /comment".to_owned(),
1465            )),
1466            Token::Number("1".to_owned()),
1467        ];
1468        compare(expected, tokens);
1469    }
1470
1471    #[test]
1472    fn tokenize_nested_multiline_comment() {
1473        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
1474        let mut tokenizer = Tokenizer::new(&sql);
1475        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1476        let expected = vec![
1477            Token::Number("0".to_owned()),
1478            Token::Whitespace(Whitespace::MultiLineComment(
1479                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_owned(),
1480            )),
1481            Token::Number("1".to_owned()),
1482        ];
1483        compare(expected, tokens);
1484    }
1485
1486    #[test]
1487    fn tokenize_multiline_comment_with_even_asterisks() {
1488        let sql = String::from("\n/** Comment **/\n");
1489        let mut tokenizer = Tokenizer::new(&sql);
1490        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1491        let expected = vec![
1492            Token::Whitespace(Whitespace::Newline),
1493            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_owned())),
1494            Token::Whitespace(Whitespace::Newline),
1495        ];
1496        compare(expected, tokens);
1497    }
1498
1499    #[test]
1500    fn tokenize_mismatched_quotes() {
1501        let sql = String::from("\"foo");
1502        let mut tokenizer = Tokenizer::new(&sql);
1503        assert_eq!(
1504            tokenizer.tokenize_with_whitespace(),
1505            Err(TokenizerError {
1506                message: "Expected close delimiter '\"' before EOF.".to_owned(),
1507                line: 1,
1508                col: 5,
1509                context: "LINE 1: \"foo\n            ^".to_owned(),
1510            })
1511        );
1512    }
1513
1514    #[test]
1515    fn tokenize_newlines() {
1516        let sql = String::from("line1\nline2\rline3\r\nline4\r");
1517        let mut tokenizer = Tokenizer::new(&sql);
1518        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1519        let expected = vec![
1520            Token::make_word("line1", None),
1521            Token::Whitespace(Whitespace::Newline),
1522            Token::make_word("line2", None),
1523            Token::Whitespace(Whitespace::Newline),
1524            Token::make_word("line3", None),
1525            Token::Whitespace(Whitespace::Newline),
1526            Token::make_word("line4", None),
1527            Token::Whitespace(Whitespace::Newline),
1528        ];
1529        compare(expected, tokens);
1530    }
1531
1532    #[test]
1533    fn tokenize_pg_regex_match() {
1534        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
1535        let mut tokenizer = Tokenizer::new(sql);
1536        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1537        let expected = vec![
1538            Token::make_keyword("SELECT"),
1539            Token::Whitespace(Whitespace::Space),
1540            Token::make_word("col", None),
1541            Token::Whitespace(Whitespace::Space),
1542            Token::Op("~".to_owned()),
1543            Token::Whitespace(Whitespace::Space),
1544            Token::SingleQuotedString("^a".into()),
1545            Token::Comma,
1546            Token::Whitespace(Whitespace::Space),
1547            Token::make_word("col", None),
1548            Token::Whitespace(Whitespace::Space),
1549            Token::Op("~*".to_owned()),
1550            Token::Whitespace(Whitespace::Space),
1551            Token::SingleQuotedString("^a".into()),
1552            Token::Comma,
1553            Token::Whitespace(Whitespace::Space),
1554            Token::make_word("col", None),
1555            Token::Whitespace(Whitespace::Space),
1556            Token::Op("!~".to_owned()),
1557            Token::Whitespace(Whitespace::Space),
1558            Token::SingleQuotedString("^a".into()),
1559            Token::Comma,
1560            Token::Whitespace(Whitespace::Space),
1561            Token::make_word("col", None),
1562            Token::Whitespace(Whitespace::Space),
1563            Token::Op("!~*".to_owned()),
1564            Token::Whitespace(Whitespace::Space),
1565            Token::SingleQuotedString("^a".into()),
1566        ];
1567        compare(expected, tokens);
1568    }
1569
1570    #[test]
1571    fn tokenize_select_array() {
1572        let sql = String::from("SELECT '{1, 2, 3}'");
1573        let mut tokenizer = Tokenizer::new(&sql);
1574        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1575
1576        let expected = vec![
1577            Token::make_keyword("SELECT"),
1578            Token::Whitespace(Whitespace::Space),
1579            Token::SingleQuotedString(String::from("{1, 2, 3}")),
1580        ];
1581
1582        compare(expected, tokens);
1583    }
1584
1585    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
1586        // println!("------------------------------");
1587        // println!("tokens   = {:?}", actual);
1588        // println!("expected = {:?}", expected);
1589        // println!("------------------------------");
1590        assert_eq!(expected, actual);
1591    }
1592}
risingwave_sqlparser/tokenizer.rs

risingwave_sqlparser/
tokenizer.rs