Skip to main content

risingwave_sqlparser/
tokenizer.rs

1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4//
5//     http://www.apache.org/licenses/LICENSE-2.0
6//
7// Unless required by applicable law or agreed to in writing, software
8// distributed under the License is distributed on an "AS IS" BASIS,
9// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10// See the License for the specific language governing permissions and
11// limitations under the License.
12
13//! SQL Tokenizer
14//!
15//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
16//!
17//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
18
19use std::fmt;
20use std::fmt::Debug;
21use std::iter::Peekable;
22use std::str::Chars;
23
24use crate::ast::{CstyleEscapedString, DollarQuotedString};
25use crate::keywords::{ALL_KEYWORDS, ALL_KEYWORDS_INDEX, Keyword};
26
27/// SQL Token enumeration
28#[derive(Debug, Clone, PartialEq, Eq, Hash)]
29pub enum Token {
30    /// An end-of-file marker, not a real token
31    EOF,
32    /// A keyword (like SELECT) or an optionally quoted SQL identifier
33    Word(Word),
34    /// An unsigned numeric literal
35    Number(String),
36    /// A character that could not be tokenized
37    Char(char),
38    /// Single quoted string: i.e: 'string'
39    SingleQuotedString(String),
40    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
41    DollarQuotedString(DollarQuotedString),
42    /// Single quoted string with c-style escapes: i.e: E'string'
43    CstyleEscapesString(CstyleEscapedString),
44    /// "National" string literal: i.e: N'string'
45    NationalStringLiteral(String),
46    /// Hexadecimal string literal: i.e.: X'deadbeef'
47    HexStringLiteral(String),
48    /// Parameter symbols: i.e:  $1, $2
49    Parameter(String),
50    /// Comma
51    Comma,
52    /// Whitespace (space, tab, etc)
53    Whitespace(Whitespace),
54    /// Custom Operator
55    Op(String),
56    /// Equality operator `=`
57    Eq,
58    /// Not Equals operator `<>` (or `!=` in some dialects)
59    Neq,
60    /// Less Than operator `<`
61    Lt,
62    /// Greater Than operator `>`
63    Gt,
64    /// Less Than Or Equals operator `<=`
65    LtEq,
66    /// Greater Than Or Equals operator `>=`
67    GtEq,
68    /// Plus operator `+`
69    Plus,
70    /// Minus operator `-`
71    Minus,
72    /// Multiplication operator `*`
73    Mul,
74    /// Division operator `/`
75    Div,
76    /// Modulo Operator `%`
77    Mod,
78    /// Left parenthesis `(`
79    LParen,
80    /// Right parenthesis `)`
81    RParen,
82    /// Period (used for compound identifiers or projections into nested types)
83    Period,
84    /// Colon `:`
85    Colon,
86    /// DoubleColon `::` (used for casting in postgresql)
87    DoubleColon,
88    /// SemiColon `;` used as separator for COPY and payload
89    SemiColon,
90    /// Backslash `\` used in terminating the COPY payload with `\.`
91    Backslash,
92    /// Left bracket `[`
93    LBracket,
94    /// Right bracket `]`
95    RBracket,
96    /// Pipe `|`
97    Pipe,
98    /// Caret `^`
99    Caret,
100    /// Left brace `{`
101    LBrace,
102    /// Right brace `}`
103    RBrace,
104    /// Right Arrow `=>`
105    RArrow,
106}
107
108impl fmt::Display for Token {
109    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110        match self {
111            Token::EOF => f.write_str("EOF"),
112            Token::Word(w) => write!(f, "{}", w),
113            Token::Number(n) => write!(f, "{}", n),
114            Token::Char(c) => write!(f, "{}", c),
115            Token::SingleQuotedString(s) => write!(f, "'{}'", s),
116            Token::DollarQuotedString(s) => write!(f, "{}", s),
117            Token::NationalStringLiteral(s) => write!(f, "N'{}'", s),
118            Token::HexStringLiteral(s) => write!(f, "X'{}'", s),
119            Token::CstyleEscapesString(s) => write!(f, "E'{}'", s),
120            Token::Parameter(s) => write!(f, "${}", s),
121            Token::Comma => f.write_str(","),
122            Token::Whitespace(ws) => write!(f, "{}", ws),
123            Token::Op(op) => write!(f, "{}", op),
124            Token::Eq => f.write_str("="),
125            Token::Neq => f.write_str("<>"),
126            Token::Lt => f.write_str("<"),
127            Token::Gt => f.write_str(">"),
128            Token::LtEq => f.write_str("<="),
129            Token::GtEq => f.write_str(">="),
130            Token::Plus => f.write_str("+"),
131            Token::Minus => f.write_str("-"),
132            Token::Mul => f.write_str("*"),
133            Token::Div => f.write_str("/"),
134            Token::Mod => f.write_str("%"),
135            Token::LParen => f.write_str("("),
136            Token::RParen => f.write_str(")"),
137            Token::Period => f.write_str("."),
138            Token::Colon => f.write_str(":"),
139            Token::DoubleColon => f.write_str("::"),
140            Token::SemiColon => f.write_str(";"),
141            Token::Backslash => f.write_str("\\"),
142            Token::LBracket => f.write_str("["),
143            Token::RBracket => f.write_str("]"),
144            Token::Caret => f.write_str("^"),
145            Token::Pipe => f.write_str("|"),
146            Token::LBrace => f.write_str("{"),
147            Token::RBrace => f.write_str("}"),
148            Token::RArrow => f.write_str("=>"),
149        }
150    }
151}
152
153impl Token {
154    pub fn make_keyword(keyword: &str) -> Self {
155        Token::make_word(keyword, None)
156    }
157
158    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
159        let word_uppercase = word.to_uppercase();
160        Token::Word(Word {
161            value: word.to_owned(),
162            quote_style,
163            keyword: if quote_style.is_none() {
164                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
165                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
166            } else {
167                Keyword::NoKeyword
168            },
169        })
170    }
171
172    pub fn with_location(self, location: Location) -> TokenWithLocation {
173        TokenWithLocation::new(self, location.line, location.column)
174    }
175}
176
177/// A keyword (like SELECT) or an optionally quoted SQL identifier
178#[derive(Debug, Clone, PartialEq, Eq, Hash)]
179pub struct Word {
180    /// The value of the token, without the enclosing quotes, and with the
181    /// escape sequences (if any) processed (TODO: escapes are not handled)
182    pub value: String,
183    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
184    /// The standard and most implementations allow using double quotes for this,
185    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
186    pub quote_style: Option<char>,
187    /// If the word was not quoted and it matched one of the known keywords,
188    /// this will have one of the values from dialect::keywords, otherwise empty
189    pub keyword: Keyword,
190}
191
192impl fmt::Display for Word {
193    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
194        match self.quote_style {
195            Some(s) if s == '[' || s == '`' => {
196                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
197            }
198            Some('"') => write!(f, "\"{}\"", self.value.replace('"', "\"\"")),
199            None => f.write_str(&self.value),
200            _ => panic!("Unexpected quote_style!"),
201        }
202    }
203}
204
205impl Word {
206    fn matching_end_quote(ch: char) -> char {
207        match ch {
208            '"' => '"', // ANSI and most dialects
209            '[' => ']', // MS SQL
210            '`' => '`', // MySQL
211            _ => panic!("unexpected quoting style!"),
212        }
213    }
214}
215
/// Whitespace and comments, which the tokenizer reports as
/// [`Token::Whitespace`].
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum Whitespace {
    /// A single space character.
    Space,
    /// A line break.
    Newline,
    /// A tab character.
    Tab,
    /// A comment introduced by `prefix`; `comment` is the text after it.
    SingleLineComment { comment: String, prefix: String },
    /// The text between `/*` and `*/`.
    MultiLineComment(String),
}

/// Renders the whitespace or comment back to source text.
impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let blank = match self {
            Whitespace::SingleLineComment { prefix, comment } => {
                return write!(f, "{prefix}{comment}");
            }
            Whitespace::MultiLineComment(body) => return write!(f, "/*{body}*/"),
            Whitespace::Space => " ",
            Whitespace::Newline => "\n",
            Whitespace::Tab => "\t",
        };
        f.write_str(blank)
    }
}
236
/// A position within the input string.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Location {
    /// Line number, 1-based.
    pub line: u64,
    /// Column within the line, 1-based.
    pub column: u64,
}
245
246/// A [Token] with [Location] attached to it
247#[derive(Debug, Eq, PartialEq, Clone)]
248pub struct TokenWithLocation {
249    pub token: Token,
250    pub location: Location,
251}
252
253impl TokenWithLocation {
254    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
255        TokenWithLocation {
256            token,
257            location: Location { line, column },
258        }
259    }
260
261    pub fn eof() -> TokenWithLocation {
262        TokenWithLocation::new(Token::EOF, 0, 0)
263    }
264}
265
266impl PartialEq<Token> for TokenWithLocation {
267    fn eq(&self, other: &Token) -> bool {
268        &self.token == other
269    }
270}
271
272impl PartialEq<TokenWithLocation> for Token {
273    fn eq(&self, other: &TokenWithLocation) -> bool {
274        self == &other.token
275    }
276}
277
278impl fmt::Display for TokenWithLocation {
279    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
280        if self.token == Token::EOF {
281            write!(f, "end of input")
282        } else {
283            write!(
284                f,
285                "{} at line {}, column {}",
286                self.token, self.location.line, self.location.column
287            )
288        }
289    }
290}
291
/// An error raised while tokenizing.
#[derive(Debug, PartialEq)]
pub struct TokenizerError {
    /// Human-readable description of the problem.
    pub message: String,
    /// Line at which the error was raised.
    pub line: u64,
    /// Column at which the error was raised.
    pub col: u64,
    /// Extra text printed on the line(s) after the message.
    pub context: String,
}

/// Renders as `<message> at line L, column C`, followed by a newline and the
/// context.
impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            message,
            line,
            col,
            context,
        } = self;
        write!(f, "{message} at line {line}, column {col}\n{context}")
    }
}

impl std::error::Error for TokenizerError {}
312
/// SQL tokenizer: walks a SQL string character by character while tracking
/// the current line and column.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete input, kept for building error context.
    sql: &'a str,
    /// Cursor over the characters not yet consumed.
    chars: Peekable<Chars<'a>>,
    /// Line of the next character, starting at 1.
    line: u64,
    /// Column of the next character, starting at 1.
    col: u64,
}
321
322impl<'a> Tokenizer<'a> {
323    /// Create a new SQL tokenizer for the specified SQL statement
324    pub fn new(query: &'a str) -> Self {
325        Self {
326            sql: query,
327            chars: query.chars().peekable(),
328            line: 1,
329            col: 1,
330        }
331    }
332
333    /// Consume the next character.
334    fn next(&mut self) -> Option<char> {
335        let ch = self.chars.next();
336        if let Some(ch) = ch {
337            match ch {
338                '\n' => {
339                    self.line += 1;
340                    self.col = 1;
341                }
342                '\t' => self.col += 4,
343                _ => self.col += 1,
344            }
345        }
346        ch
347    }
348
349    /// Return the next character without consuming it.
350    fn peek(&mut self) -> Option<char> {
351        self.chars.peek().cloned()
352    }
353
354    /// Tokenize the statement and produce a vector of tokens with locations.
355    ///
356    /// Whitespaces are skipped.
357    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
358        let tokens = self.tokenize()?;
359        Ok(tokens
360            .into_iter()
361            .filter(|token| !matches!(&token.token, Token::Whitespace(_)))
362            .collect())
363    }
364
365    /// Tokenize the statement and produce a vector of tokens.
366    ///
367    /// Whitespaces are included.
368    #[allow(dead_code)]
369    fn tokenize_with_whitespace(&mut self) -> Result<Vec<Token>, TokenizerError> {
370        let tokens = self.tokenize()?;
371        Ok(tokens.into_iter().map(|t| t.token).collect())
372    }
373
374    /// Tokenize the statement and produce a vector of tokens.
375    ///
376    /// Whitespaces are included.
377    fn tokenize(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
378        let mut tokens = Vec::new();
379        while let Some(token) = self.next_token_with_location()? {
380            tokens.push(token);
381        }
382        Ok(tokens)
383    }
384
385    /// Get the next token or return None
386    fn next_token_with_location(&mut self) -> Result<Option<TokenWithLocation>, TokenizerError> {
387        let loc = Location {
388            line: self.line,
389            column: self.col,
390        };
391        self.next_token()
392            .map(|t| t.map(|token| token.with_location(loc)))
393    }
394
395    /// Get the next token or return None
396    fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
397        macro_rules! op_chars {
398            // https://www.postgresql.org/docs/17/sql-syntax-lexical.html#SQL-SYNTAX-OPERATORS
399            (all as_pat) => {
400                '+' | '-' | '*' | '/' | '<' | '>' | '=' | op_chars!(ext as_pat)
401            };
402            (ext $m:ident) => {
403                op_chars!($m '~' '!' '@' '#' '%' '^' '&' '|' '`' '?')
404            };
405            (as_arr $($c:literal)+) => {
406                [ $($c),+ ]
407            };
408            (as_pat $($c:literal)+) => {
409                $($c)|+
410            };
411        }
412
413        match self.peek() {
414            Some(ch) => match ch {
415                ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
416                '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
417                '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
418                '\r' => {
419                    // Emit a single Whitespace::Newline token for \r and \r\n
420                    self.next();
421                    if let Some('\n') = self.peek() {
422                        self.next();
423                    }
424                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
425                }
426                'N' => {
427                    self.next(); // consume, to check the next char
428                    match self.peek() {
429                        Some('\'') => {
430                            // N'...' - a <national character string literal>
431                            let s = self.tokenize_single_quoted_string()?;
432                            Ok(Some(Token::NationalStringLiteral(s)))
433                        }
434                        _ => {
435                            // regular identifier starting with an "N"
436                            let s = self.tokenize_word('N');
437                            Ok(Some(Token::make_word(&s, None)))
438                        }
439                    }
440                }
441                x @ 'e' | x @ 'E' => {
442                    self.next(); // consume, to check the next char
443                    match self.peek() {
444                        Some('\'') => {
445                            // E'...' - a <character string literal>
446                            let s = self.tokenize_single_quoted_string_with_escape()?;
447                            Ok(Some(Token::CstyleEscapesString(s)))
448                        }
449                        _ => {
450                            // regular identifier starting with an "E"
451                            let s = self.tokenize_word(x);
452                            Ok(Some(Token::make_word(&s, None)))
453                        }
454                    }
455                }
456                // The spec only allows an uppercase 'X' to introduce a hex
457                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
458                x @ 'x' | x @ 'X' => {
459                    self.next(); // consume, to check the next char
460                    match self.peek() {
461                        Some('\'') => {
462                            // X'...' - a <binary string literal>
463                            let s = self.tokenize_single_quoted_string()?;
464                            Ok(Some(Token::HexStringLiteral(s)))
465                        }
466                        _ => {
467                            // regular identifier starting with an "X"
468                            let s = self.tokenize_word(x);
469                            Ok(Some(Token::make_word(&s, None)))
470                        }
471                    }
472                }
473                // identifier or keyword
474                ch if is_identifier_start(ch) => {
475                    self.next(); // consume the first char
476                    let s = self.tokenize_word(ch);
477
478                    Ok(Some(Token::make_word(&s, None)))
479                }
480                // string
481                '\'' => {
482                    let s = self.tokenize_single_quoted_string()?;
483
484                    Ok(Some(Token::SingleQuotedString(s)))
485                }
486                // delimited (quoted) identifier
487                quote_start if is_delimited_identifier_start(quote_start) => {
488                    let s = self.tokenize_delimited_identifier(quote_start)?;
489                    Ok(Some(Token::make_word(&s, Some(quote_start))))
490                }
491                // numbers and period
492                '0'..='9' | '.' => {
493                    let mut s = self.peeking_take_while(|ch| ch.is_ascii_digit());
494
495                    // match binary literal that starts with 0x
496                    if s == "0"
497                        && let Some(radix) = self.peek()
498                        && "xob".contains(radix.to_ascii_lowercase())
499                    {
500                        self.next();
501                        let radix = radix.to_ascii_lowercase();
502                        let base = match radix {
503                            'x' => 16,
504                            'o' => 8,
505                            'b' => 2,
506                            _ => unreachable!(),
507                        };
508                        let s2 = self.peeking_take_while(|ch| ch.is_digit(base));
509                        if s2.is_empty() {
510                            return self.error("incomplete integer literal");
511                        }
512                        self.reject_number_junk()?;
513                        return Ok(Some(Token::Number(format!("0{radix}{s2}"))));
514                    }
515
516                    // match one period
517                    if let Some('.') = self.peek() {
518                        s.push('.');
519                        self.next();
520                    }
521                    s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
522
523                    // No number -> Token::Period
524                    if s == "." {
525                        return Ok(Some(Token::Period));
526                    }
527
528                    match self.peek() {
529                        // Number is a scientific number (1e6)
530                        Some('e') | Some('E') => {
531                            s.push('e');
532                            self.next();
533
534                            if let Some('-') = self.peek() {
535                                s.push('-');
536                                self.next();
537                            }
538                            s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
539                            self.reject_number_junk()?;
540                            return Ok(Some(Token::Number(s)));
541                        }
542                        // Not a scientific number
543                        _ => {}
544                    };
545                    self.reject_number_junk()?;
546                    Ok(Some(Token::Number(s)))
547                }
548                // punctuation
549                '(' => self.consume_and_return(Token::LParen),
550                ')' => self.consume_and_return(Token::RParen),
551                ',' => self.consume_and_return(Token::Comma),
552                ':' => {
553                    self.next();
554                    match self.peek() {
555                        Some(':') => self.consume_and_return(Token::DoubleColon),
556                        _ => Ok(Some(Token::Colon)),
557                    }
558                }
559                '$' => Ok(Some(self.tokenize_dollar_preceded_value()?)),
560                ';' => self.consume_and_return(Token::SemiColon),
561                '\\' => self.consume_and_return(Token::Backslash),
562                '[' => self.consume_and_return(Token::LBracket),
563                ']' => self.consume_and_return(Token::RBracket),
564                '{' => self.consume_and_return(Token::LBrace),
565                '}' => self.consume_and_return(Token::RBrace),
566                // operators
567                op_chars!(all as_pat) => {
568                    let mut trial = self.clone();
569                    let op_taken = trial.peeking_take_while(|c| matches!(c, op_chars!(all as_pat)));
570                    // It is safe to assume byte index is char index in `op_token` below.
571
572                    // https://www.postgresql.org/docs/17/sql-syntax-lexical.html#SQL-SYNTAX-OPERATORS
573                    // https://github.com/postgres/postgres/blob/REL_17_4/src/backend/parser/scan.l#L900-L1006
574                    let slash_star = op_taken.find("/*");
575                    let dash_dash = op_taken.find("--");
576                    let pos = match (slash_star, dash_dash) {
577                        (Some(s), Some(d)) => s.min(d),
578                        (Some(s), None) => s,
579                        (None, Some(d)) => d,
580                        (None, None) => op_taken.len(),
581                    };
582                    let mut op = &op_taken[..pos];
583                    if op.is_empty() {
584                        match self.next() {
585                            Some('-') => {
586                                self.next(); // consume the second '-', starting a single-line comment
587                                let comment = self.tokenize_single_line_comment();
588
589                                return Ok(Some(Token::Whitespace(
590                                    Whitespace::SingleLineComment {
591                                        prefix: "--".to_owned(),
592                                        comment,
593                                    },
594                                )));
595                            }
596                            Some('/') => {
597                                self.next(); // consume the '*', starting a multi-line comment
598                                return self.tokenize_multiline_comment();
599                            }
600                            _ => unreachable!(),
601                        }
602                    };
603                    if op.len() > 1
604                        && op.ends_with(['+', '-'])
605                        && !op.contains(op_chars!(ext as_arr))
606                    {
607                        op = op.trim_end_matches(['+', '-']);
608                        if op.is_empty() {
609                            op = &op_taken[..1];
610                        }
611                    }
612                    if op.len() == op_taken.len() {
613                        *self = trial;
614                    } else {
615                        for _ in op.chars() {
616                            self.next();
617                        }
618                    }
619                    match op {
620                        // https://github.com/postgres/postgres/blob/REL_17_4/src/backend/parser/scan.l#L965-L973
621                        "+" => Ok(Some(Token::Plus)),
622                        "-" => Ok(Some(Token::Minus)),
623                        "*" => Ok(Some(Token::Mul)),
624                        "/" => Ok(Some(Token::Div)),
625                        "%" => Ok(Some(Token::Mod)),
626                        "^" => Ok(Some(Token::Caret)),
627                        "<" => Ok(Some(Token::Lt)),
628                        ">" => Ok(Some(Token::Gt)),
629                        "=" => Ok(Some(Token::Eq)),
630                        // https://github.com/postgres/postgres/blob/REL_17_4/src/backend/parser/scan.l#L974-L992
631                        "=>" => Ok(Some(Token::RArrow)),
632                        "<=" => Ok(Some(Token::LtEq)),
633                        ">=" => Ok(Some(Token::GtEq)),
634                        "<>" => Ok(Some(Token::Neq)),
635                        "!=" => Ok(Some(Token::Neq)),
636                        // Our support of `Expr::LambdaFunction` makes us PostgreSQL-incompatible here.
637                        //     foo(bar, | x | x)
638                        // In PostgreSQL, this is unary operator `|` applied to `x`, then bitwise-or `x`.
639                        // In our dialect, this is a lambda function - the identity function.
640                        "|" => Ok(Some(Token::Pipe)),
641                        _ => Ok(Some(Token::Op(op.to_owned()))),
642                    }
643                }
644                other => self.consume_and_return(Token::Char(other)),
645            },
646            None => Ok(None),
647        }
648    }
649
650    fn tokenize_delimited_identifier(
651        &mut self,
652        quote_start: char,
653    ) -> Result<String, TokenizerError> {
654        let quote_end = Word::matching_end_quote(quote_start);
655        let mut s = String::new();
656
657        self.next(); // consume opening quote
658
659        while let Some(ch) = self.peek() {
660            self.next(); // consume ch
661
662            if ch == quote_end {
663                if self.peek() == Some(quote_end) {
664                    self.next(); // consume escaped quote
665                    s.push(quote_end);
666                } else {
667                    return Ok(s);
668                }
669            } else {
670                s.push(ch);
671            }
672        }
673
674        self.error(format!(
675            "Expected close delimiter '{}' before EOF.",
676            quote_end
677        ))
678    }
679
680    /// Tokenize dollar preceded value (i.e: a string/placeholder)
681    fn tokenize_dollar_preceded_value(&mut self) -> Result<Token, TokenizerError> {
682        let mut s = String::new();
683        let mut value = String::new();
684
685        self.next();
686
687        if let Some('$') = self.peek() {
688            // syntax: $$......$$
689            self.next();
690
691            let delimiter = "$$";
692            while self.peek().is_some() {
693                if self.starts_with(delimiter) {
694                    for _ in delimiter.chars() {
695                        self.next();
696                    }
697                    return Ok(Token::DollarQuotedString(DollarQuotedString {
698                        value: s,
699                        tag: None,
700                    }));
701                }
702                s.push(self.next().unwrap());
703            }
704
705            self.error("Unterminated dollar-quoted string")
706        } else {
707            // syntax: $SomeTag$.....$SomeTag$
708            value.push_str(&self.peeking_take_while(|ch| ch.is_ascii_alphanumeric() || ch == '_'));
709
710            if let Some('$') = self.peek() {
711                if !is_valid_dollar_quote_tag(&value) {
712                    return self.error(format!("Invalid dollar-quoted string tag \"{}\"", value));
713                }
714
715                self.next();
716
717                let delimiter = format!("${}$", value);
718                while self.peek().is_some() {
719                    if self.starts_with(&delimiter) {
720                        for _ in delimiter.chars() {
721                            self.next();
722                        }
723                        return Ok(Token::DollarQuotedString(DollarQuotedString {
724                            value: s,
725                            tag: Some(value),
726                        }));
727                    }
728                    s.push(self.next().unwrap());
729                }
730
731                self.error(format!(
732                    "Unterminated dollar-quoted string at or near \"{}\"",
733                    value
734                ))
735            } else {
736                Ok(Token::Parameter(value))
737            }
738        }
739    }
740
741    fn error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
742        let prefix = format!("LINE {}: ", self.line);
743        let sql_line = self.sql.split('\n').nth(self.line as usize - 1).unwrap();
744        let cursor = " ".repeat(prefix.len() + self.col as usize - 1);
745        let context = format!("{}{}\n{}^", prefix, sql_line, cursor);
746        Err(TokenizerError {
747            message: message.into(),
748            col: self.col,
749            line: self.line,
750            context,
751        })
752    }
753
754    fn reject_number_junk(&mut self) -> Result<(), TokenizerError> {
755        if let Some(ch) = self.peek()
756            && is_identifier_start(ch)
757        {
758            return self.error("trailing junk after numeric literal");
759        }
760        Ok(())
761    }
762
763    // Consume characters until newline
764    fn tokenize_single_line_comment(&mut self) -> String {
765        let mut comment = self.peeking_take_while(|ch| ch != '\n');
766        if let Some(ch) = self.next() {
767            assert_eq!(ch, '\n');
768            comment.push(ch);
769        }
770        comment
771    }
772
773    /// Tokenize an identifier or keyword, after the first char is already consumed.
774    fn tokenize_word(&mut self, first_char: char) -> String {
775        let mut s = first_char.to_string();
776        s.push_str(&self.peeking_take_while(is_identifier_part));
777        s
778    }
779
780    /// Read a single quoted string, starting with the opening quote.
781    fn tokenize_single_quoted_string(&mut self) -> Result<String, TokenizerError> {
782        let mut s = String::new();
783        self.next(); // consume the opening quote
784
785        // slash escaping is specific to MySQL dialect
786        let mut is_escaped = false;
787        while let Some(ch) = self.peek() {
788            match ch {
789                '\'' => {
790                    self.next(); // consume
791                    if is_escaped {
792                        s.push(ch);
793                        is_escaped = false;
794                    } else if self.peek().map(|c| c == '\'').unwrap_or(false) {
795                        s.push(ch);
796                        self.next();
797                    } else {
798                        return Ok(s);
799                    }
800                }
801                '\\' => {
802                    s.push(ch);
803                    self.next();
804                }
805                _ => {
806                    self.next(); // consume
807                    s.push(ch);
808                }
809            }
810        }
811        self.error("Unterminated string literal")
812    }
813
814    /// Read a single qutoed string with escape
815    fn tokenize_single_quoted_string_with_escape(
816        &mut self,
817    ) -> Result<CstyleEscapedString, TokenizerError> {
818        let mut terminated = false;
819        let mut s = String::new();
820        self.next(); // consume the opening quote
821
822        while let Some(ch) = self.peek() {
823            match ch {
824                '\'' => {
825                    self.next(); // consume
826                    if self.peek().map(|c| c == '\'').unwrap_or(false) {
827                        s.push('\\');
828                        s.push(ch);
829                        self.next();
830                    } else {
831                        terminated = true;
832                        break;
833                    }
834                }
835                '\\' => {
836                    s.push(ch);
837                    self.next();
838                    if self.peek().map(|c| c == '\'' || c == '\\').unwrap_or(false) {
839                        s.push(self.next().unwrap());
840                    }
841                }
842                _ => {
843                    self.next(); // consume
844                    s.push(ch);
845                }
846            }
847        }
848
849        if !terminated {
850            return self.error("Unterminated string literal");
851        }
852
853        let unescaped = match Self::unescape_c_style(&s) {
854            Ok(unescaped) => unescaped,
855            Err(e) => return self.error(e),
856        };
857
858        Ok(CstyleEscapedString {
859            value: unescaped,
860            raw: s,
861        })
862    }
863
864    /// Helper function used to convert string with c-style escapes into a normal string
865    /// e.g. 'hello\x3fworld' -> 'hello?world'
866    ///
867    /// Detail of c-style escapes refer from:
868    /// <https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE:~:text=4.1.2.2.%C2%A0String%20Constants%20With%20C%2DStyle%20Escapes>
869    fn unescape_c_style(s: &str) -> Result<String, String> {
870        fn hex_byte_process(
871            chars: &mut Peekable<Chars<'_>>,
872            res: &mut String,
873            len: usize,
874            default_char: char,
875        ) -> Result<(), String> {
876            let mut unicode_seq: String = String::with_capacity(len);
877            for _ in 0..len {
878                if let Some(c) = chars.peek()
879                    && c.is_ascii_hexdigit()
880                {
881                    unicode_seq.push(chars.next().unwrap());
882                } else {
883                    break;
884                }
885            }
886
887            if unicode_seq.is_empty() && len == 2 {
888                res.push(default_char);
889                return Ok(());
890            } else if unicode_seq.len() < len && len != 2 {
891                return Err("invalid unicode sequence: must be \\uXXXX or \\UXXXXXXXX".to_owned());
892            }
893
894            if len == 2 {
895                let number = [u8::from_str_radix(&unicode_seq, 16)
896                    .map_err(|e| format!("invalid unicode sequence: {}", e))?];
897
898                res.push(
899                    std::str::from_utf8(&number)
900                        .map_err(|err| format!("invalid unicode sequence: {}", err))?
901                        .chars()
902                        .next()
903                        .unwrap(),
904                );
905            } else {
906                let number = u32::from_str_radix(&unicode_seq, 16)
907                    .map_err(|e| format!("invalid unicode sequence: {}", e))?;
908                res.push(
909                    char::from_u32(number)
910                        .ok_or_else(|| format!("invalid unicode sequence: {}", unicode_seq))?,
911                );
912            }
913            Ok(())
914        }
915
916        fn octal_byte_process(
917            chars: &mut Peekable<Chars<'_>>,
918            res: &mut String,
919            digit: char,
920        ) -> Result<(), String> {
921            let mut unicode_seq: String = String::with_capacity(3);
922            unicode_seq.push(digit);
923            for _ in 0..2 {
924                if let Some(c) = chars.peek()
925                    && matches!(*c, '0'..='7')
926                {
927                    unicode_seq.push(chars.next().unwrap());
928                } else {
929                    break;
930                }
931            }
932
933            let number = [u8::from_str_radix(&unicode_seq, 8)
934                .map_err(|e| format!("invalid unicode sequence: {}", e))?];
935
936            res.push(
937                std::str::from_utf8(&number)
938                    .map_err(|err| format!("invalid unicode sequence: {}", err))?
939                    .chars()
940                    .next()
941                    .unwrap(),
942            );
943            Ok(())
944        }
945
946        let mut chars = s.chars().peekable();
947        let mut res = String::with_capacity(s.len());
948
949        while let Some(c) = chars.next() {
950            if c == '\\' {
951                match chars.next() {
952                    None => {
953                        return Err("unterminated escape sequence".to_owned());
954                    }
955                    Some(next_c) => match next_c {
956                        'b' => res.push('\u{08}'),
957                        'f' => res.push('\u{0C}'),
958                        'n' => res.push('\n'),
959                        'r' => res.push('\r'),
960                        't' => res.push('\t'),
961                        'x' => hex_byte_process(&mut chars, &mut res, 2, 'x')?,
962                        'u' => hex_byte_process(&mut chars, &mut res, 4, 'u')?,
963                        'U' => hex_byte_process(&mut chars, &mut res, 8, 'U')?,
964                        digit @ '0'..='7' => octal_byte_process(&mut chars, &mut res, digit)?,
965                        _ => res.push(next_c),
966                    },
967                }
968            } else {
969                res.push(c);
970            }
971        }
972
973        Ok(res)
974    }
975
976    fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
977        let mut s = String::new();
978
979        let mut nested = 1;
980        let mut last_ch = ' ';
981
982        loop {
983            match self.next() {
984                Some(ch) => {
985                    if last_ch == '/' && ch == '*' {
986                        nested += 1;
987                    } else if last_ch == '*' && ch == '/' {
988                        nested -= 1;
989                        if nested == 0 {
990                            s.pop();
991                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
992                        }
993                    }
994                    s.push(ch);
995                    last_ch = ch;
996                }
997                None => break self.error("Unexpected EOF while in a multi-line comment"),
998            }
999        }
1000    }
1001
1002    #[expect(clippy::unnecessary_wraps)]
1003    fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
1004        self.next();
1005        Ok(Some(t))
1006    }
1007
1008    fn starts_with(&self, expected: &str) -> bool {
1009        let mut chars = self.chars.clone();
1010        for expected_char in expected.chars() {
1011            if chars.next() != Some(expected_char) {
1012                return false;
1013            }
1014        }
1015        true
1016    }
1017
1018    /// Read from `self` until `predicate` returns `false` or EOF is hit.
1019    /// Return the characters read as String, and keep the first non-matching
1020    /// char available as `self.next()`.
1021    fn peeking_take_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> String {
1022        let mut s = String::new();
1023        while let Some(ch) = self.peek() {
1024            if predicate(ch) {
1025                self.next(); // consume
1026                s.push(ch);
1027            } else {
1028                break;
1029            }
1030        }
1031        s
1032    }
1033}
1034
/// Determine if a character starts a quoted identifier.
///
/// Only the double quote is accepted, which is the ANSI-compliant choice and
/// suits most dialects (MySQL, MS SQL and sqlite being notable exceptions).
/// Any character listed in `Word::matching_end_quote` could be accepted here.
fn is_delimited_identifier_start(ch: char) -> bool {
    matches!(ch, '"')
}
1043
/// Determine if a character is a valid start character for an unquoted identifier
///
/// Accepts ASCII letters and the underscore. See
/// https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
/// Identifiers beginning with "letters with diacritical marks and non-Latin
/// letters" are not supported yet.
fn is_identifier_start(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | 'a'..='z' | '_')
}
1051
/// Determine if a character is a valid unquoted identifier character
///
/// Accepts ASCII letters, ASCII digits, `$` and `_`.
fn is_identifier_part(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | 'a'..='z' | '0'..='9' | '$' | '_')
}
1056
/// Determine if `tag` is acceptable as the tag of a dollar-quoted string.
///
/// A valid tag is non-empty, begins with an ASCII letter or underscore, and
/// continues with ASCII letters, ASCII digits or underscores only.
fn is_valid_dollar_quote_tag(tag: &str) -> bool {
    let mut rest = tag.chars();
    match rest.next() {
        Some(first) if first == '_' || first.is_ascii_alphabetic() => {
            rest.all(|ch| ch == '_' || ch.is_ascii_alphanumeric())
        }
        _ => false,
    }
}
1062
1063#[cfg(test)]
1064mod tests {
1065    use super::*;
1066
1067    #[test]
1068    fn tokenizer_error_impl() {
1069        use std::error::Error;
1070
1071        let err = TokenizerError {
1072            message: "test".into(),
1073            line: 1,
1074            col: 1,
1075            context: "LINE 1:".to_owned(),
1076        };
1077
1078        assert!(err.source().is_none());
1079        assert_eq!(err.to_string(), "test at line 1, column 1\nLINE 1:");
1080    }
1081
1082    #[test]
1083    fn tokenize_select_1() {
1084        let sql = String::from("SELECT 1");
1085        let mut tokenizer = Tokenizer::new(&sql);
1086        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1087
1088        let expected = vec![
1089            Token::make_keyword("SELECT"),
1090            Token::Whitespace(Whitespace::Space),
1091            Token::Number(String::from("1")),
1092        ];
1093
1094        compare(expected, tokens);
1095    }
1096
1097    #[test]
1098    fn tokenize_select_float() {
1099        let sql = String::from("SELECT .1");
1100        let mut tokenizer = Tokenizer::new(&sql);
1101        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1102
1103        let expected = vec![
1104            Token::make_keyword("SELECT"),
1105            Token::Whitespace(Whitespace::Space),
1106            Token::Number(String::from(".1")),
1107        ];
1108
1109        compare(expected, tokens);
1110    }
1111
1112    #[test]
1113    fn tokenize_scalar_function() {
1114        let sql = String::from("SELECT sqrt(1)");
1115        let mut tokenizer = Tokenizer::new(&sql);
1116        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1117
1118        let expected = vec![
1119            Token::make_keyword("SELECT"),
1120            Token::Whitespace(Whitespace::Space),
1121            Token::make_word("sqrt", None),
1122            Token::LParen,
1123            Token::Number(String::from("1")),
1124            Token::RParen,
1125        ];
1126
1127        compare(expected, tokens);
1128    }
1129
1130    #[test]
1131    fn tokenize_string_string_concat() {
1132        let sql = String::from("SELECT 'a' || 'b'");
1133        let mut tokenizer = Tokenizer::new(&sql);
1134        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1135
1136        let expected = vec![
1137            Token::make_keyword("SELECT"),
1138            Token::Whitespace(Whitespace::Space),
1139            Token::SingleQuotedString(String::from("a")),
1140            Token::Whitespace(Whitespace::Space),
1141            Token::Op("||".to_owned()),
1142            Token::Whitespace(Whitespace::Space),
1143            Token::SingleQuotedString(String::from("b")),
1144        ];
1145
1146        compare(expected, tokens);
1147    }
1148
1149    #[test]
1150    fn tokenize_escaped_double_quote_in_delimited_identifier() {
1151        let sql = String::from(r###"SELECT "a""b", "x""""y""###);
1152        let mut tokenizer = Tokenizer::new(&sql);
1153        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1154
1155        let expected = vec![
1156            Token::make_keyword("SELECT"),
1157            Token::Whitespace(Whitespace::Space),
1158            Token::make_word("a\"b", Some('"')),
1159            Token::Comma,
1160            Token::Whitespace(Whitespace::Space),
1161            Token::make_word("x\"\"y", Some('"')),
1162        ];
1163
1164        compare(expected, tokens);
1165    }
1166
1167    #[test]
1168    fn display_escaped_double_quote_in_delimited_identifier() {
1169        let sql = String::from(r###"SELECT "a""b", "x""""y""###);
1170        let mut tokenizer = Tokenizer::new(&sql);
1171        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1172
1173        assert_eq!(
1174            tokens.iter().map(ToString::to_string).collect::<String>(),
1175            sql
1176        );
1177    }
1178
1179    #[test]
1180    fn tokenize_bitwise_op() {
1181        let sql = String::from("SELECT one | two ^ three");
1182        let mut tokenizer = Tokenizer::new(&sql);
1183        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1184
1185        let expected = vec![
1186            Token::make_keyword("SELECT"),
1187            Token::Whitespace(Whitespace::Space),
1188            Token::make_word("one", None),
1189            Token::Whitespace(Whitespace::Space),
1190            Token::Pipe,
1191            Token::Whitespace(Whitespace::Space),
1192            Token::make_word("two", None),
1193            Token::Whitespace(Whitespace::Space),
1194            Token::Caret,
1195            Token::Whitespace(Whitespace::Space),
1196            Token::make_word("three", None),
1197        ];
1198        compare(expected, tokens);
1199    }
1200
1201    #[test]
1202    fn tokenize_tagged_dollar_quoted_string_with_inner_different_tag() {
1203        let sql = String::from("SELECT $foo$the content with $bar$nested$bar$ usage$foo$");
1204        let mut tokenizer = Tokenizer::new(&sql);
1205        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1206
1207        let expected = vec![
1208            Token::make_keyword("SELECT"),
1209            Token::Whitespace(Whitespace::Space),
1210            Token::DollarQuotedString(DollarQuotedString {
1211                tag: Some("foo".into()),
1212                value: "the content with $bar$nested$bar$ usage".into(),
1213            }),
1214        ];
1215
1216        compare(expected, tokens);
1217    }
1218
1219    #[test]
1220    fn tokenize_tagged_dollar_quoted_string_with_identifier_tag() {
1221        let sql = String::from("SELECT $_tag_1$hello$_tag_1$");
1222        let mut tokenizer = Tokenizer::new(&sql);
1223        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1224
1225        let expected = vec![
1226            Token::make_keyword("SELECT"),
1227            Token::Whitespace(Whitespace::Space),
1228            Token::DollarQuotedString(DollarQuotedString {
1229                tag: Some("_tag_1".into()),
1230                value: "hello".into(),
1231            }),
1232        ];
1233
1234        compare(expected, tokens);
1235    }
1236
1237    #[test]
1238    fn tokenize_dollar_quoted_string_with_invalid_tag() {
1239        let sql = String::from("SELECT $1tag$hello$1tag$");
1240        let mut tokenizer = Tokenizer::new(&sql);
1241        let error = tokenizer.tokenize_with_whitespace().unwrap_err();
1242
1243        assert!(
1244            error
1245                .to_string()
1246                .contains("Invalid dollar-quoted string tag \"1tag\"")
1247        );
1248    }
1249
1250    #[test]
1251    fn tokenize_tagged_dollar_quoted_string_followed_by_alias_with_dollar() {
1252        let sql = String::from("SELECT $go$o$not nesting just $ sign$go$o$");
1253        let mut tokenizer = Tokenizer::new(&sql);
1254        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1255
1256        let expected = vec![
1257            Token::make_keyword("SELECT"),
1258            Token::Whitespace(Whitespace::Space),
1259            Token::DollarQuotedString(DollarQuotedString {
1260                tag: Some("go".into()),
1261                value: "o$not nesting just $ sign".into(),
1262            }),
1263            Token::make_word("o$", None),
1264        ];
1265
1266        compare(expected, tokens);
1267    }
1268
1269    #[test]
1270    fn tokenize_logical_xor() {
1271        let sql =
1272            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
1273        let mut tokenizer = Tokenizer::new(&sql);
1274        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1275
1276        let expected = vec![
1277            Token::make_keyword("SELECT"),
1278            Token::Whitespace(Whitespace::Space),
1279            Token::make_keyword("true"),
1280            Token::Whitespace(Whitespace::Space),
1281            Token::make_keyword("XOR"),
1282            Token::Whitespace(Whitespace::Space),
1283            Token::make_keyword("true"),
1284            Token::Comma,
1285            Token::Whitespace(Whitespace::Space),
1286            Token::make_keyword("false"),
1287            Token::Whitespace(Whitespace::Space),
1288            Token::make_keyword("XOR"),
1289            Token::Whitespace(Whitespace::Space),
1290            Token::make_keyword("false"),
1291            Token::Comma,
1292            Token::Whitespace(Whitespace::Space),
1293            Token::make_keyword("true"),
1294            Token::Whitespace(Whitespace::Space),
1295            Token::make_keyword("XOR"),
1296            Token::Whitespace(Whitespace::Space),
1297            Token::make_keyword("false"),
1298            Token::Comma,
1299            Token::Whitespace(Whitespace::Space),
1300            Token::make_keyword("false"),
1301            Token::Whitespace(Whitespace::Space),
1302            Token::make_keyword("XOR"),
1303            Token::Whitespace(Whitespace::Space),
1304            Token::make_keyword("true"),
1305        ];
1306        compare(expected, tokens);
1307    }
1308
1309    #[test]
1310    fn tokenize_simple_select() {
1311        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
1312        let mut tokenizer = Tokenizer::new(&sql);
1313        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1314
1315        let expected = vec![
1316            Token::make_keyword("SELECT"),
1317            Token::Whitespace(Whitespace::Space),
1318            Token::Mul,
1319            Token::Whitespace(Whitespace::Space),
1320            Token::make_keyword("FROM"),
1321            Token::Whitespace(Whitespace::Space),
1322            Token::make_word("customer", None),
1323            Token::Whitespace(Whitespace::Space),
1324            Token::make_keyword("WHERE"),
1325            Token::Whitespace(Whitespace::Space),
1326            Token::make_word("id", None),
1327            Token::Whitespace(Whitespace::Space),
1328            Token::Eq,
1329            Token::Whitespace(Whitespace::Space),
1330            Token::Number(String::from("1")),
1331            Token::Whitespace(Whitespace::Space),
1332            Token::make_keyword("LIMIT"),
1333            Token::Whitespace(Whitespace::Space),
1334            Token::Number(String::from("5")),
1335        ];
1336
1337        compare(expected, tokens);
1338    }
1339
1340    #[test]
1341    fn tokenize_explain_select() {
1342        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
1343        let mut tokenizer = Tokenizer::new(&sql);
1344        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1345
1346        let expected = vec![
1347            Token::make_keyword("EXPLAIN"),
1348            Token::Whitespace(Whitespace::Space),
1349            Token::make_keyword("SELECT"),
1350            Token::Whitespace(Whitespace::Space),
1351            Token::Mul,
1352            Token::Whitespace(Whitespace::Space),
1353            Token::make_keyword("FROM"),
1354            Token::Whitespace(Whitespace::Space),
1355            Token::make_word("customer", None),
1356            Token::Whitespace(Whitespace::Space),
1357            Token::make_keyword("WHERE"),
1358            Token::Whitespace(Whitespace::Space),
1359            Token::make_word("id", None),
1360            Token::Whitespace(Whitespace::Space),
1361            Token::Eq,
1362            Token::Whitespace(Whitespace::Space),
1363            Token::Number(String::from("1")),
1364        ];
1365
1366        compare(expected, tokens);
1367    }
1368
1369    #[test]
1370    fn tokenize_explain_analyze_select() {
1371        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
1372        let mut tokenizer = Tokenizer::new(&sql);
1373        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1374
1375        let expected = vec![
1376            Token::make_keyword("EXPLAIN"),
1377            Token::Whitespace(Whitespace::Space),
1378            Token::make_keyword("ANALYZE"),
1379            Token::Whitespace(Whitespace::Space),
1380            Token::make_keyword("SELECT"),
1381            Token::Whitespace(Whitespace::Space),
1382            Token::Mul,
1383            Token::Whitespace(Whitespace::Space),
1384            Token::make_keyword("FROM"),
1385            Token::Whitespace(Whitespace::Space),
1386            Token::make_word("customer", None),
1387            Token::Whitespace(Whitespace::Space),
1388            Token::make_keyword("WHERE"),
1389            Token::Whitespace(Whitespace::Space),
1390            Token::make_word("id", None),
1391            Token::Whitespace(Whitespace::Space),
1392            Token::Eq,
1393            Token::Whitespace(Whitespace::Space),
1394            Token::Number(String::from("1")),
1395        ];
1396
1397        compare(expected, tokens);
1398    }
1399
1400    #[test]
1401    fn tokenize_string_predicate() {
1402        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
1403        let mut tokenizer = Tokenizer::new(&sql);
1404        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1405
1406        let expected = vec![
1407            Token::make_keyword("SELECT"),
1408            Token::Whitespace(Whitespace::Space),
1409            Token::Mul,
1410            Token::Whitespace(Whitespace::Space),
1411            Token::make_keyword("FROM"),
1412            Token::Whitespace(Whitespace::Space),
1413            Token::make_word("customer", None),
1414            Token::Whitespace(Whitespace::Space),
1415            Token::make_keyword("WHERE"),
1416            Token::Whitespace(Whitespace::Space),
1417            Token::make_word("salary", None),
1418            Token::Whitespace(Whitespace::Space),
1419            Token::Neq,
1420            Token::Whitespace(Whitespace::Space),
1421            Token::SingleQuotedString(String::from("Not Provided")),
1422        ];
1423
1424        compare(expected, tokens);
1425    }
1426
1427    #[test]
1428    fn tokenize_invalid_string() {
1429        let sql = String::from("\nمصطفىh");
1430        let mut tokenizer = Tokenizer::new(&sql);
1431        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1432        // println!("tokens: {:#?}", tokens);
1433        let expected = vec![
1434            Token::Whitespace(Whitespace::Newline),
1435            Token::Char('م'),
1436            Token::Char('ص'),
1437            Token::Char('ط'),
1438            Token::Char('ف'),
1439            Token::Char('ى'),
1440            Token::make_word("h", None),
1441        ];
1442        compare(expected, tokens);
1443    }
1444
1445    #[test]
1446    fn tokenize_newline_in_string_literal() {
1447        let sql = String::from("'foo\r\nbar\nbaz'");
1448        let mut tokenizer = Tokenizer::new(&sql);
1449        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1450        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_owned())];
1451        compare(expected, tokens);
1452    }
1453
1454    #[test]
1455    fn tokenize_unterminated_string_literal() {
1456        let sql = String::from("select 'foo");
1457        let mut tokenizer = Tokenizer::new(&sql);
1458        assert_eq!(
1459            tokenizer.tokenize_with_whitespace(),
1460            Err(TokenizerError {
1461                message: "Unterminated string literal".to_owned(),
1462                line: 1,
1463                col: 12,
1464                context: "LINE 1: select 'foo\n                   ^".to_owned(),
1465            })
1466        );
1467    }
1468
1469    #[test]
1470    fn tokenize_invalid_string_cols() {
1471        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
1472        let mut tokenizer = Tokenizer::new(&sql);
1473        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1474        // println!("tokens: {:#?}", tokens);
1475        let expected = vec![
1476            Token::Whitespace(Whitespace::Newline),
1477            Token::Whitespace(Whitespace::Newline),
1478            Token::make_keyword("SELECT"),
1479            Token::Whitespace(Whitespace::Space),
1480            Token::Mul,
1481            Token::Whitespace(Whitespace::Space),
1482            Token::make_keyword("FROM"),
1483            Token::Whitespace(Whitespace::Space),
1484            Token::make_keyword("table"),
1485            Token::Whitespace(Whitespace::Tab),
1486            Token::Char('م'),
1487            Token::Char('ص'),
1488            Token::Char('ط'),
1489            Token::Char('ف'),
1490            Token::Char('ى'),
1491            Token::make_word("h", None),
1492        ];
1493        compare(expected, tokens);
1494    }
1495
1496    #[test]
1497    fn tokenize_right_arrow() {
1498        let sql = String::from("FUNCTION(key=>value)");
1499        let mut tokenizer = Tokenizer::new(&sql);
1500        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1501        let expected = vec![
1502            Token::make_word("FUNCTION", None),
1503            Token::LParen,
1504            Token::make_word("key", None),
1505            Token::RArrow,
1506            Token::make_word("value", None),
1507            Token::RParen,
1508        ];
1509        compare(expected, tokens);
1510    }
1511
1512    #[test]
1513    fn tokenize_is_null() {
1514        let sql = String::from("a IS NULL");
1515        let mut tokenizer = Tokenizer::new(&sql);
1516        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1517
1518        let expected = vec![
1519            Token::make_word("a", None),
1520            Token::Whitespace(Whitespace::Space),
1521            Token::make_keyword("IS"),
1522            Token::Whitespace(Whitespace::Space),
1523            Token::make_keyword("NULL"),
1524        ];
1525
1526        compare(expected, tokens);
1527    }
1528
1529    #[test]
1530    fn tokenize_comment() {
1531        let sql = String::from("0--this is a comment\n1");
1532        let mut tokenizer = Tokenizer::new(&sql);
1533        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1534        let expected = vec![
1535            Token::Number("0".to_owned()),
1536            Token::Whitespace(Whitespace::SingleLineComment {
1537                prefix: "--".to_owned(),
1538                comment: "this is a comment\n".to_owned(),
1539            }),
1540            Token::Number("1".to_owned()),
1541        ];
1542        compare(expected, tokens);
1543    }
1544
1545    #[test]
1546    fn tokenize_comment_at_eof() {
1547        let sql = String::from("--this is a comment");
1548        let mut tokenizer = Tokenizer::new(&sql);
1549        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1550        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
1551            prefix: "--".to_owned(),
1552            comment: "this is a comment".to_owned(),
1553        })];
1554        compare(expected, tokens);
1555    }
1556
1557    #[test]
1558    fn tokenize_multiline_comment() {
1559        let sql = String::from("0/*multi-line\n* /comment*/1");
1560        let mut tokenizer = Tokenizer::new(&sql);
1561        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1562        let expected = vec![
1563            Token::Number("0".to_owned()),
1564            Token::Whitespace(Whitespace::MultiLineComment(
1565                "multi-line\n* /comment".to_owned(),
1566            )),
1567            Token::Number("1".to_owned()),
1568        ];
1569        compare(expected, tokens);
1570    }
1571
1572    #[test]
1573    fn tokenize_nested_multiline_comment() {
1574        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
1575        let mut tokenizer = Tokenizer::new(&sql);
1576        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1577        let expected = vec![
1578            Token::Number("0".to_owned()),
1579            Token::Whitespace(Whitespace::MultiLineComment(
1580                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_owned(),
1581            )),
1582            Token::Number("1".to_owned()),
1583        ];
1584        compare(expected, tokens);
1585    }
1586
1587    #[test]
1588    fn tokenize_multiline_comment_with_even_asterisks() {
1589        let sql = String::from("\n/** Comment **/\n");
1590        let mut tokenizer = Tokenizer::new(&sql);
1591        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1592        let expected = vec![
1593            Token::Whitespace(Whitespace::Newline),
1594            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_owned())),
1595            Token::Whitespace(Whitespace::Newline),
1596        ];
1597        compare(expected, tokens);
1598    }
1599
1600    #[test]
1601    fn tokenize_mismatched_quotes() {
1602        let sql = String::from("\"foo");
1603        let mut tokenizer = Tokenizer::new(&sql);
1604        assert_eq!(
1605            tokenizer.tokenize_with_whitespace(),
1606            Err(TokenizerError {
1607                message: "Expected close delimiter '\"' before EOF.".to_owned(),
1608                line: 1,
1609                col: 5,
1610                context: "LINE 1: \"foo\n            ^".to_owned(),
1611            })
1612        );
1613    }
1614
1615    #[test]
1616    fn tokenize_newlines() {
1617        let sql = String::from("line1\nline2\rline3\r\nline4\r");
1618        let mut tokenizer = Tokenizer::new(&sql);
1619        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1620        let expected = vec![
1621            Token::make_word("line1", None),
1622            Token::Whitespace(Whitespace::Newline),
1623            Token::make_word("line2", None),
1624            Token::Whitespace(Whitespace::Newline),
1625            Token::make_word("line3", None),
1626            Token::Whitespace(Whitespace::Newline),
1627            Token::make_word("line4", None),
1628            Token::Whitespace(Whitespace::Newline),
1629        ];
1630        compare(expected, tokens);
1631    }
1632
1633    #[test]
1634    fn tokenize_pg_regex_match() {
1635        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
1636        let mut tokenizer = Tokenizer::new(sql);
1637        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1638        let expected = vec![
1639            Token::make_keyword("SELECT"),
1640            Token::Whitespace(Whitespace::Space),
1641            Token::make_word("col", None),
1642            Token::Whitespace(Whitespace::Space),
1643            Token::Op("~".to_owned()),
1644            Token::Whitespace(Whitespace::Space),
1645            Token::SingleQuotedString("^a".into()),
1646            Token::Comma,
1647            Token::Whitespace(Whitespace::Space),
1648            Token::make_word("col", None),
1649            Token::Whitespace(Whitespace::Space),
1650            Token::Op("~*".to_owned()),
1651            Token::Whitespace(Whitespace::Space),
1652            Token::SingleQuotedString("^a".into()),
1653            Token::Comma,
1654            Token::Whitespace(Whitespace::Space),
1655            Token::make_word("col", None),
1656            Token::Whitespace(Whitespace::Space),
1657            Token::Op("!~".to_owned()),
1658            Token::Whitespace(Whitespace::Space),
1659            Token::SingleQuotedString("^a".into()),
1660            Token::Comma,
1661            Token::Whitespace(Whitespace::Space),
1662            Token::make_word("col", None),
1663            Token::Whitespace(Whitespace::Space),
1664            Token::Op("!~*".to_owned()),
1665            Token::Whitespace(Whitespace::Space),
1666            Token::SingleQuotedString("^a".into()),
1667        ];
1668        compare(expected, tokens);
1669    }
1670
1671    #[test]
1672    fn tokenize_select_array() {
1673        let sql = String::from("SELECT '{1, 2, 3}'");
1674        let mut tokenizer = Tokenizer::new(&sql);
1675        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1676
1677        let expected = vec![
1678            Token::make_keyword("SELECT"),
1679            Token::Whitespace(Whitespace::Space),
1680            Token::SingleQuotedString(String::from("{1, 2, 3}")),
1681        ];
1682
1683        compare(expected, tokens);
1684    }
1685
1686    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
1687        // println!("------------------------------");
1688        // println!("tokens   = {:?}", actual);
1689        // println!("expected = {:?}", expected);
1690        // println!("------------------------------");
1691        assert_eq!(expected, actual);
1692    }
1693}