risingwave_sqlparser/
tokenizer.rs

1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4//
5//     http://www.apache.org/licenses/LICENSE-2.0
6//
7// Unless required by applicable law or agreed to in writing, software
8// distributed under the License is distributed on an "AS IS" BASIS,
9// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10// See the License for the specific language governing permissions and
11// limitations under the License.
12
13//! SQL Tokenizer
14//!
15//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
16//!
17//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
18
19use std::fmt;
20use std::fmt::Debug;
21use std::iter::Peekable;
22use std::str::Chars;
23
24use crate::ast::{CstyleEscapedString, DollarQuotedString};
25use crate::keywords::{ALL_KEYWORDS, ALL_KEYWORDS_INDEX, Keyword};
26
/// SQL Token enumeration
///
/// Tokens are produced by [`Tokenizer`] and rendered back to SQL-like text
/// by their `Display` implementation.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Single quoted string with c-style escapes: i.e: E'string'
    CstyleEscapesString(CstyleEscapedString),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Parameter symbols: i.e:  $1, $2
    Parameter(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, newline, or comment)
    Whitespace(Whitespace),
    /// Custom Operator: any run of operator characters not matched by one of
    /// the named operator tokens below
    Op(String),
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Modulo Operator `%`
    Mod,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in postgresql)
    DoubleColon,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
}
107
108impl fmt::Display for Token {
109    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110        match self {
111            Token::EOF => f.write_str("EOF"),
112            Token::Word(w) => write!(f, "{}", w),
113            Token::Number(n) => write!(f, "{}", n),
114            Token::Char(c) => write!(f, "{}", c),
115            Token::SingleQuotedString(s) => write!(f, "'{}'", s),
116            Token::DollarQuotedString(s) => write!(f, "{}", s),
117            Token::NationalStringLiteral(s) => write!(f, "N'{}'", s),
118            Token::HexStringLiteral(s) => write!(f, "X'{}'", s),
119            Token::CstyleEscapesString(s) => write!(f, "E'{}'", s),
120            Token::Parameter(s) => write!(f, "${}", s),
121            Token::Comma => f.write_str(","),
122            Token::Whitespace(ws) => write!(f, "{}", ws),
123            Token::Op(op) => write!(f, "{}", op),
124            Token::Eq => f.write_str("="),
125            Token::Neq => f.write_str("<>"),
126            Token::Lt => f.write_str("<"),
127            Token::Gt => f.write_str(">"),
128            Token::LtEq => f.write_str("<="),
129            Token::GtEq => f.write_str(">="),
130            Token::Plus => f.write_str("+"),
131            Token::Minus => f.write_str("-"),
132            Token::Mul => f.write_str("*"),
133            Token::Div => f.write_str("/"),
134            Token::Mod => f.write_str("%"),
135            Token::LParen => f.write_str("("),
136            Token::RParen => f.write_str(")"),
137            Token::Period => f.write_str("."),
138            Token::Colon => f.write_str(":"),
139            Token::DoubleColon => f.write_str("::"),
140            Token::SemiColon => f.write_str(";"),
141            Token::Backslash => f.write_str("\\"),
142            Token::LBracket => f.write_str("["),
143            Token::RBracket => f.write_str("]"),
144            Token::Caret => f.write_str("^"),
145            Token::Pipe => f.write_str("|"),
146            Token::LBrace => f.write_str("{"),
147            Token::RBrace => f.write_str("}"),
148            Token::RArrow => f.write_str("=>"),
149        }
150    }
151}
152
153impl Token {
154    pub fn make_keyword(keyword: &str) -> Self {
155        Token::make_word(keyword, None)
156    }
157
158    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
159        let word_uppercase = word.to_uppercase();
160        Token::Word(Word {
161            value: word.to_owned(),
162            quote_style,
163            keyword: if quote_style.is_none() {
164                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
165                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
166            } else {
167                Keyword::NoKeyword
168            },
169        })
170    }
171
172    pub fn with_location(self, location: Location) -> TokenWithLocation {
173        TokenWithLocation::new(self, location.line, location.column)
174    }
175}
176
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise
    /// `Keyword::NoKeyword`
    pub keyword: Keyword,
}
191
192impl fmt::Display for Word {
193    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
194        match self.quote_style {
195            Some(s) if s == '"' || s == '[' || s == '`' => {
196                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
197            }
198            None => f.write_str(&self.value),
199            _ => panic!("Unexpected quote_style!"),
200        }
201    }
202}
203
204impl Word {
205    fn matching_end_quote(ch: char) -> char {
206        match ch {
207            '"' => '"', // ANSI and most dialects
208            '[' => ']', // MS SQL
209            '`' => '`', // MySQL
210            _ => panic!("unexpected quoting style!"),
211        }
212    }
213}
214
/// A piece of whitespace or a comment, preserved as a token so callers can
/// choose to keep or drop it.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Whitespace {
    /// A single space character.
    Space,
    /// A line break (`\n`; `\r` and `\r\n` are normalized to this).
    Newline,
    /// A single tab character.
    Tab,
    /// A `--` style comment; `prefix` holds the `--` and `comment` the text
    /// through the terminating newline (if any).
    SingleLineComment { comment: String, prefix: String },
    /// The body of a `/* ... */` comment, without the delimiters.
    MultiLineComment(String),
}
223
224impl fmt::Display for Whitespace {
225    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
226        match self {
227            Whitespace::Space => f.write_str(" "),
228            Whitespace::Newline => f.write_str("\n"),
229            Whitespace::Tab => f.write_str("\t"),
230            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
231            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
232        }
233    }
234}
235
/// Location in input string
///
/// Both coordinates are 1-based for real tokens; the EOF sentinel uses
/// (0, 0) (see [`TokenWithLocation::eof`]).
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Line column, starting from 1
    pub column: u64,
}
244
/// A [Token] with [Location] attached to it
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    /// The token itself.
    pub token: Token,
    /// Position of the token's first character in the source text.
    pub location: Location,
}
251
252impl TokenWithLocation {
253    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
254        TokenWithLocation {
255            token,
256            location: Location { line, column },
257        }
258    }
259
260    pub fn eof() -> TokenWithLocation {
261        TokenWithLocation::new(Token::EOF, 0, 0)
262    }
263}
264
265impl PartialEq<Token> for TokenWithLocation {
266    fn eq(&self, other: &Token) -> bool {
267        &self.token == other
268    }
269}
270
271impl PartialEq<TokenWithLocation> for Token {
272    fn eq(&self, other: &TokenWithLocation) -> bool {
273        self == &other.token
274    }
275}
276
277impl fmt::Display for TokenWithLocation {
278    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
279        if self.token == Token::EOF {
280            write!(f, "end of input")
281        } else {
282            write!(
283                f,
284                "{} at line {}, column {}",
285                self.token, self.location.line, self.location.column
286            )
287        }
288    }
289}
290
/// Tokenizer error
#[derive(Debug, PartialEq)]
pub struct TokenizerError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Line of the error, 1-based.
    pub line: u64,
    /// Column of the error, 1-based.
    pub col: u64,
    /// Pre-rendered snippet: the offending source line with a `^` caret
    /// underneath pointing at the error column.
    pub context: String,
}
299
300impl fmt::Display for TokenizerError {
301    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
302        write!(
303            f,
304            "{} at line {}, column {}\n{}",
305            self.message, self.line, self.col, self.context
306        )
307    }
308}
309
310impl std::error::Error for TokenizerError {}
311
/// SQL Tokenizer
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The full statement being tokenized; kept so errors can quote the
    /// offending source line.
    sql: &'a str,
    /// Cursor over the characters of `sql`.
    chars: Peekable<Chars<'a>>,
    /// Current line, 1-based; advanced on `\n`.
    line: u64,
    /// Current column, 1-based; a tab advances it by 4.
    col: u64,
}
320
321impl<'a> Tokenizer<'a> {
322    /// Create a new SQL tokenizer for the specified SQL statement
323    pub fn new(query: &'a str) -> Self {
324        Self {
325            sql: query,
326            chars: query.chars().peekable(),
327            line: 1,
328            col: 1,
329        }
330    }
331
332    /// Consume the next character.
333    fn next(&mut self) -> Option<char> {
334        let ch = self.chars.next();
335        if let Some(ch) = ch {
336            match ch {
337                '\n' => {
338                    self.line += 1;
339                    self.col = 1;
340                }
341                '\t' => self.col += 4,
342                _ => self.col += 1,
343            }
344        }
345        ch
346    }
347
348    /// Return the next character without consuming it.
349    fn peek(&mut self) -> Option<char> {
350        self.chars.peek().cloned()
351    }
352
353    /// Tokenize the statement and produce a vector of tokens with locations.
354    ///
355    /// Whitespaces are skipped.
356    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
357        let tokens = self.tokenize()?;
358        Ok(tokens
359            .into_iter()
360            .filter(|token| !matches!(&token.token, Token::Whitespace(_)))
361            .collect())
362    }
363
364    /// Tokenize the statement and produce a vector of tokens.
365    ///
366    /// Whitespaces are included.
367    #[allow(dead_code)]
368    fn tokenize_with_whitespace(&mut self) -> Result<Vec<Token>, TokenizerError> {
369        let tokens = self.tokenize()?;
370        Ok(tokens.into_iter().map(|t| t.token).collect())
371    }
372
373    /// Tokenize the statement and produce a vector of tokens.
374    ///
375    /// Whitespaces are included.
376    fn tokenize(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
377        let mut tokens = Vec::new();
378        while let Some(token) = self.next_token_with_location()? {
379            tokens.push(token);
380        }
381        Ok(tokens)
382    }
383
384    /// Get the next token or return None
385    fn next_token_with_location(&mut self) -> Result<Option<TokenWithLocation>, TokenizerError> {
386        let loc = Location {
387            line: self.line,
388            column: self.col,
389        };
390        self.next_token()
391            .map(|t| t.map(|token| token.with_location(loc)))
392    }
393
    /// Get the next token or return None
    ///
    /// Returns `Ok(Some(token))` on success, or `Ok(None)` once the input is
    /// exhausted. Whitespace and comments are returned as
    /// `Token::Whitespace` rather than skipped.
    fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        // Expands to the set of characters PostgreSQL allows in operator
        // names, either as a match pattern (`as_pat`) or an array (`as_arr`).
        // `ext` is the subset beyond the seven "simple" operator chars; its
        // presence in an operator lifts the trailing `+`/`-` restriction
        // (see below).
        macro_rules! op_chars {
            // https://www.postgresql.org/docs/17/sql-syntax-lexical.html#SQL-SYNTAX-OPERATORS
            (all as_pat) => {
                '+' | '-' | '*' | '/' | '<' | '>' | '=' | op_chars!(ext as_pat)
            };
            (ext $m:ident) => {
                op_chars!($m '~' '!' '@' '#' '%' '^' '&' '|' '`' '?')
            };
            (as_arr $($c:literal)+) => {
                [ $($c),+ ]
            };
            (as_pat $($c:literal)+) => {
                $($c)|+
            };
        }

        match self.peek() {
            Some(ch) => match ch {
                ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    self.next();
                    if let Some('\n') = self.peek() {
                        self.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                'N' => {
                    self.next(); // consume, to check the next char
                    match self.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_single_quoted_string()?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word('N');
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'e' | x @ 'E' => {
                    self.next(); // consume, to check the next char
                    match self.peek() {
                        Some('\'') => {
                            // E'...' - a <character string literal> with c-style escapes
                            let s = self.tokenize_single_quoted_string_with_escape()?;
                            Ok(Some(Token::CstyleEscapesString(s)))
                        }
                        _ => {
                            // regular identifier starting with an "E"
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // The spec only allows an uppercase 'X' to introduce a hex
                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                x @ 'x' | x @ 'X' => {
                    self.next(); // consume, to check the next char
                    match self.peek() {
                        Some('\'') => {
                            // X'...' - a <binary string literal>
                            let s = self.tokenize_single_quoted_string()?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "X"
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // identifier or keyword
                ch if is_identifier_start(ch) => {
                    self.next(); // consume the first char
                    let s = self.tokenize_word(ch);

                    Ok(Some(Token::make_word(&s, None)))
                }
                // string
                '\'' => {
                    let s = self.tokenize_single_quoted_string()?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if is_delimited_identifier_start(quote_start) => {
                    self.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let s = self.peeking_take_while(|ch| ch != quote_end);
                    if self.next() == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.error(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        ))
                    }
                }
                // numbers and period
                '0'..='9' | '.' => {
                    let mut s = self.peeking_take_while(|ch| ch.is_ascii_digit());

                    // match binary literal that starts with 0x
                    if s == "0"
                        && let Some(radix) = self.peek()
                        && "xob".contains(radix.to_ascii_lowercase())
                    {
                        self.next();
                        let radix = radix.to_ascii_lowercase();
                        let base = match radix {
                            'x' => 16,
                            'o' => 8,
                            'b' => 2,
                            _ => unreachable!(),
                        };
                        // At least one digit must follow the radix prefix.
                        let s2 = self.peeking_take_while(|ch| ch.is_digit(base));
                        if s2.is_empty() {
                            return self.error("incomplete integer literal");
                        }
                        self.reject_number_junk()?;
                        return Ok(Some(Token::Number(format!("0{radix}{s2}"))));
                    }

                    // match one period
                    if let Some('.') = self.peek() {
                        s.push('.');
                        self.next();
                    }
                    s += &self.peeking_take_while(|ch| ch.is_ascii_digit());

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    match self.peek() {
                        // Number is a scientific number (1e6)
                        // NOTE(review): only a `-` exponent sign is consumed
                        // here; `1e+6` leaves the `+` for the operator path —
                        // confirm this matches the parser's expectations.
                        Some('e') | Some('E') => {
                            s.push('e');
                            self.next();

                            if let Some('-') = self.peek() {
                                s.push('-');
                                self.next();
                            }
                            s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
                            self.reject_number_junk()?;
                            return Ok(Some(Token::Number(s)));
                        }
                        // Not a scientific number
                        _ => {}
                    };
                    self.reject_number_junk()?;
                    Ok(Some(Token::Number(s)))
                }
                // punctuation
                '(' => self.consume_and_return(Token::LParen),
                ')' => self.consume_and_return(Token::RParen),
                ',' => self.consume_and_return(Token::Comma),
                ':' => {
                    self.next();
                    match self.peek() {
                        Some(':') => self.consume_and_return(Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value()?)),
                ';' => self.consume_and_return(Token::SemiColon),
                '\\' => self.consume_and_return(Token::Backslash),
                '[' => self.consume_and_return(Token::LBracket),
                ']' => self.consume_and_return(Token::RBracket),
                '{' => self.consume_and_return(Token::LBrace),
                '}' => self.consume_and_return(Token::RBrace),
                // operators
                op_chars!(all as_pat) => {
                    // Greedily scan the longest run of operator chars on a
                    // *clone* of the tokenizer, so we can cheaply backtrack if
                    // only a prefix of the run is kept.
                    let mut trial = self.clone();
                    let op_taken = trial.peeking_take_while(|c| matches!(c, op_chars!(all as_pat)));
                    // It is safe to assume byte index is char index in `op_token` below.

                    // https://www.postgresql.org/docs/17/sql-syntax-lexical.html#SQL-SYNTAX-OPERATORS
                    // https://github.com/postgres/postgres/blob/REL_17_4/src/backend/parser/scan.l#L900-L1006
                    // `/*` and `--` begin comments, so the operator ends at
                    // whichever of them occurs first (if any).
                    let slash_star = op_taken.find("/*");
                    let dash_dash = op_taken.find("--");
                    let pos = match (slash_star, dash_dash) {
                        (Some(s), Some(d)) => s.min(d),
                        (Some(s), None) => s,
                        (None, Some(d)) => d,
                        (None, None) => op_taken.len(),
                    };
                    let mut op = &op_taken[..pos];
                    if op.is_empty() {
                        // The run *starts* with a comment opener: consume the
                        // two-char opener and tokenize the comment instead.
                        match self.next() {
                            Some('-') => {
                                self.next(); // consume the second '-', starting a single-line comment
                                let comment = self.tokenize_single_line_comment();

                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }
                            Some('/') => {
                                self.next(); // consume the '*', starting a multi-line comment
                                return self.tokenize_multiline_comment();
                            }
                            _ => unreachable!(),
                        }
                    };
                    // PostgreSQL rule: a multi-char operator may end in `+` or
                    // `-` only if it also contains an "extended" char;
                    // otherwise trailing `+`/`-` are split off (so `1+-2`
                    // lexes as `1`, `+`, `-`, `2`).
                    if op.len() > 1
                        && op.ends_with(['+', '-'])
                        && !op.contains(op_chars!(ext as_arr))
                    {
                        op = op.trim_end_matches(['+', '-']);
                        if op.is_empty() {
                            // The run was nothing but signs: keep just the first.
                            op = &op_taken[..1];
                        }
                    }
                    if op.len() == op_taken.len() {
                        // Whole run kept: adopt the trial tokenizer's state.
                        *self = trial;
                    } else {
                        // Only a prefix kept: advance the real tokenizer by
                        // that many chars (operator chars are all 1 byte).
                        for _ in op.chars() {
                            self.next();
                        }
                    }
                    match op {
                        // https://github.com/postgres/postgres/blob/REL_17_4/src/backend/parser/scan.l#L965-L973
                        "+" => Ok(Some(Token::Plus)),
                        "-" => Ok(Some(Token::Minus)),
                        "*" => Ok(Some(Token::Mul)),
                        "/" => Ok(Some(Token::Div)),
                        "%" => Ok(Some(Token::Mod)),
                        "^" => Ok(Some(Token::Caret)),
                        "<" => Ok(Some(Token::Lt)),
                        ">" => Ok(Some(Token::Gt)),
                        "=" => Ok(Some(Token::Eq)),
                        // https://github.com/postgres/postgres/blob/REL_17_4/src/backend/parser/scan.l#L974-L992
                        "=>" => Ok(Some(Token::RArrow)),
                        "<=" => Ok(Some(Token::LtEq)),
                        ">=" => Ok(Some(Token::GtEq)),
                        "<>" => Ok(Some(Token::Neq)),
                        "!=" => Ok(Some(Token::Neq)),
                        // Our support of `Expr::LambdaFunction` makes us PostgreSQL-incompatible here.
                        //     foo(bar, | x | x)
                        // In PostgreSQL, this is unary operator `|` applied to `x`, then bitwise-or `x`.
                        // In our dialect, this is a lambda function - the identity function.
                        "|" => Ok(Some(Token::Pipe)),
                        _ => Ok(Some(Token::Op(op.to_owned()))),
                    }
                }
                other => self.consume_and_return(Token::Char(other)),
            },
            None => Ok(None),
        }
    }
657
    /// Tokenize dollar preceded value (i.e: a string/placeholder)
    ///
    /// Entered with the cursor on the leading `$`. Distinguishes three forms:
    /// - `$$...$$` — an untagged dollar-quoted string,
    /// - `$tag$...$tag$` — a tagged dollar-quoted string,
    /// - `$name` — a parameter placeholder (e.g. `$1`).
    fn tokenize_dollar_preceded_value(&mut self) -> Result<Token, TokenizerError> {
        // `s` accumulates the quoted string body; `value` the tag (or the
        // parameter name).
        let mut s = String::new();
        let mut value = String::new();

        self.next(); // consume the leading '$'

        if let Some('$') = self.peek() {
            // `$$...$$`: scan for the closing `$$`. A lone `$` is held back
            // (tracked via `prev`) until the next char shows whether it
            // begins the terminator.
            self.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(ch) = self.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        // Found the closing `$$`.
                        self.next();
                        is_terminated = true;
                        break;
                    } else {
                        // The held-back `$` was not a terminator; emit it
                        // together with the current char.
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                self.next();
            }

            return if self.peek().is_none() && !is_terminated {
                self.error("Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Read the tag / parameter name: alphanumerics and underscores.
            value.push_str(&self.peeking_take_while(|ch| ch.is_alphanumeric() || ch == '_'));

            if let Some('$') = self.peek() {
                // `$tag$...`: a tagged dollar-quoted string. Read the body up
                // to the next `$`, then require the closing tag.
                self.next();
                s.push_str(&self.peeking_take_while(|ch| ch != '$'));

                match self.peek() {
                    Some('$') => {
                        self.next();
                        // The closing delimiter must repeat the tag exactly.
                        for c in value.chars() {
                            let next_char = self.next();
                            if Some(c) != next_char {
                                return self.error(format!(
                                    "Unterminated dollar-quoted string at or near \"{}\"",
                                    value
                                ));
                            }
                        }

                        if let Some('$') = self.peek() {
                            self.next();
                        } else {
                            return self.error("Unterminated dollar-quoted string, expected $");
                        }
                    }
                    _ => {
                        return self.error("Unterminated dollar-quoted, expected $");
                    }
                }
            } else {
                // No second `$`: a parameter placeholder like `$1`.
                return Ok(Token::Parameter(value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }
737
738    fn error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
739        let prefix = format!("LINE {}: ", self.line);
740        let sql_line = self.sql.split('\n').nth(self.line as usize - 1).unwrap();
741        let cursor = " ".repeat(prefix.len() + self.col as usize - 1);
742        let context = format!("{}{}\n{}^", prefix, sql_line, cursor);
743        Err(TokenizerError {
744            message: message.into(),
745            col: self.col,
746            line: self.line,
747            context,
748        })
749    }
750
751    fn reject_number_junk(&mut self) -> Result<(), TokenizerError> {
752        if let Some(ch) = self.peek()
753            && is_identifier_start(ch)
754        {
755            return self.error("trailing junk after numeric literal");
756        }
757        Ok(())
758    }
759
760    // Consume characters until newline
761    fn tokenize_single_line_comment(&mut self) -> String {
762        let mut comment = self.peeking_take_while(|ch| ch != '\n');
763        if let Some(ch) = self.next() {
764            assert_eq!(ch, '\n');
765            comment.push(ch);
766        }
767        comment
768    }
769
770    /// Tokenize an identifier or keyword, after the first char is already consumed.
771    fn tokenize_word(&mut self, first_char: char) -> String {
772        let mut s = first_char.to_string();
773        s.push_str(&self.peeking_take_while(is_identifier_part));
774        s
775    }
776
777    /// Read a single quoted string, starting with the opening quote.
778    fn tokenize_single_quoted_string(&mut self) -> Result<String, TokenizerError> {
779        let mut s = String::new();
780        self.next(); // consume the opening quote
781
782        // slash escaping is specific to MySQL dialect
783        let mut is_escaped = false;
784        while let Some(ch) = self.peek() {
785            match ch {
786                '\'' => {
787                    self.next(); // consume
788                    if is_escaped {
789                        s.push(ch);
790                        is_escaped = false;
791                    } else if self.peek().map(|c| c == '\'').unwrap_or(false) {
792                        s.push(ch);
793                        self.next();
794                    } else {
795                        return Ok(s);
796                    }
797                }
798                '\\' => {
799                    s.push(ch);
800                    self.next();
801                }
802                _ => {
803                    self.next(); // consume
804                    s.push(ch);
805                }
806            }
807        }
808        self.error("Unterminated string literal")
809    }
810
    /// Read a single quoted string with C-style escapes (`E'...'`), starting
    /// with the opening quote.
    ///
    /// Collects the raw (still-escaped) contents into `s`, then runs
    /// [`Self::unescape_c_style`] to produce the decoded value. Returns both
    /// forms, or an error on an unterminated literal / invalid escape.
    fn tokenize_single_quoted_string_with_escape(
        &mut self,
    ) -> Result<CstyleEscapedString, TokenizerError> {
        let mut terminated = false;
        let mut s = String::new();
        self.next(); // consume the opening quote

        while let Some(ch) = self.peek() {
            match ch {
                '\'' => {
                    self.next(); // consume
                    if self.peek().map(|c| c == '\'').unwrap_or(false) {
                        // `''` escapes a quote; record it as `\'` in the raw
                        // text so `unescape_c_style` decodes it uniformly.
                        s.push('\\');
                        s.push(ch);
                        self.next();
                    } else {
                        // lone quote terminates the literal
                        terminated = true;
                        break;
                    }
                }
                '\\' => {
                    s.push(ch);
                    self.next();
                    // Eagerly consume `\'` / `\\` as a pair so the escaped
                    // quote (or backslash) can't be mistaken for a terminator
                    // on the next loop iteration.
                    if self.peek().map(|c| c == '\'' || c == '\\').unwrap_or(false) {
                        s.push(self.next().unwrap());
                    }
                }
                _ => {
                    self.next(); // consume
                    s.push(ch);
                }
            }
        }

        if !terminated {
            return self.error("Unterminated string literal");
        }

        // Decode the collected escapes; report decode failures at the
        // current tokenizer position.
        let unescaped = match Self::unescape_c_style(&s) {
            Ok(unescaped) => unescaped,
            Err(e) => return self.error(e),
        };

        Ok(CstyleEscapedString {
            value: unescaped,
            raw: s,
        })
    }
860
861    /// Helper function used to convert string with c-style escapes into a normal string
862    /// e.g. 'hello\x3fworld' -> 'hello?world'
863    ///
864    /// Detail of c-style escapes refer from:
865    /// <https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE:~:text=4.1.2.2.%C2%A0String%20Constants%20With%20C%2DStyle%20Escapes>
866    fn unescape_c_style(s: &str) -> Result<String, String> {
867        fn hex_byte_process(
868            chars: &mut Peekable<Chars<'_>>,
869            res: &mut String,
870            len: usize,
871            default_char: char,
872        ) -> Result<(), String> {
873            let mut unicode_seq: String = String::with_capacity(len);
874            for _ in 0..len {
875                if let Some(c) = chars.peek()
876                    && c.is_ascii_hexdigit()
877                {
878                    unicode_seq.push(chars.next().unwrap());
879                } else {
880                    break;
881                }
882            }
883
884            if unicode_seq.is_empty() && len == 2 {
885                res.push(default_char);
886                return Ok(());
887            } else if unicode_seq.len() < len && len != 2 {
888                return Err("invalid unicode sequence: must be \\uXXXX or \\UXXXXXXXX".to_owned());
889            }
890
891            if len == 2 {
892                let number = [u8::from_str_radix(&unicode_seq, 16)
893                    .map_err(|e| format!("invalid unicode sequence: {}", e))?];
894
895                res.push(
896                    std::str::from_utf8(&number)
897                        .map_err(|err| format!("invalid unicode sequence: {}", err))?
898                        .chars()
899                        .next()
900                        .unwrap(),
901                );
902            } else {
903                let number = u32::from_str_radix(&unicode_seq, 16)
904                    .map_err(|e| format!("invalid unicode sequence: {}", e))?;
905                res.push(
906                    char::from_u32(number)
907                        .ok_or_else(|| format!("invalid unicode sequence: {}", unicode_seq))?,
908                );
909            }
910            Ok(())
911        }
912
913        fn octal_byte_process(
914            chars: &mut Peekable<Chars<'_>>,
915            res: &mut String,
916            digit: char,
917        ) -> Result<(), String> {
918            let mut unicode_seq: String = String::with_capacity(3);
919            unicode_seq.push(digit);
920            for _ in 0..2 {
921                if let Some(c) = chars.peek()
922                    && matches!(*c, '0'..='7')
923                {
924                    unicode_seq.push(chars.next().unwrap());
925                } else {
926                    break;
927                }
928            }
929
930            let number = [u8::from_str_radix(&unicode_seq, 8)
931                .map_err(|e| format!("invalid unicode sequence: {}", e))?];
932
933            res.push(
934                std::str::from_utf8(&number)
935                    .map_err(|err| format!("invalid unicode sequence: {}", err))?
936                    .chars()
937                    .next()
938                    .unwrap(),
939            );
940            Ok(())
941        }
942
943        let mut chars = s.chars().peekable();
944        let mut res = String::with_capacity(s.len());
945
946        while let Some(c) = chars.next() {
947            if c == '\\' {
948                match chars.next() {
949                    None => {
950                        return Err("unterminated escape sequence".to_owned());
951                    }
952                    Some(next_c) => match next_c {
953                        'b' => res.push('\u{08}'),
954                        'f' => res.push('\u{0C}'),
955                        'n' => res.push('\n'),
956                        'r' => res.push('\r'),
957                        't' => res.push('\t'),
958                        'x' => hex_byte_process(&mut chars, &mut res, 2, 'x')?,
959                        'u' => hex_byte_process(&mut chars, &mut res, 4, 'u')?,
960                        'U' => hex_byte_process(&mut chars, &mut res, 8, 'U')?,
961                        digit @ '0'..='7' => octal_byte_process(&mut chars, &mut res, digit)?,
962                        _ => res.push(next_c),
963                    },
964                }
965            } else {
966                res.push(c);
967            }
968        }
969
970        Ok(res)
971    }
972
973    fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
974        let mut s = String::new();
975
976        let mut nested = 1;
977        let mut last_ch = ' ';
978
979        loop {
980            match self.next() {
981                Some(ch) => {
982                    if last_ch == '/' && ch == '*' {
983                        nested += 1;
984                    } else if last_ch == '*' && ch == '/' {
985                        nested -= 1;
986                        if nested == 0 {
987                            s.pop();
988                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
989                        }
990                    }
991                    s.push(ch);
992                    last_ch = ch;
993                }
994                None => break self.error("Unexpected EOF while in a multi-line comment"),
995            }
996        }
997    }
998
999    #[allow(clippy::unnecessary_wraps)]
1000    fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
1001        self.next();
1002        Ok(Some(t))
1003    }
1004
1005    /// Read from `self` until `predicate` returns `false` or EOF is hit.
1006    /// Return the characters read as String, and keep the first non-matching
1007    /// char available as `self.next()`.
1008    fn peeking_take_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> String {
1009        let mut s = String::new();
1010        while let Some(ch) = self.peek() {
1011            if predicate(ch) {
1012                self.next(); // consume
1013                s.push(ch);
1014            } else {
1015                break;
1016            }
1017        }
1018        s
1019    }
1020}
1021
/// Determine if a character starts a quoted identifier. The default
/// implementation, accepting "double quoted" ids is both ANSI-compliant
/// and appropriate for most dialects (with the notable exception of
/// MySQL, MS SQL, and sqlite). You can accept one of characters listed
/// in `Word::matching_end_quote` here
fn is_delimited_identifier_start(ch: char) -> bool {
    matches!(ch, '"')
}
1030
/// Determine if a character is a valid start character for an unquoted identifier
fn is_identifier_start(ch: char) -> bool {
    // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
    // We don't yet support identifiers beginning with "letters with
    // diacritical marks and non-Latin letters"
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
}
1038
/// Determine if a character is a valid unquoted identifier character
fn is_identifier_part(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '$' | '_')
}
1043
#[cfg(test)]
mod tests {
    use super::*;

    // `TokenizerError` should render "message at line, column" plus context,
    // and expose no underlying source error.
    #[test]
    fn tokenizer_error_impl() {
        use std::error::Error;

        let err = TokenizerError {
            message: "test".into(),
            line: 1,
            col: 1,
            context: "LINE 1:".to_owned(),
        };

        assert!(err.source().is_none());
        assert_eq!(err.to_string(), "test at line 1, column 1\nLINE 1:");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    // A leading-dot float (`.1`) is one Number token, not Dot + Number.
    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1")),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::Op("||".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    // Non-ASCII letters are not identifier characters; each becomes a raw
    // `Char` token rather than causing a tokenizer error.
    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\nمصطفىh");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    // Newlines inside a quoted literal are kept verbatim in the token value.
    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_owned())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");
        let mut tokenizer = Tokenizer::new(&sql);
        assert_eq!(
            tokenizer.tokenize_with_whitespace(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_owned(),
                line: 1,
                col: 12,
                context: "LINE 1: select 'foo\n                   ^".to_owned(),
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    // Single-line comments keep their trailing newline in the comment text.
    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Number("0".to_owned()),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_owned(),
                comment: "this is a comment\n".to_owned(),
            }),
            Token::Number("1".to_owned()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_owned(),
            comment: "this is a comment".to_owned(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Number("0".to_owned()),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_owned(),
            )),
            Token::Number("1".to_owned()),
        ];
        compare(expected, tokens);
    }

    // `/* */` comments nest: only the matching outermost `*/` ends the comment.
    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Number("0".to_owned()),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_owned(),
            )),
            Token::Number("1".to_owned()),
        ];
        compare(expected, tokens);
    }

    // `/**` must not be miscounted as opening a nested comment.
    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_owned())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");
        let mut tokenizer = Tokenizer::new(&sql);
        assert_eq!(
            tokenizer.tokenize_with_whitespace(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_owned(),
                line: 1,
                col: 5,
                context: "LINE 1: \"foo\n            ^".to_owned(),
            })
        );
    }

    // `\n`, `\r` and `\r\n` each count as a single Newline token.
    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let mut tokenizer = Tokenizer::new(sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("~".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("~*".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("!~".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("!~*".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_array() {
        let sql = String::from("SELECT '{1, 2, 3}'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("{1, 2, 3}")),
        ];

        compare(expected, tokens);
    }

    /// Shared assertion helper comparing expected vs. actual token streams.
    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        // println!("------------------------------");
        // println!("tokens   = {:?}", actual);
        // println!("expected = {:?}", expected);
        // println!("------------------------------");
        assert_eq!(expected, actual);
    }
}