risingwave_sqlparser/
tokenizer.rs

1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4//
5//     http://www.apache.org/licenses/LICENSE-2.0
6//
7// Unless required by applicable law or agreed to in writing, software
8// distributed under the License is distributed on an "AS IS" BASIS,
9// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10// See the License for the specific language governing permissions and
11// limitations under the License.
12
13//! SQL Tokenizer
14//!
15//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
16//!
17//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
18
19#[cfg(not(feature = "std"))]
20use alloc::{
21    borrow::ToOwned,
22    format,
23    string::{String, ToString},
24    vec,
25    vec::Vec,
26};
27use core::fmt;
28use core::fmt::Debug;
29use core::iter::Peekable;
30use core::str::Chars;
31
32#[cfg(feature = "serde")]
33use serde::{Deserialize, Serialize};
34
35use crate::ast::{CstyleEscapedString, DollarQuotedString};
36use crate::keywords::{ALL_KEYWORDS, ALL_KEYWORDS_INDEX, Keyword};
37
/// SQL Token enumeration
///
/// The smallest lexical unit produced by the tokenizer: words (keywords or
/// identifiers), literals, punctuation, operators, and whitespace. The
/// `Display` impl renders each token back as the SQL text it was lexed from.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Single quoted string with c-style escapes: i.e: E'string'
    CstyleEscapesString(CstyleEscapedString),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Numbered parameter symbols: i.e: $1, $2
    Parameter(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator <=>
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    Concat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in postgresql)
    DoubleColon,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Prefix `^@`
    Prefix,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator
    Sharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular
    /// expression operator
    Tilde,
    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive LIKE expression operator in PostgreSQL
    DoubleTilde,
    /// `~~*` , a case insensitive ILIKE regular expression operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~` , a case sensitive NOT LIKE regular expression operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*` , a case insensitive NOT ILIKE regular expression operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator
    AtSign,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/` , a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `->`, access JSON object field or array element in PostgreSQL
    Arrow,
    /// `->>`, access JSON object field or array element as text in PostgreSQL
    LongArrow,
    /// `#>`, extract JSON sub-object at the specified path in PostgreSQL
    HashArrow,
    /// `#>>`, extract JSON sub-object at the specified path as text in PostgreSQL
    HashLongArrow,
    /// `#-`, delete a key from a JSON object in PostgreSQL
    HashMinus,
    /// `@>`, does the left JSON value contain the right JSON path/value entries at the top level
    AtArrow,
    /// `<@`, does the right JSON value contain the left JSON path/value entries at the top level
    ArrowAt,
    /// `?`, does the string exist as a top-level key within the JSON value
    QuestionMark,
    /// `?|`, do any of the strings exist as top-level keys or array elements?
    QuestionMarkPipe,
    /// `?&`, do all of the strings exist as top-level keys or array elements?
    QuestionMarkAmpersand,
    /// `@?`, does JSON path return any item for the specified JSON value?
    AtQuestionMark,
    /// `@@`, returns the result of a JSON path predicate check for the specified JSON value.
    AtAt,
}
184
185impl fmt::Display for Token {
186    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
187        match self {
188            Token::EOF => f.write_str("EOF"),
189            Token::Word(w) => write!(f, "{}", w),
190            Token::Number(n) => write!(f, "{}", n),
191            Token::Char(c) => write!(f, "{}", c),
192            Token::SingleQuotedString(s) => write!(f, "'{}'", s),
193            Token::DollarQuotedString(s) => write!(f, "{}", s),
194            Token::NationalStringLiteral(s) => write!(f, "N'{}'", s),
195            Token::HexStringLiteral(s) => write!(f, "X'{}'", s),
196            Token::CstyleEscapesString(s) => write!(f, "E'{}'", s),
197            Token::Parameter(s) => write!(f, "${}", s),
198            Token::Comma => f.write_str(","),
199            Token::Whitespace(ws) => write!(f, "{}", ws),
200            Token::DoubleEq => f.write_str("=="),
201            Token::Spaceship => f.write_str("<=>"),
202            Token::Eq => f.write_str("="),
203            Token::Neq => f.write_str("<>"),
204            Token::Lt => f.write_str("<"),
205            Token::Gt => f.write_str(">"),
206            Token::LtEq => f.write_str("<="),
207            Token::GtEq => f.write_str(">="),
208            Token::Plus => f.write_str("+"),
209            Token::Minus => f.write_str("-"),
210            Token::Mul => f.write_str("*"),
211            Token::Div => f.write_str("/"),
212            Token::Concat => f.write_str("||"),
213            Token::Mod => f.write_str("%"),
214            Token::LParen => f.write_str("("),
215            Token::RParen => f.write_str(")"),
216            Token::Period => f.write_str("."),
217            Token::Colon => f.write_str(":"),
218            Token::DoubleColon => f.write_str("::"),
219            Token::SemiColon => f.write_str(";"),
220            Token::Backslash => f.write_str("\\"),
221            Token::LBracket => f.write_str("["),
222            Token::RBracket => f.write_str("]"),
223            Token::Ampersand => f.write_str("&"),
224            Token::Caret => f.write_str("^"),
225            Token::Prefix => f.write_str("^@"),
226            Token::Pipe => f.write_str("|"),
227            Token::LBrace => f.write_str("{"),
228            Token::RBrace => f.write_str("}"),
229            Token::RArrow => f.write_str("=>"),
230            Token::Sharp => f.write_str("#"),
231            Token::ExclamationMark => f.write_str("!"),
232            Token::DoubleExclamationMark => f.write_str("!!"),
233            Token::Tilde => f.write_str("~"),
234            Token::TildeAsterisk => f.write_str("~*"),
235            Token::ExclamationMarkTilde => f.write_str("!~"),
236            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
237            Token::DoubleTilde => f.write_str("~~"),
238            Token::DoubleTildeAsterisk => f.write_str("~~*"),
239            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
240            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
241            Token::AtSign => f.write_str("@"),
242            Token::ShiftLeft => f.write_str("<<"),
243            Token::ShiftRight => f.write_str(">>"),
244            Token::PGSquareRoot => f.write_str("|/"),
245            Token::PGCubeRoot => f.write_str("||/"),
246            Token::Arrow => f.write_str("->"),
247            Token::LongArrow => f.write_str("->>"),
248            Token::HashArrow => f.write_str("#>"),
249            Token::HashLongArrow => f.write_str("#>>"),
250            Token::HashMinus => f.write_str("#-"),
251            Token::AtArrow => f.write_str("@>"),
252            Token::ArrowAt => f.write_str("<@"),
253            Token::QuestionMark => f.write_str("?"),
254            Token::QuestionMarkPipe => f.write_str("?|"),
255            Token::QuestionMarkAmpersand => f.write_str("?&"),
256            Token::AtQuestionMark => f.write_str("@?"),
257            Token::AtAt => f.write_str("@@"),
258        }
259    }
260}
261
262impl Token {
263    pub fn make_keyword(keyword: &str) -> Self {
264        Token::make_word(keyword, None)
265    }
266
267    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
268        let word_uppercase = word.to_uppercase();
269        Token::Word(Word {
270            value: word.to_owned(),
271            quote_style,
272            keyword: if quote_style.is_none() {
273                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
274                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
275            } else {
276                Keyword::NoKeyword
277            },
278        })
279    }
280
281    pub fn with_location(self, location: Location) -> TokenWithLocation {
282        TokenWithLocation::new(self, location.line, location.column)
283    }
284}
285
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this holds the corresponding [Keyword]; otherwise `Keyword::NoKeyword`.
    pub keyword: Keyword,
}
301
302impl fmt::Display for Word {
303    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
304        match self.quote_style {
305            Some(s) if s == '"' || s == '[' || s == '`' => {
306                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
307            }
308            None => f.write_str(&self.value),
309            _ => panic!("Unexpected quote_style!"),
310        }
311    }
312}
313
314impl Word {
315    fn matching_end_quote(ch: char) -> char {
316        match ch {
317            '"' => '"', // ANSI and most dialects
318            '[' => ']', // MS SQL
319            '`' => '`', // MySQL
320            _ => panic!("unexpected quoting style!"),
321        }
322    }
323}
324
/// A unit of whitespace or comment text encountered between tokens.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Whitespace {
    /// A single space character.
    Space,
    /// A line break; the tokenizer normalizes `\r` and `\r\n` to this too.
    Newline,
    /// A single tab character.
    Tab,
    /// A comment running to the end of the line; `prefix` is the introducing
    /// sequence (e.g. `--`) and `comment` the text after it.
    SingleLineComment { comment: String, prefix: String },
    /// The body of a `/* ... */` comment, without the delimiters.
    MultiLineComment(String),
}
334
335impl fmt::Display for Whitespace {
336    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
337        match self {
338            Whitespace::Space => f.write_str(" "),
339            Whitespace::Newline => f.write_str("\n"),
340            Whitespace::Tab => f.write_str("\t"),
341            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
342            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
343        }
344    }
345}
346
/// Location in input string
///
/// Both coordinates are 1-based; `TokenWithLocation::eof` uses the
/// out-of-band value (0, 0) for its synthetic EOF token.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Line column, starting from 1
    pub column: u64,
}
355
/// A [Token] with [Location] attached to it
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    /// The token itself.
    pub token: Token,
    /// Position of the token's first character in the input.
    pub location: Location,
}
362
363impl TokenWithLocation {
364    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
365        TokenWithLocation {
366            token,
367            location: Location { line, column },
368        }
369    }
370
371    pub fn eof() -> TokenWithLocation {
372        TokenWithLocation::new(Token::EOF, 0, 0)
373    }
374}
375
376impl PartialEq<Token> for TokenWithLocation {
377    fn eq(&self, other: &Token) -> bool {
378        &self.token == other
379    }
380}
381
382impl PartialEq<TokenWithLocation> for Token {
383    fn eq(&self, other: &TokenWithLocation) -> bool {
384        self == &other.token
385    }
386}
387
388impl fmt::Display for TokenWithLocation {
389    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
390        if self.token == Token::EOF {
391            write!(f, "end of input")
392        } else {
393            write!(
394                f,
395                "{} at line {}, column {}",
396                self.token, self.location.line, self.location.column
397            )
398        }
399    }
400}
401
/// Tokenizer error
#[derive(Debug, PartialEq)]
pub struct TokenizerError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// 1-based line where the error occurred.
    pub line: u64,
    /// 1-based column where the error occurred.
    pub col: u64,
    /// Additional context text, printed on its own line after the message.
    pub context: String,
}
410
411impl fmt::Display for TokenizerError {
412    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
413        write!(
414            f,
415            "{} at line {}, column {}\n{}",
416            self.message, self.line, self.col, self.context
417        )
418    }
419}
420
// The `Error` trait lives in `std`, so this impl only exists when the
// crate is built with the `std` feature enabled.
#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}
423
/// SQL Tokenizer
///
/// Walks the input string once via a peekable character cursor, tracking
/// the current line/column so each token can be stamped with its position.
pub struct Tokenizer<'a> {
    // The full query text. NOTE(review): appears unused within this chunk —
    // confirm against the rest of the file before considering removal.
    sql: &'a str,
    // Peekable cursor over the remaining characters of the query.
    chars: Peekable<Chars<'a>>,
    // Current line, 1-based; incremented on each '\n'.
    line: u64,
    // Current column, 1-based; a tab advances it by 4.
    col: u64,
}
431
432impl<'a> Tokenizer<'a> {
433    /// Create a new SQL tokenizer for the specified SQL statement
434    pub fn new(query: &'a str) -> Self {
435        Self {
436            sql: query,
437            chars: query.chars().peekable(),
438            line: 1,
439            col: 1,
440        }
441    }
442
443    /// Consume the next character.
444    fn next(&mut self) -> Option<char> {
445        let ch = self.chars.next();
446        if let Some(ch) = ch {
447            match ch {
448                '\n' => {
449                    self.line += 1;
450                    self.col = 1;
451                }
452                '\t' => self.col += 4,
453                _ => self.col += 1,
454            }
455        }
456        ch
457    }
458
459    /// Return the next character without consuming it.
460    fn peek(&mut self) -> Option<char> {
461        self.chars.peek().cloned()
462    }
463
464    /// Tokenize the statement and produce a vector of tokens with locations.
465    ///
466    /// Whitespaces are skipped.
467    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
468        let tokens = self.tokenize()?;
469        Ok(tokens
470            .into_iter()
471            .filter(|token| !matches!(&token.token, Token::Whitespace(_)))
472            .collect())
473    }
474
475    /// Tokenize the statement and produce a vector of tokens.
476    ///
477    /// Whitespaces are included.
478    #[allow(dead_code)]
479    fn tokenize_with_whitespace(&mut self) -> Result<Vec<Token>, TokenizerError> {
480        let tokens = self.tokenize()?;
481        Ok(tokens.into_iter().map(|t| t.token).collect())
482    }
483
484    /// Tokenize the statement and produce a vector of tokens.
485    ///
486    /// Whitespaces are included.
487    fn tokenize(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
488        let mut tokens = Vec::new();
489        while let Some(token) = self.next_token_with_location()? {
490            tokens.push(token);
491        }
492        Ok(tokens)
493    }
494
495    /// Get the next token or return None
496    fn next_token_with_location(&mut self) -> Result<Option<TokenWithLocation>, TokenizerError> {
497        let loc = Location {
498            line: self.line,
499            column: self.col,
500        };
501        self.next_token()
502            .map(|t| t.map(|token| token.with_location(loc)))
503    }
504
505    /// Get the next token or return None
506    fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
507        match self.peek() {
508            Some(ch) => match ch {
509                ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
510                '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
511                '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
512                '\r' => {
513                    // Emit a single Whitespace::Newline token for \r and \r\n
514                    self.next();
515                    if let Some('\n') = self.peek() {
516                        self.next();
517                    }
518                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
519                }
520                'N' => {
521                    self.next(); // consume, to check the next char
522                    match self.peek() {
523                        Some('\'') => {
524                            // N'...' - a <national character string literal>
525                            let s = self.tokenize_single_quoted_string()?;
526                            Ok(Some(Token::NationalStringLiteral(s)))
527                        }
528                        _ => {
529                            // regular identifier starting with an "N"
530                            let s = self.tokenize_word('N');
531                            Ok(Some(Token::make_word(&s, None)))
532                        }
533                    }
534                }
535                x @ 'e' | x @ 'E' => {
536                    self.next(); // consume, to check the next char
537                    match self.peek() {
538                        Some('\'') => {
539                            // E'...' - a <character string literal>
540                            let s = self.tokenize_single_quoted_string_with_escape()?;
541                            Ok(Some(Token::CstyleEscapesString(s)))
542                        }
543                        _ => {
544                            // regular identifier starting with an "E"
545                            let s = self.tokenize_word(x);
546                            Ok(Some(Token::make_word(&s, None)))
547                        }
548                    }
549                }
550                // The spec only allows an uppercase 'X' to introduce a hex
551                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
552                x @ 'x' | x @ 'X' => {
553                    self.next(); // consume, to check the next char
554                    match self.peek() {
555                        Some('\'') => {
556                            // X'...' - a <binary string literal>
557                            let s = self.tokenize_single_quoted_string()?;
558                            Ok(Some(Token::HexStringLiteral(s)))
559                        }
560                        _ => {
561                            // regular identifier starting with an "X"
562                            let s = self.tokenize_word(x);
563                            Ok(Some(Token::make_word(&s, None)))
564                        }
565                    }
566                }
567                // identifier or keyword
568                ch if is_identifier_start(ch) => {
569                    self.next(); // consume the first char
570                    let s = self.tokenize_word(ch);
571
572                    Ok(Some(Token::make_word(&s, None)))
573                }
574                // string
575                '\'' => {
576                    let s = self.tokenize_single_quoted_string()?;
577
578                    Ok(Some(Token::SingleQuotedString(s)))
579                }
580                // delimited (quoted) identifier
581                quote_start if is_delimited_identifier_start(quote_start) => {
582                    self.next(); // consume the opening quote
583                    let quote_end = Word::matching_end_quote(quote_start);
584                    let s = self.peeking_take_while(|ch| ch != quote_end);
585                    if self.next() == Some(quote_end) {
586                        Ok(Some(Token::make_word(&s, Some(quote_start))))
587                    } else {
588                        self.error(format!(
589                            "Expected close delimiter '{}' before EOF.",
590                            quote_end
591                        ))
592                    }
593                }
594                // numbers and period
595                '0'..='9' | '.' => {
596                    let mut s = self.peeking_take_while(|ch| ch.is_ascii_digit());
597
598                    // match binary literal that starts with 0x
599                    if s == "0"
600                        && let Some(radix) = self.peek()
601                        && "xob".contains(radix.to_ascii_lowercase())
602                    {
603                        self.next();
604                        let radix = radix.to_ascii_lowercase();
605                        let base = match radix {
606                            'x' => 16,
607                            'o' => 8,
608                            'b' => 2,
609                            _ => unreachable!(),
610                        };
611                        let s2 = self.peeking_take_while(|ch| ch.is_digit(base));
612                        if s2.is_empty() {
613                            return self.error("incomplete integer literal");
614                        }
615                        self.reject_number_junk()?;
616                        return Ok(Some(Token::Number(format!("0{radix}{s2}"))));
617                    }
618
619                    // match one period
620                    if let Some('.') = self.peek() {
621                        s.push('.');
622                        self.next();
623                    }
624                    s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
625
626                    // No number -> Token::Period
627                    if s == "." {
628                        return Ok(Some(Token::Period));
629                    }
630
631                    match self.peek() {
632                        // Number is a scientific number (1e6)
633                        Some('e') | Some('E') => {
634                            s.push('e');
635                            self.next();
636
637                            if let Some('-') = self.peek() {
638                                s.push('-');
639                                self.next();
640                            }
641                            s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
642                            self.reject_number_junk()?;
643                            return Ok(Some(Token::Number(s)));
644                        }
645                        // Not a scientific number
646                        _ => {}
647                    };
648                    self.reject_number_junk()?;
649                    Ok(Some(Token::Number(s)))
650                }
651                // punctuation
652                '(' => self.consume_and_return(Token::LParen),
653                ')' => self.consume_and_return(Token::RParen),
654                ',' => self.consume_and_return(Token::Comma),
655                // operators
656                '-' => {
657                    self.next(); // consume the '-'
658                    match self.peek() {
659                        Some('-') => {
660                            self.next(); // consume the second '-', starting a single-line comment
661                            let comment = self.tokenize_single_line_comment();
662                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
663                                prefix: "--".to_owned(),
664                                comment,
665                            })))
666                        }
667                        Some('>') => {
668                            self.next(); // consume first '>'
669                            match self.peek() {
670                                Some('>') => {
671                                    self.next(); // consume second '>'
672                                    Ok(Some(Token::LongArrow))
673                                }
674                                _ => Ok(Some(Token::Arrow)),
675                            }
676                        }
677                        // a regular '-' operator
678                        _ => Ok(Some(Token::Minus)),
679                    }
680                }
681                '/' => {
682                    self.next(); // consume the '/'
683                    match self.peek() {
684                        Some('*') => {
685                            self.next(); // consume the '*', starting a multi-line comment
686                            self.tokenize_multiline_comment()
687                        }
688                        // a regular '/' operator
689                        _ => Ok(Some(Token::Div)),
690                    }
691                }
692                '+' => self.consume_and_return(Token::Plus),
693                '*' => self.consume_and_return(Token::Mul),
694                '%' => self.consume_and_return(Token::Mod),
695                '|' => {
696                    self.next(); // consume the '|'
697                    match self.peek() {
698                        Some('/') => self.consume_and_return(Token::PGSquareRoot),
699                        Some('|') => {
700                            self.next(); // consume the second '|'
701                            match self.peek() {
702                                Some('/') => self.consume_and_return(Token::PGCubeRoot),
703                                _ => Ok(Some(Token::Concat)),
704                            }
705                        }
706                        // Bitshift '|' operator
707                        _ => Ok(Some(Token::Pipe)),
708                    }
709                }
710                '=' => {
711                    self.next(); // consume
712                    match self.peek() {
713                        Some('>') => self.consume_and_return(Token::RArrow),
714                        _ => Ok(Some(Token::Eq)),
715                    }
716                }
717                '!' => {
718                    self.next(); // consume
719                    match self.peek() {
720                        Some('=') => self.consume_and_return(Token::Neq),
721                        Some('!') => self.consume_and_return(Token::DoubleExclamationMark),
722                        Some('~') => {
723                            self.next();
724                            match self.peek() {
725                                Some('~') => {
726                                    self.next();
727                                    match self.peek() {
728                                        Some('*') => self.consume_and_return(
729                                            Token::ExclamationMarkDoubleTildeAsterisk,
730                                        ),
731                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
732                                    }
733                                }
734                                Some('*') => {
735                                    self.consume_and_return(Token::ExclamationMarkTildeAsterisk)
736                                }
737                                _ => Ok(Some(Token::ExclamationMarkTilde)),
738                            }
739                        }
740                        _ => Ok(Some(Token::ExclamationMark)),
741                    }
742                }
743                '<' => {
744                    self.next(); // consume
745                    match self.peek() {
746                        Some('=') => {
747                            self.next();
748                            match self.peek() {
749                                Some('>') => self.consume_and_return(Token::Spaceship),
750                                _ => Ok(Some(Token::LtEq)),
751                            }
752                        }
753                        Some('>') => self.consume_and_return(Token::Neq),
754                        Some('<') => self.consume_and_return(Token::ShiftLeft),
755                        Some('@') => self.consume_and_return(Token::ArrowAt),
756                        _ => Ok(Some(Token::Lt)),
757                    }
758                }
759                '>' => {
760                    self.next(); // consume
761                    match self.peek() {
762                        Some('=') => self.consume_and_return(Token::GtEq),
763                        Some('>') => self.consume_and_return(Token::ShiftRight),
764                        _ => Ok(Some(Token::Gt)),
765                    }
766                }
767                ':' => {
768                    self.next();
769                    match self.peek() {
770                        Some(':') => self.consume_and_return(Token::DoubleColon),
771                        _ => Ok(Some(Token::Colon)),
772                    }
773                }
774                '$' => Ok(Some(self.tokenize_dollar_preceded_value()?)),
775                ';' => self.consume_and_return(Token::SemiColon),
776                '\\' => self.consume_and_return(Token::Backslash),
777                '[' => self.consume_and_return(Token::LBracket),
778                ']' => self.consume_and_return(Token::RBracket),
779                '&' => self.consume_and_return(Token::Ampersand),
780                '^' => {
781                    self.next();
782                    match self.peek() {
783                        Some('@') => self.consume_and_return(Token::Prefix),
784                        _ => Ok(Some(Token::Caret)),
785                    }
786                }
787                '{' => self.consume_and_return(Token::LBrace),
788                '}' => self.consume_and_return(Token::RBrace),
789                '~' => {
790                    self.next(); // consume
791                    match self.peek() {
792                        Some('~') => {
793                            self.next();
794                            match self.peek() {
795                                Some('*') => self.consume_and_return(Token::DoubleTildeAsterisk),
796                                _ => Ok(Some(Token::DoubleTilde)),
797                            }
798                        }
799                        Some('*') => self.consume_and_return(Token::TildeAsterisk),
800                        _ => Ok(Some(Token::Tilde)),
801                    }
802                }
803                '#' => {
804                    self.next(); // consume the '#'
805                    match self.peek() {
806                        Some('-') => self.consume_and_return(Token::HashMinus),
807                        Some('>') => {
808                            self.next(); // consume first '>'
809                            match self.peek() {
810                                Some('>') => {
811                                    self.next(); // consume second '>'
812                                    Ok(Some(Token::HashLongArrow))
813                                }
814                                _ => Ok(Some(Token::HashArrow)),
815                            }
816                        }
817                        // a regular '#' operator
818                        _ => Ok(Some(Token::Sharp)),
819                    }
820                }
821                '@' => {
822                    self.next(); // consume the '@'
823                    match self.peek() {
824                        Some('>') => self.consume_and_return(Token::AtArrow),
825                        Some('?') => self.consume_and_return(Token::AtQuestionMark),
826                        Some('@') => self.consume_and_return(Token::AtAt),
827                        // a regular '@' operator
828                        _ => Ok(Some(Token::AtSign)),
829                    }
830                }
831                '?' => {
832                    self.next(); // consume the '?'
833                    match self.peek() {
834                        Some('|') => self.consume_and_return(Token::QuestionMarkPipe),
835                        Some('&') => self.consume_and_return(Token::QuestionMarkAmpersand),
836                        // a regular '?' operator
837                        _ => Ok(Some(Token::QuestionMark)),
838                    }
839                }
840                other => self.consume_and_return(Token::Char(other)),
841            },
842            None => Ok(None),
843        }
844    }
845
846    /// Tokenize dollar preceded value (i.e: a string/placeholder)
847    fn tokenize_dollar_preceded_value(&mut self) -> Result<Token, TokenizerError> {
848        let mut s = String::new();
849        let mut value = String::new();
850
851        self.next();
852
853        if let Some('$') = self.peek() {
854            self.next();
855
856            let mut is_terminated = false;
857            let mut prev: Option<char> = None;
858
859            while let Some(ch) = self.peek() {
860                if prev == Some('$') {
861                    if ch == '$' {
862                        self.next();
863                        is_terminated = true;
864                        break;
865                    } else {
866                        s.push('$');
867                        s.push(ch);
868                    }
869                } else if ch != '$' {
870                    s.push(ch);
871                }
872
873                prev = Some(ch);
874                self.next();
875            }
876
877            return if self.peek().is_none() && !is_terminated {
878                self.error("Unterminated dollar-quoted string")
879            } else {
880                Ok(Token::DollarQuotedString(DollarQuotedString {
881                    value: s,
882                    tag: None,
883                }))
884            };
885        } else {
886            value.push_str(&self.peeking_take_while(|ch| ch.is_alphanumeric() || ch == '_'));
887
888            if let Some('$') = self.peek() {
889                self.next();
890                s.push_str(&self.peeking_take_while(|ch| ch != '$'));
891
892                match self.peek() {
893                    Some('$') => {
894                        self.next();
895                        for c in value.chars() {
896                            let next_char = self.next();
897                            if Some(c) != next_char {
898                                return self.error(format!(
899                                    "Unterminated dollar-quoted string at or near \"{}\"",
900                                    value
901                                ));
902                            }
903                        }
904
905                        if let Some('$') = self.peek() {
906                            self.next();
907                        } else {
908                            return self.error("Unterminated dollar-quoted string, expected $");
909                        }
910                    }
911                    _ => {
912                        return self.error("Unterminated dollar-quoted, expected $");
913                    }
914                }
915            } else {
916                return Ok(Token::Parameter(value));
917            }
918        }
919
920        Ok(Token::DollarQuotedString(DollarQuotedString {
921            value: s,
922            tag: if value.is_empty() { None } else { Some(value) },
923        }))
924    }
925
926    fn error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
927        let prefix = format!("LINE {}: ", self.line);
928        let sql_line = self.sql.split('\n').nth(self.line as usize - 1).unwrap();
929        let cursor = " ".repeat(prefix.len() + self.col as usize - 1);
930        let context = format!("{}{}\n{}^", prefix, sql_line, cursor);
931        Err(TokenizerError {
932            message: message.into(),
933            col: self.col,
934            line: self.line,
935            context,
936        })
937    }
938
939    fn reject_number_junk(&mut self) -> Result<(), TokenizerError> {
940        if let Some(ch) = self.peek()
941            && is_identifier_start(ch)
942        {
943            return self.error("trailing junk after numeric literal");
944        }
945        Ok(())
946    }
947
948    // Consume characters until newline
949    fn tokenize_single_line_comment(&mut self) -> String {
950        let mut comment = self.peeking_take_while(|ch| ch != '\n');
951        if let Some(ch) = self.next() {
952            assert_eq!(ch, '\n');
953            comment.push(ch);
954        }
955        comment
956    }
957
958    /// Tokenize an identifier or keyword, after the first char is already consumed.
959    fn tokenize_word(&mut self, first_char: char) -> String {
960        let mut s = first_char.to_string();
961        s.push_str(&self.peeking_take_while(is_identifier_part));
962        s
963    }
964
965    /// Read a single quoted string, starting with the opening quote.
966    fn tokenize_single_quoted_string(&mut self) -> Result<String, TokenizerError> {
967        let mut s = String::new();
968        self.next(); // consume the opening quote
969
970        // slash escaping is specific to MySQL dialect
971        let mut is_escaped = false;
972        while let Some(ch) = self.peek() {
973            match ch {
974                '\'' => {
975                    self.next(); // consume
976                    if is_escaped {
977                        s.push(ch);
978                        is_escaped = false;
979                    } else if self.peek().map(|c| c == '\'').unwrap_or(false) {
980                        s.push(ch);
981                        self.next();
982                    } else {
983                        return Ok(s);
984                    }
985                }
986                '\\' => {
987                    s.push(ch);
988                    self.next();
989                }
990                _ => {
991                    self.next(); // consume
992                    s.push(ch);
993                }
994            }
995        }
996        self.error("Unterminated string literal")
997    }
998
    /// Read a single quoted string with c-style escapes (the body of an
    /// `E'...'` literal), starting at the opening quote.
    ///
    /// Collects the raw text (normalizing `''` into `\'` so only one escape
    /// style remains), then decodes it via [`Self::unescape_c_style`] and
    /// returns both the raw and the unescaped forms.
    fn tokenize_single_quoted_string_with_escape(
        &mut self,
    ) -> Result<CstyleEscapedString, TokenizerError> {
        let mut terminated = false;
        let mut s = String::new();
        self.next(); // consume the opening quote

        while let Some(ch) = self.peek() {
            match ch {
                '\'' => {
                    self.next(); // consume
                    if self.peek().map(|c| c == '\'').unwrap_or(false) {
                        // Doubled quote: rewrite as backslash-escape so the
                        // later unescape pass sees a uniform escape style.
                        s.push('\\');
                        s.push(ch);
                        self.next();
                    } else {
                        // Lone quote: end of the literal.
                        terminated = true;
                        break;
                    }
                }
                '\\' => {
                    s.push(ch);
                    self.next();
                    // Keep `\'` / `\\` pairs together so the escaped char is
                    // not re-interpreted as a terminator or a new escape on
                    // the next loop iteration.
                    if self.peek().map(|c| c == '\'' || c == '\\').unwrap_or(false) {
                        s.push(self.next().unwrap());
                    }
                }
                _ => {
                    self.next(); // consume
                    s.push(ch);
                }
            }
        }

        if !terminated {
            return self.error("Unterminated string literal");
        }

        // Decode escapes; any malformed sequence becomes a tokenizer error.
        let unescaped = match Self::unescape_c_style(&s) {
            Ok(unescaped) => unescaped,
            Err(e) => return self.error(e),
        };

        Ok(CstyleEscapedString {
            value: unescaped,
            raw: s,
        })
    }
1048
1049    /// Helper function used to convert string with c-style escapes into a normal string
1050    /// e.g. 'hello\x3fworld' -> 'hello?world'
1051    ///
1052    /// Detail of c-style escapes refer from:
1053    /// <https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE:~:text=4.1.2.2.%C2%A0String%20Constants%20With%20C%2DStyle%20Escapes>
1054    fn unescape_c_style(s: &str) -> Result<String, String> {
1055        fn hex_byte_process(
1056            chars: &mut Peekable<Chars<'_>>,
1057            res: &mut String,
1058            len: usize,
1059            default_char: char,
1060        ) -> Result<(), String> {
1061            let mut unicode_seq: String = String::with_capacity(len);
1062            for _ in 0..len {
1063                if let Some(c) = chars.peek()
1064                    && c.is_ascii_hexdigit()
1065                {
1066                    unicode_seq.push(chars.next().unwrap());
1067                } else {
1068                    break;
1069                }
1070            }
1071
1072            if unicode_seq.is_empty() && len == 2 {
1073                res.push(default_char);
1074                return Ok(());
1075            } else if unicode_seq.len() < len && len != 2 {
1076                return Err("invalid unicode sequence: must be \\uXXXX or \\UXXXXXXXX".to_owned());
1077            }
1078
1079            if len == 2 {
1080                let number = [u8::from_str_radix(&unicode_seq, 16)
1081                    .map_err(|e| format!("invalid unicode sequence: {}", e))?];
1082
1083                res.push(
1084                    std::str::from_utf8(&number)
1085                        .map_err(|err| format!("invalid unicode sequence: {}", err))?
1086                        .chars()
1087                        .next()
1088                        .unwrap(),
1089                );
1090            } else {
1091                let number = u32::from_str_radix(&unicode_seq, 16)
1092                    .map_err(|e| format!("invalid unicode sequence: {}", e))?;
1093                res.push(
1094                    char::from_u32(number)
1095                        .ok_or_else(|| format!("invalid unicode sequence: {}", unicode_seq))?,
1096                );
1097            }
1098            Ok(())
1099        }
1100
1101        fn octal_byte_process(
1102            chars: &mut Peekable<Chars<'_>>,
1103            res: &mut String,
1104            digit: char,
1105        ) -> Result<(), String> {
1106            let mut unicode_seq: String = String::with_capacity(3);
1107            unicode_seq.push(digit);
1108            for _ in 0..2 {
1109                if let Some(c) = chars.peek()
1110                    && matches!(*c, '0'..='7')
1111                {
1112                    unicode_seq.push(chars.next().unwrap());
1113                } else {
1114                    break;
1115                }
1116            }
1117
1118            let number = [u8::from_str_radix(&unicode_seq, 8)
1119                .map_err(|e| format!("invalid unicode sequence: {}", e))?];
1120
1121            res.push(
1122                std::str::from_utf8(&number)
1123                    .map_err(|err| format!("invalid unicode sequence: {}", err))?
1124                    .chars()
1125                    .next()
1126                    .unwrap(),
1127            );
1128            Ok(())
1129        }
1130
1131        let mut chars = s.chars().peekable();
1132        let mut res = String::with_capacity(s.len());
1133
1134        while let Some(c) = chars.next() {
1135            if c == '\\' {
1136                match chars.next() {
1137                    None => {
1138                        return Err("unterminated escape sequence".to_owned());
1139                    }
1140                    Some(next_c) => match next_c {
1141                        'b' => res.push('\u{08}'),
1142                        'f' => res.push('\u{0C}'),
1143                        'n' => res.push('\n'),
1144                        'r' => res.push('\r'),
1145                        't' => res.push('\t'),
1146                        'x' => hex_byte_process(&mut chars, &mut res, 2, 'x')?,
1147                        'u' => hex_byte_process(&mut chars, &mut res, 4, 'u')?,
1148                        'U' => hex_byte_process(&mut chars, &mut res, 8, 'U')?,
1149                        digit @ '0'..='7' => octal_byte_process(&mut chars, &mut res, digit)?,
1150                        _ => res.push(next_c),
1151                    },
1152                }
1153            } else {
1154                res.push(c);
1155            }
1156        }
1157
1158        Ok(res)
1159    }
1160
1161    fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
1162        let mut s = String::new();
1163
1164        let mut nested = 1;
1165        let mut last_ch = ' ';
1166
1167        loop {
1168            match self.next() {
1169                Some(ch) => {
1170                    if last_ch == '/' && ch == '*' {
1171                        nested += 1;
1172                    } else if last_ch == '*' && ch == '/' {
1173                        nested -= 1;
1174                        if nested == 0 {
1175                            s.pop();
1176                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
1177                        }
1178                    }
1179                    s.push(ch);
1180                    last_ch = ch;
1181                }
1182                None => break self.error("Unexpected EOF while in a multi-line comment"),
1183            }
1184        }
1185    }
1186
    /// Consume one character from the input and return `t` as the token for it.
    /// Convenience used by the operator-tokenizing match arms.
    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
        self.next();
        Ok(Some(t))
    }
1192
1193    /// Read from `self` until `predicate` returns `false` or EOF is hit.
1194    /// Return the characters read as String, and keep the first non-matching
1195    /// char available as `self.next()`.
1196    fn peeking_take_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> String {
1197        let mut s = String::new();
1198        while let Some(ch) = self.peek() {
1199            if predicate(ch) {
1200                self.next(); // consume
1201                s.push(ch);
1202            } else {
1203                break;
1204            }
1205        }
1206        s
1207    }
1208}
1209
/// Determine if a character starts a quoted identifier. The default
/// implementation, accepting "double quoted" ids is both ANSI-compliant
/// and appropriate for most dialects (with the notable exception of
/// MySQL, MS SQL, and sqlite). You can accept one of characters listed
/// in `Word::matching_end_quote` here
fn is_delimited_identifier_start(ch: char) -> bool {
    matches!(ch, '"')
}
1218
/// Determine if a character is a valid start character for an unquoted identifier
fn is_identifier_start(ch: char) -> bool {
    // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
    // We don't yet support identifiers beginning with "letters with
    // diacritical marks and non-Latin letters"
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
}
1226
/// Determine if a character is a valid unquoted identifier character
fn is_identifier_part(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '$' | '_')
}
1231
1232#[cfg(test)]
1233mod tests {
1234    use super::*;
1235
    // Verify TokenizerError's Display output and (with std) its Error impl.
    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            line: 1,
            col: 1,
            context: "LINE 1:".to_owned(),
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at line 1, column 1\nLINE 1:");
    }
1251
    // A minimal query yields keyword, whitespace, and number tokens.
    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }
1266
    // A leading-dot float literal (.1) is tokenized as a single Number.
    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1")),
        ];

        compare(expected, tokens);
    }
1281
    // Function-call syntax: identifier word, parens, and the argument inside.
    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1")),
            Token::RParen,
        ];

        compare(expected, tokens);
    }
1299
    // `||` between string literals becomes the Concat token.
    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::Concat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }
1318
    // Bitwise operators `|` and `^` tokenize as Pipe and Caret.
    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }
1340
    // XOR is recognized as a keyword between boolean literals.
    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }
1380
    // Full SELECT/FROM/WHERE/LIMIT statement tokenizes in order with whitespace.
    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5")),
        ];

        compare(expected, tokens);
    }
1411
    // EXPLAIN prefix is tokenized as a keyword before the query body.
    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }
1440
    // EXPLAIN ANALYZE prefix: both words are keyword tokens.
    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }
1471
    // `!=` tokenizes as Neq; a quoted string keeps its inner spaces.
    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }
1498
    // Non-ASCII letters are not identifier chars here; each is emitted as a
    // Char token, while the trailing ASCII 'h' still forms a word.
    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\nمصطفىh");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }
1516
    // \r\n and \n are preserved verbatim inside a single-quoted literal.
    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_owned())];
        compare(expected, tokens);
    }
1525
    // A missing closing quote produces an error carrying line/col and a
    // caret-annotated context line.
    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");
        let mut tokenizer = Tokenizer::new(&sql);
        assert_eq!(
            tokenizer.tokenize_with_whitespace(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_owned(),
                line: 1,
                col: 12,
                context: "LINE 1: select 'foo\n                   ^".to_owned(),
            })
        );
    }
1540
    // Newlines and tabs tokenize as whitespace; non-ASCII letters after the
    // tab become individual Char tokens (column tracking across lines).
    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }
1567
    // `=>` tokenizes as the RArrow token (named-argument syntax).
    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }
1583
1584    #[test]
1585    fn tokenize_is_null() {
1586        let sql = String::from("a IS NULL");
1587        let mut tokenizer = Tokenizer::new(&sql);
1588        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1589
1590        let expected = vec![
1591            Token::make_word("a", None),
1592            Token::Whitespace(Whitespace::Space),
1593            Token::make_keyword("IS"),
1594            Token::Whitespace(Whitespace::Space),
1595            Token::make_keyword("NULL"),
1596        ];
1597
1598        compare(expected, tokens);
1599    }
1600
1601    #[test]
1602    fn tokenize_comment() {
1603        let sql = String::from("0--this is a comment\n1");
1604        let mut tokenizer = Tokenizer::new(&sql);
1605        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1606        let expected = vec![
1607            Token::Number("0".to_owned()),
1608            Token::Whitespace(Whitespace::SingleLineComment {
1609                prefix: "--".to_owned(),
1610                comment: "this is a comment\n".to_owned(),
1611            }),
1612            Token::Number("1".to_owned()),
1613        ];
1614        compare(expected, tokens);
1615    }
1616
1617    #[test]
1618    fn tokenize_comment_at_eof() {
1619        let sql = String::from("--this is a comment");
1620        let mut tokenizer = Tokenizer::new(&sql);
1621        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1622        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
1623            prefix: "--".to_owned(),
1624            comment: "this is a comment".to_owned(),
1625        })];
1626        compare(expected, tokens);
1627    }
1628
1629    #[test]
1630    fn tokenize_multiline_comment() {
1631        let sql = String::from("0/*multi-line\n* /comment*/1");
1632        let mut tokenizer = Tokenizer::new(&sql);
1633        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1634        let expected = vec![
1635            Token::Number("0".to_owned()),
1636            Token::Whitespace(Whitespace::MultiLineComment(
1637                "multi-line\n* /comment".to_owned(),
1638            )),
1639            Token::Number("1".to_owned()),
1640        ];
1641        compare(expected, tokens);
1642    }
1643
1644    #[test]
1645    fn tokenize_nested_multiline_comment() {
1646        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
1647        let mut tokenizer = Tokenizer::new(&sql);
1648        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1649        let expected = vec![
1650            Token::Number("0".to_owned()),
1651            Token::Whitespace(Whitespace::MultiLineComment(
1652                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_owned(),
1653            )),
1654            Token::Number("1".to_owned()),
1655        ];
1656        compare(expected, tokens);
1657    }
1658
1659    #[test]
1660    fn tokenize_multiline_comment_with_even_asterisks() {
1661        let sql = String::from("\n/** Comment **/\n");
1662        let mut tokenizer = Tokenizer::new(&sql);
1663        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1664        let expected = vec![
1665            Token::Whitespace(Whitespace::Newline),
1666            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_owned())),
1667            Token::Whitespace(Whitespace::Newline),
1668        ];
1669        compare(expected, tokens);
1670    }
1671
1672    #[test]
1673    fn tokenize_mismatched_quotes() {
1674        let sql = String::from("\"foo");
1675        let mut tokenizer = Tokenizer::new(&sql);
1676        assert_eq!(
1677            tokenizer.tokenize_with_whitespace(),
1678            Err(TokenizerError {
1679                message: "Expected close delimiter '\"' before EOF.".to_owned(),
1680                line: 1,
1681                col: 5,
1682                context: "LINE 1: \"foo\n            ^".to_owned(),
1683            })
1684        );
1685    }
1686
1687    #[test]
1688    fn tokenize_newlines() {
1689        let sql = String::from("line1\nline2\rline3\r\nline4\r");
1690        let mut tokenizer = Tokenizer::new(&sql);
1691        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1692        let expected = vec![
1693            Token::make_word("line1", None),
1694            Token::Whitespace(Whitespace::Newline),
1695            Token::make_word("line2", None),
1696            Token::Whitespace(Whitespace::Newline),
1697            Token::make_word("line3", None),
1698            Token::Whitespace(Whitespace::Newline),
1699            Token::make_word("line4", None),
1700            Token::Whitespace(Whitespace::Newline),
1701        ];
1702        compare(expected, tokens);
1703    }
1704
1705    #[test]
1706    fn tokenize_pg_regex_match() {
1707        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
1708        let mut tokenizer = Tokenizer::new(sql);
1709        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1710        let expected = vec![
1711            Token::make_keyword("SELECT"),
1712            Token::Whitespace(Whitespace::Space),
1713            Token::make_word("col", None),
1714            Token::Whitespace(Whitespace::Space),
1715            Token::Tilde,
1716            Token::Whitespace(Whitespace::Space),
1717            Token::SingleQuotedString("^a".into()),
1718            Token::Comma,
1719            Token::Whitespace(Whitespace::Space),
1720            Token::make_word("col", None),
1721            Token::Whitespace(Whitespace::Space),
1722            Token::TildeAsterisk,
1723            Token::Whitespace(Whitespace::Space),
1724            Token::SingleQuotedString("^a".into()),
1725            Token::Comma,
1726            Token::Whitespace(Whitespace::Space),
1727            Token::make_word("col", None),
1728            Token::Whitespace(Whitespace::Space),
1729            Token::ExclamationMarkTilde,
1730            Token::Whitespace(Whitespace::Space),
1731            Token::SingleQuotedString("^a".into()),
1732            Token::Comma,
1733            Token::Whitespace(Whitespace::Space),
1734            Token::make_word("col", None),
1735            Token::Whitespace(Whitespace::Space),
1736            Token::ExclamationMarkTildeAsterisk,
1737            Token::Whitespace(Whitespace::Space),
1738            Token::SingleQuotedString("^a".into()),
1739        ];
1740        compare(expected, tokens);
1741    }
1742
1743    #[test]
1744    fn tokenize_select_array() {
1745        let sql = String::from("SELECT '{1, 2, 3}'");
1746        let mut tokenizer = Tokenizer::new(&sql);
1747        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1748
1749        let expected = vec![
1750            Token::make_keyword("SELECT"),
1751            Token::Whitespace(Whitespace::Space),
1752            Token::SingleQuotedString(String::from("{1, 2, 3}")),
1753        ];
1754
1755        compare(expected, tokens);
1756    }
1757
1758    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
1759        // println!("------------------------------");
1760        // println!("tokens   = {:?}", actual);
1761        // println!("expected = {:?}", expected);
1762        // println!("------------------------------");
1763        assert_eq!(expected, actual);
1764    }
1765}