use std::fmt;
use std::fmt::Debug;
use std::iter::Peekable;
use std::str::Chars;

use crate::ast::{CstyleEscapedString, DollarQuotedString};
use crate::keywords::{ALL_KEYWORDS, ALL_KEYWORDS_INDEX, Keyword};
/// A token produced by the SQL tokenizer.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag$string$tag$
    DollarQuotedString(DollarQuotedString),
    /// Single quoted string with C-style escapes: i.e: E'string'
    CstyleEscapesString(CstyleEscapedString),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e: X'deadbeef'
    HexStringLiteral(String),
    /// Parameter symbol: i.e: $1, $param
    Parameter(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc.)
    Whitespace(Whitespace),
    /// A custom operator, such as `~*`
    Op(String),
    /// Equality operator `=`
    Eq,
    /// Not-equals operator `<>` (or `!=`)
    Neq,
    /// Less-than operator `<`
    Lt,
    /// Greater-than operator `>`
    Gt,
    /// Less-than-or-equals operator `<=`
    LtEq,
    /// Greater-than-or-equals operator `>=`
    GtEq,
    Plus,
    Minus,
    Mul,
    Div,
    Mod,
    LParen,
    RParen,
    /// Period `.`, used for compound identifiers
    Period,
    Colon,
    /// Double colon `::`, used for casts
    DoubleColon,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    /// Right arrow `=>`, used for named arguments
    RArrow,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(w) => write!(f, "{}", w),
            Token::Number(n) => write!(f, "{}", n),
            Token::Char(c) => write!(f, "{}", c),
            Token::SingleQuotedString(s) => write!(f, "'{}'", s),
            Token::DollarQuotedString(s) => write!(f, "{}", s),
            Token::NationalStringLiteral(s) => write!(f, "N'{}'", s),
            Token::HexStringLiteral(s) => write!(f, "X'{}'", s),
            Token::CstyleEscapesString(s) => write!(f, "E'{}'", s),
            Token::Parameter(s) => write!(f, "${}", s),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{}", ws),
            Token::Op(op) => write!(f, "{}", op),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_owned(),
            quote_style,
            // Only an unquoted word can be a keyword; resolve it by binary
            // search in the sorted keyword table.
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }

    pub fn with_location(self, location: Location) -> TokenWithLocation {
        TokenWithLocation::new(self, location.line, location.column)
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Word {
    /// The value of the token, without the enclosing quotes.
    pub value: String,
    /// The enclosing quote character, if the identifier was quoted
    /// (a delimited identifier in ANSI parlance).
    pub quote_style: Option<char>,
    /// The keyword that an unquoted word matched, or `Keyword::NoKeyword`.
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
        }
    }
}

#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Location {
    /// Line number, starting from 1.
    pub line: u64,
    /// Column number, starting from 1.
    pub column: u64,
}

#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    pub token: Token,
    pub location: Location,
}

impl TokenWithLocation {
    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
        TokenWithLocation {
            token,
            location: Location { line, column },
        }
    }

    pub fn eof() -> TokenWithLocation {
        TokenWithLocation::new(Token::EOF, 0, 0)
    }
}

impl PartialEq<Token> for TokenWithLocation {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithLocation> for Token {
    fn eq(&self, other: &TokenWithLocation) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithLocation {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.token == Token::EOF {
            write!(f, "end of input")
        } else {
            write!(
                f,
                "{} at line {}, column {}",
                self.token, self.location.line, self.location.column
            )
        }
    }
}

#[derive(Debug, PartialEq)]
pub struct TokenizerError {
    pub message: String,
    pub line: u64,
    pub col: u64,
    pub context: String,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{} at line {}, column {}\n{}",
            self.message, self.line, self.col, self.context
        )
    }
}

impl std::error::Error for TokenizerError {}

/// SQL Tokenizer
#[derive(Clone)]
pub struct Tokenizer<'a> {
    sql: &'a str,
    chars: Peekable<Chars<'a>>,
    line: u64,
    col: u64,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement.
    pub fn new(query: &'a str) -> Self {
        Self {
            sql: query,
            chars: query.chars().peekable(),
            line: 1,
            col: 1,
        }
    }

    /// Consume the next character, keeping the line/column position up to date.
    fn next(&mut self) -> Option<char> {
        let ch = self.chars.next();
        if let Some(ch) = ch {
            match ch {
                '\n' => {
                    self.line += 1;
                    self.col = 1;
                }
                // Count a tab as four columns for error reporting.
                '\t' => self.col += 4,
                _ => self.col += 1,
            }
        }
        ch
    }

    /// Peek at the next character without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().cloned()
    }

    /// Tokenize the statement and produce a vector of tokens with locations,
    /// with whitespace tokens filtered out.
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
        let tokens = self.tokenize()?;
        Ok(tokens
            .into_iter()
            .filter(|token| !matches!(&token.token, Token::Whitespace(_)))
            .collect())
    }

    /// Tokenize the statement and produce a vector of tokens, whitespace
    /// included. Kept for tests.
    #[allow(dead_code)]
    fn tokenize_with_whitespace(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let tokens = self.tokenize()?;
        Ok(tokens.into_iter().map(|t| t.token).collect())
    }

    /// Tokenize the statement and produce a vector of tokens with locations,
    /// whitespace included.
    fn tokenize(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
        let mut tokens = Vec::new();
        while let Some(token) = self.next_token_with_location()? {
            tokens.push(token);
        }
        Ok(tokens)
    }

    /// Get the next token, attaching the location at which it starts.
    fn next_token_with_location(&mut self) -> Result<Option<TokenWithLocation>, TokenizerError> {
        let loc = Location {
            line: self.line,
            column: self.col,
        };
        self.next_token()
            .map(|t| t.map(|token| token.with_location(loc)))
    }

    /// Get the next token, or return None when the end of input is reached.
    fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        // All characters that may make up a (possibly multi-character) operator.
        macro_rules! op_chars {
            (all as_pat) => {
                '+' | '-' | '*' | '/' | '<' | '>' | '=' | op_chars!(ext as_pat)
            };
            (ext $m:ident) => {
                op_chars!($m '~' '!' '@' '#' '%' '^' '&' '|' '`' '?')
            };
            (as_arr $($c:literal)+) => {
                [ $($c),+ ]
            };
            (as_pat $($c:literal)+) => {
                $($c)|+
            };
        }

        match self.peek() {
            Some(ch) => match ch {
                ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    self.next();
                    // Emit a single Newline token for \r\n.
                    if let Some('\n') = self.peek() {
                        self.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                'N' => {
                    self.next(); // consume, to check the next char
                    match self.peek() {
                        Some('\'') => {
                            // N'...' - a national character string literal
                            let s = self.tokenize_single_quoted_string()?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word('N');
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'e' | x @ 'E' => {
                    self.next(); // consume, to check the next char
                    match self.peek() {
                        Some('\'') => {
                            // E'...' - a string literal with C-style escapes
                            let s = self.tokenize_single_quoted_string_with_escape()?;
                            Ok(Some(Token::CstyleEscapesString(s)))
                        }
                        _ => {
                            // regular identifier starting with an "e" or "E"
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'x' | x @ 'X' => {
                    self.next(); // consume, to check the next char
                    match self.peek() {
                        Some('\'') => {
                            // X'...' - a hexadecimal character string literal
                            let s = self.tokenize_single_quoted_string()?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "x" or "X"
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // identifier or keyword
                ch if is_identifier_start(ch) => {
                    self.next(); // consume the first char
                    let s = self.tokenize_word(ch);

                    Ok(Some(Token::make_word(&s, None)))
                }
                // string
                '\'' => {
                    let s = self.tokenize_single_quoted_string()?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if is_delimited_identifier_start(quote_start) => {
                    self.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let s = self.peeking_take_while(|ch| ch != quote_end);
                    if self.next() == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.error(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        ))
                    }
                }
                // number or period
                '0'..='9' | '.' => {
                    let mut s = self.peeking_take_while(|ch| ch.is_ascii_digit());

                    // 0x/0o/0b radix-prefixed integer literals, e.g. 0xff
                    if s == "0"
                        && let Some(radix) = self.peek()
                        && "xob".contains(radix.to_ascii_lowercase())
                    {
                        self.next();
                        let radix = radix.to_ascii_lowercase();
                        let base = match radix {
                            'x' => 16,
                            'o' => 8,
                            'b' => 2,
                            _ => unreachable!(),
                        };
                        let s2 = self.peeking_take_while(|ch| ch.is_digit(base));
                        if s2.is_empty() {
                            return self.error("incomplete integer literal");
                        }
                        self.reject_number_junk()?;
                        return Ok(Some(Token::Number(format!("0{radix}{s2}"))));
                    }

                    // match one period
                    if let Some('.') = self.peek() {
                        s.push('.');
                        self.next();
                    }
                    s += &self.peeking_take_while(|ch| ch.is_ascii_digit());

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    match self.peek() {
                        // Number is a scientific number (1e6)
                        Some('e') | Some('E') => {
                            s.push('e');
                            self.next();

                            if let Some('-') = self.peek() {
                                s.push('-');
                                self.next();
                            }
                            s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
                            self.reject_number_junk()?;
                            return Ok(Some(Token::Number(s)));
                        }
                        _ => {}
                    };
                    self.reject_number_junk()?;
                    Ok(Some(Token::Number(s)))
                }
                // punctuation
                '(' => self.consume_and_return(Token::LParen),
                ')' => self.consume_and_return(Token::RParen),
                ',' => self.consume_and_return(Token::Comma),
                ':' => {
                    self.next();
                    match self.peek() {
                        Some(':') => self.consume_and_return(Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value()?)),
                ';' => self.consume_and_return(Token::SemiColon),
                '\\' => self.consume_and_return(Token::Backslash),
                '[' => self.consume_and_return(Token::LBracket),
                ']' => self.consume_and_return(Token::RBracket),
                '{' => self.consume_and_return(Token::LBrace),
                '}' => self.consume_and_return(Token::RBrace),
                // operators
                op_chars!(all as_pat) => {
                    // Greedily take the longest run of operator characters on a
                    // cloned tokenizer, so we can backtrack afterwards.
                    let mut trial = self.clone();
                    let op_taken = trial.peeking_take_while(|c| matches!(c, op_chars!(all as_pat)));
                    // A comment marker (`/*` or `--`) ends the operator.
                    let slash_star = op_taken.find("/*");
                    let dash_dash = op_taken.find("--");
                    let pos = match (slash_star, dash_dash) {
                        (Some(s), Some(d)) => s.min(d),
                        (Some(s), None) => s,
                        (None, Some(d)) => d,
                        (None, None) => op_taken.len(),
                    };
                    let mut op = &op_taken[..pos];
                    if op.is_empty() {
                        // The token starts with a comment marker.
                        match self.next() {
                            Some('-') => {
                                self.next(); // consume the second '-'
                                let comment = self.tokenize_single_line_comment();

                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }
                            Some('/') => {
                                self.next(); // consume the '*'
                                return self.tokenize_multiline_comment();
                            }
                            _ => unreachable!(),
                        }
                    };
                    // Mirroring PostgreSQL's lexer rule: a multi-character
                    // operator may only end in `+` or `-` if it also contains
                    // one of the extended characters, so trim trailing `+`/`-`
                    // otherwise (e.g. `1+-2` lexes as `1 + (-2)`).
                    if op.len() > 1
                        && op.ends_with(['+', '-'])
                        && !op.contains(op_chars!(ext as_arr))
                    {
                        op = op.trim_end_matches(['+', '-']);
                        if op.is_empty() {
                            op = &op_taken[..1];
                        }
                    }
                    if op.len() == op_taken.len() {
                        // The whole run was kept: adopt the trial tokenizer.
                        *self = trial;
                    } else {
                        // Otherwise advance only past the characters we kept.
                        for _ in op.chars() {
                            self.next();
                        }
                    }
                    match op {
                        "+" => Ok(Some(Token::Plus)),
                        "-" => Ok(Some(Token::Minus)),
                        "*" => Ok(Some(Token::Mul)),
                        "/" => Ok(Some(Token::Div)),
                        "%" => Ok(Some(Token::Mod)),
                        "^" => Ok(Some(Token::Caret)),
                        "<" => Ok(Some(Token::Lt)),
                        ">" => Ok(Some(Token::Gt)),
                        "=" => Ok(Some(Token::Eq)),
                        "=>" => Ok(Some(Token::RArrow)),
                        "<=" => Ok(Some(Token::LtEq)),
                        ">=" => Ok(Some(Token::GtEq)),
                        "<>" => Ok(Some(Token::Neq)),
                        "!=" => Ok(Some(Token::Neq)),
                        "|" => Ok(Some(Token::Pipe)),
                        _ => Ok(Some(Token::Op(op.to_owned()))),
                    }
                }
                other => self.consume_and_return(Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Tokenize a dollar-preceded value: either a dollar-quoted string
    /// (`$$...$$` or `$tag$...$tag$`) or a parameter (`$1`).
    fn tokenize_dollar_preceded_value(&mut self) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        self.next(); // consume the initial '$'

        if let Some('$') = self.peek() {
            // Anonymous dollar-quoted string: $$...$$
            self.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(ch) = self.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        self.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                self.next();
            }

            return if self.peek().is_none() && !is_terminated {
                self.error("Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Tag or parameter name.
            value.push_str(&self.peeking_take_while(|ch| ch.is_alphanumeric() || ch == '_'));

            if let Some('$') = self.peek() {
                // Tagged dollar-quoted string: $tag$...$tag$
                self.next();
                s.push_str(&self.peeking_take_while(|ch| ch != '$'));

                match self.peek() {
                    Some('$') => {
                        self.next();
                        // The closing tag must match the opening one.
                        for c in value.chars() {
                            let next_char = self.next();
                            if Some(c) != next_char {
                                return self.error(format!(
                                    "Unterminated dollar-quoted string at or near \"{}\"",
                                    value
                                ));
                            }
                        }

                        if let Some('$') = self.peek() {
                            self.next();
                        } else {
                            return self.error("Unterminated dollar-quoted string, expected $");
                        }
                    }
                    _ => {
                        return self.error("Unterminated dollar-quoted, expected $");
                    }
                }
            } else {
                // No second '$': this is a parameter like $1.
                return Ok(Token::Parameter(value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
        let prefix = format!("LINE {}: ", self.line);
        let sql_line = self.sql.split('\n').nth(self.line as usize - 1).unwrap();
        let cursor = " ".repeat(prefix.len() + self.col as usize - 1);
        let context = format!("{}{}\n{}^", prefix, sql_line, cursor);
        Err(TokenizerError {
            message: message.into(),
            col: self.col,
            line: self.line,
            context,
        })
    }

    fn reject_number_junk(&mut self) -> Result<(), TokenizerError> {
        if let Some(ch) = self.peek()
            && is_identifier_start(ch)
        {
            return self.error("trailing junk after numeric literal");
        }
        Ok(())
    }

    /// Tokenize a single-line comment, up to and including the newline.
    fn tokenize_single_line_comment(&mut self) -> String {
        let mut comment = self.peeking_take_while(|ch| ch != '\n');
        if let Some(ch) = self.next() {
            assert_eq!(ch, '\n');
            comment.push(ch);
        }
        comment
    }

    /// Tokenize an identifier or keyword, after the first char is consumed.
    fn tokenize_word(&mut self, first_char: char) -> String {
        let mut s = first_char.to_string();
        s.push_str(&self.peeking_take_while(is_identifier_part));
        s
    }

    /// Read a single-quoted string; expects the opening quote to be the next
    /// character.
    fn tokenize_single_quoted_string(&mut self) -> Result<String, TokenizerError> {
        let mut s = String::new();
        self.next(); // consume the opening quote

        let mut is_escaped = false;
        while let Some(ch) = self.peek() {
            match ch {
                '\'' => {
                    self.next(); // consume
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if self.peek().map(|c| c == '\'').unwrap_or(false) {
                        // An escaped quote: `''`
                        s.push(ch);
                        self.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    s.push(ch);
                    self.next();
                }
                _ => {
                    self.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.error("Unterminated string literal")
    }

    /// Read a single-quoted string with C-style escapes; expects the opening
    /// quote to be the next character.
    fn tokenize_single_quoted_string_with_escape(
        &mut self,
    ) -> Result<CstyleEscapedString, TokenizerError> {
        let mut terminated = false;
        let mut s = String::new();
        self.next(); // consume the opening quote

        while let Some(ch) = self.peek() {
            match ch {
                '\'' => {
                    self.next(); // consume
                    if self.peek().map(|c| c == '\'').unwrap_or(false) {
                        // An escaped quote `''`; normalize to `\'` in the raw form.
                        s.push('\\');
                        s.push(ch);
                        self.next();
                    } else {
                        terminated = true;
                        break;
                    }
                }
                '\\' => {
                    s.push(ch);
                    self.next();
                    // Keep `\'` and `\\` together so the following quote or
                    // backslash is not misread on the next iteration.
                    if self.peek().map(|c| c == '\'' || c == '\\').unwrap_or(false) {
                        s.push(self.next().unwrap());
                    }
                }
                _ => {
                    self.next(); // consume
                    s.push(ch);
                }
            }
        }

        if !terminated {
            return self.error("Unterminated string literal");
        }

        let unescaped = match Self::unescape_c_style(&s) {
            Ok(unescaped) => unescaped,
            Err(e) => return self.error(e),
        };

        Ok(CstyleEscapedString {
            value: unescaped,
            raw: s,
        })
    }

    /// Helper function used to unescape a string with C-style escape
    /// sequences, as in PostgreSQL's `E'...'` literals.
    fn unescape_c_style(s: &str) -> Result<String, String> {
        // Handle `\xHH`, `\uXXXX` and `\UXXXXXXXX` sequences; `len` is the
        // maximum number of hex digits, and `default_char` is pushed verbatim
        // when `\x` is followed by no hex digit at all.
        fn hex_byte_process(
            chars: &mut Peekable<Chars<'_>>,
            res: &mut String,
            len: usize,
            default_char: char,
        ) -> Result<(), String> {
            let mut unicode_seq: String = String::with_capacity(len);
            for _ in 0..len {
                if let Some(c) = chars.peek()
                    && c.is_ascii_hexdigit()
                {
                    unicode_seq.push(chars.next().unwrap());
                } else {
                    break;
                }
            }

            if unicode_seq.is_empty() && len == 2 {
                res.push(default_char);
                return Ok(());
            } else if unicode_seq.len() < len && len != 2 {
                return Err("invalid unicode sequence: must be \\uXXXX or \\UXXXXXXXX".to_owned());
            }

            if len == 2 {
                let number = [u8::from_str_radix(&unicode_seq, 16)
                    .map_err(|e| format!("invalid unicode sequence: {}", e))?];

                res.push(
                    std::str::from_utf8(&number)
                        .map_err(|err| format!("invalid unicode sequence: {}", err))?
                        .chars()
                        .next()
                        .unwrap(),
                );
            } else {
                let number = u32::from_str_radix(&unicode_seq, 16)
                    .map_err(|e| format!("invalid unicode sequence: {}", e))?;
                res.push(
                    char::from_u32(number)
                        .ok_or_else(|| format!("invalid unicode sequence: {}", unicode_seq))?,
                );
            }
            Ok(())
        }

        // Handle `\o`, `\oo` and `\ooo` octal sequences; `digit` is the first
        // octal digit, already consumed by the caller.
        fn octal_byte_process(
            chars: &mut Peekable<Chars<'_>>,
            res: &mut String,
            digit: char,
        ) -> Result<(), String> {
            let mut unicode_seq: String = String::with_capacity(3);
            unicode_seq.push(digit);
            for _ in 0..2 {
                if let Some(c) = chars.peek()
                    && matches!(*c, '0'..='7')
                {
                    unicode_seq.push(chars.next().unwrap());
                } else {
                    break;
                }
            }

            let number = [u8::from_str_radix(&unicode_seq, 8)
                .map_err(|e| format!("invalid unicode sequence: {}", e))?];

            res.push(
                std::str::from_utf8(&number)
                    .map_err(|err| format!("invalid unicode sequence: {}", err))?
                    .chars()
                    .next()
                    .unwrap(),
            );
            Ok(())
        }

        let mut chars = s.chars().peekable();
        let mut res = String::with_capacity(s.len());

        while let Some(c) = chars.next() {
            if c == '\\' {
                match chars.next() {
                    None => {
                        return Err("unterminated escape sequence".to_owned());
                    }
                    Some(next_c) => match next_c {
                        'b' => res.push('\u{08}'),
                        'f' => res.push('\u{0C}'),
                        'n' => res.push('\n'),
                        'r' => res.push('\r'),
                        't' => res.push('\t'),
                        'x' => hex_byte_process(&mut chars, &mut res, 2, 'x')?,
                        'u' => hex_byte_process(&mut chars, &mut res, 4, 'u')?,
                        'U' => hex_byte_process(&mut chars, &mut res, 8, 'U')?,
                        digit @ '0'..='7' => octal_byte_process(&mut chars, &mut res, digit)?,
                        // Any other escaped character is taken literally.
                        _ => res.push(next_c),
                    },
                }
            } else {
                res.push(c);
            }
        }

        Ok(res)
    }

    /// Tokenize a multi-line comment, handling nested `/* ... */` pairs.
    fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();

        let mut nested = 1;
        let mut last_ch = ' ';

        loop {
            match self.next() {
                Some(ch) => {
                    if last_ch == '/' && ch == '*' {
                        nested += 1;
                    } else if last_ch == '*' && ch == '/' {
                        nested -= 1;
                        if nested == 0 {
                            // Drop the '*' of the closing "*/" pushed on the
                            // previous iteration.
                            s.pop();
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        }
                    }
                    s.push(ch);
                    last_ch = ch;
                }
                None => break self.error("Unexpected EOF while in a multi-line comment"),
            }
        }
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
        self.next();
        Ok(Some(t))
    }

    /// Read from `self` until `predicate` returns false or EOF is hit.
    /// Return the characters read as a String, and keep the first
    /// non-matching char available as `self.peek()`.
    fn peeking_take_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> String {
        let mut s = String::new();
        while let Some(ch) = self.peek() {
            if predicate(ch) {
                self.next(); // consume
                s.push(ch);
            } else {
                break;
            }
        }
        s
    }
}

/// True if `ch` can start a delimited identifier (only `"` here).
fn is_delimited_identifier_start(ch: char) -> bool {
    ch == '"'
}

/// True if an identifier can start with `ch` (an ASCII letter or underscore).
fn is_identifier_start(ch: char) -> bool {
    ch.is_ascii_alphabetic() || ch == '_'
}

/// True if an identifier can contain `ch` after the first character.
fn is_identifier_part(ch: char) -> bool {
    ch.is_ascii_alphanumeric() || ch == '$' || ch == '_'
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizer_error_impl() {
        use std::error::Error;

        let err = TokenizerError {
            message: "test".into(),
            line: 1,
            col: 1,
            context: "LINE 1:".to_owned(),
        };

        assert!(err.source().is_none());
        assert_eq!(err.to_string(), "test at line 1, column 1\nLINE 1:");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }
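
    // A sketch exercising the 0x/0o/0b radix-prefix branch of `next_token`
    // above; the SQL string is illustrative.
    #[test]
    fn tokenize_radix_prefixed_number() {
        let sql = String::from("SELECT 0xAF 0o17 0b101");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("0xAF")),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("0o17")),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("0b101")),
        ];

        compare(expected, tokens);
    }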

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1")),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::Op("||".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }
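
    // A minimal sketch of the E'...' path above; the expected fields follow
    // the construction in `tokenize_single_quoted_string_with_escape`.
    #[test]
    fn tokenize_cstyle_escaped_string() {
        let sql = String::from(r"SELECT E'a\nb'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::CstyleEscapesString(CstyleEscapedString {
                value: "a\nb".to_owned(),
                raw: r"a\nb".to_owned(),
            }),
        ];

        compare(expected, tokens);
    }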

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }
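
    // A sketch of the dollar-quoting path in `tokenize_dollar_preceded_value`
    // above, covering both the anonymous $$...$$ and tagged $tag$...$tag$
    // forms; the SQL string is illustrative.
    #[test]
    fn tokenize_dollar_quoted_string() {
        let sql = String::from("SELECT $$hello$$, $tag$world$tag$");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "hello".to_owned(),
                tag: None,
            }),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "world".to_owned(),
                tag: Some("tag".to_owned()),
            }),
        ];

        compare(expected, tokens);
    }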

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }
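
    // A sketch of the `$n` parameter fallback in
    // `tokenize_dollar_preceded_value` above (no second `$` follows).
    #[test]
    fn tokenize_parameter() {
        let sql = String::from("SELECT $1, $2");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Parameter(String::from("1")),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Parameter(String::from("2")),
        ];

        compare(expected, tokens);
    }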

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\nمصطفىh");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_owned())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");
        let mut tokenizer = Tokenizer::new(&sql);
        assert_eq!(
            tokenizer.tokenize_with_whitespace(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_owned(),
                line: 1,
                col: 12,
                context: "LINE 1: select 'foo\n                   ^".to_owned(),
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }
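
    // A sketch covering the N'...' and X'...' literal branches of
    // `next_token` above; the SQL string is illustrative.
    #[test]
    fn tokenize_national_and_hex_string_literal() {
        let sql = String::from("SELECT N'national', X'deadBEEF'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::NationalStringLiteral(String::from("national")),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::HexStringLiteral(String::from("deadBEEF")),
        ];

        compare(expected, tokens);
    }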

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Number("0".to_owned()),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_owned(),
                comment: "this is a comment\n".to_owned(),
            }),
            Token::Number("1".to_owned()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_owned(),
            comment: "this is a comment".to_owned(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Number("0".to_owned()),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_owned(),
            )),
            Token::Number("1".to_owned()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Number("0".to_owned()),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_owned(),
            )),
            Token::Number("1".to_owned()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_owned())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");
        let mut tokenizer = Tokenizer::new(&sql);
        assert_eq!(
            tokenizer.tokenize_with_whitespace(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_owned(),
                line: 1,
                col: 5,
                context: "LINE 1: \"foo\n            ^".to_owned(),
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
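
    // A sketch of the `:`/`::` disambiguation in `next_token` above.
    #[test]
    fn tokenize_double_colon_cast() {
        let sql = String::from("SELECT 1::float8");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
            Token::DoubleColon,
            Token::make_word("float8", None),
        ];

        compare(expected, tokens);
    }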

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let mut tokenizer = Tokenizer::new(sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("~".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("~*".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("!~".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("!~*".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_array() {
        let sql = String::from("SELECT '{1, 2, 3}'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("{1, 2, 3}")),
        ];

        compare(expected, tokens);
    }
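
    // A sketch of `reject_number_junk` above; only the error message is
    // asserted, since the exact context snippet is computed by `error`.
    #[test]
    fn tokenize_trailing_junk_after_number() {
        let sql = String::from("SELECT 1foo");
        let mut tokenizer = Tokenizer::new(&sql);
        let err = tokenizer.tokenize_with_whitespace().unwrap_err();
        assert_eq!(err.message, "trailing junk after numeric literal");
    }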

    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        assert_eq!(expected, actual);
    }
}