#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::fmt;
use core::fmt::Debug;
use core::iter::Peekable;
use core::str::Chars;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use crate::ast::{CstyleEscapedString, DollarQuotedString};
use crate::keywords::{ALL_KEYWORDS, ALL_KEYWORDS_INDEX, Keyword};

/// A SQL token produced by the tokenizer.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag$string$tag$
    DollarQuotedString(DollarQuotedString),
    /// Single quoted string with C-style escapes: i.e: E'string'
    CstyleEscapesString(CstyleEscapedString),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e: X'deadbeef'
    HexStringLiteral(String),
    /// A positional parameter, e.g. `$1`
    Parameter(String),
    /// Comma `,`
    Comma,
    /// Whitespace (space, tab, newline or comment)
    Whitespace(Whitespace),
    /// An operator with no dedicated token, e.g. `||` or `!~*`
    Op(String),
    /// Equality operator `=`
    Eq,
    /// Not-equals operator `<>` (or `!=`)
    Neq,
    /// Less-than operator `<`
    Lt,
    /// Greater-than operator `>`
    Gt,
    /// Less-than-or-equals operator `<=`
    LtEq,
    /// Greater-than-or-equals operator `>=`
    GtEq,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Modulo operator `%`
    Mod,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period `.` (used for compound identifiers)
    Period,
    /// Colon `:`
    Colon,
    /// Double colon `::` (used for casting)
    DoubleColon,
    /// Semicolon `;`
    SemiColon,
    /// Backslash `\`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right arrow `=>`
    RArrow,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(w) => write!(f, "{}", w),
            Token::Number(n) => write!(f, "{}", n),
            Token::Char(c) => write!(f, "{}", c),
            Token::SingleQuotedString(s) => write!(f, "'{}'", s),
            Token::DollarQuotedString(s) => write!(f, "{}", s),
            Token::NationalStringLiteral(s) => write!(f, "N'{}'", s),
            Token::HexStringLiteral(s) => write!(f, "X'{}'", s),
            Token::CstyleEscapesString(s) => write!(f, "E'{}'", s),
            Token::Parameter(s) => write!(f, "${}", s),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{}", ws),
            Token::Op(op) => write!(f, "{}", op),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_owned(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }

    pub fn with_location(self, location: Location) -> TokenWithLocation {
        TokenWithLocation::new(self, location.line, location.column)
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Word {
    /// The value of the token, without the enclosing quotes.
    pub value: String,
    /// The starting quote character, if the identifier was quoted
    /// (`"` in standard SQL; `[` and `` ` `` in some dialects).
    pub quote_style: Option<char>,
    /// The keyword this word matches, or `Keyword::NoKeyword` if the word
    /// was quoted or did not match any known keyword.
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
        }
    }
}

/// The location of a token in the source text.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Location {
    /// Line number, starting from 1.
    pub line: u64,
    /// Column number, starting from 1.
    pub column: u64,
}

/// A [`Token`] together with its [`Location`] in the source text.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    pub token: Token,
    pub location: Location,
}

impl TokenWithLocation {
    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
        TokenWithLocation {
            token,
            location: Location { line, column },
        }
    }

    pub fn eof() -> TokenWithLocation {
        TokenWithLocation::new(Token::EOF, 0, 0)
    }
}

impl PartialEq<Token> for TokenWithLocation {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithLocation> for Token {
    fn eq(&self, other: &TokenWithLocation) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithLocation {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.token == Token::EOF {
            write!(f, "end of input")
        } else {
            write!(
                f,
                "{} at line {}, column {}",
                self.token, self.location.line, self.location.column
            )
        }
    }
}

/// A tokenizer error, with the location at which it occurred and a rendered
/// context line for error reporting.
#[derive(Debug, PartialEq)]
pub struct TokenizerError {
    pub message: String,
    pub line: u64,
    pub col: u64,
    pub context: String,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{} at line {}, column {}\n{}",
            self.message, self.line, self.col, self.context
        )
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

/// SQL Tokenizer.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    sql: &'a str,
    chars: Peekable<Chars<'a>>,
    line: u64,
    col: u64,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement.
    pub fn new(query: &'a str) -> Self {
        Self {
            sql: query,
            chars: query.chars().peekable(),
            line: 1,
            col: 1,
        }
    }

    /// Consume the next character, keeping track of the current line and column.
    fn next(&mut self) -> Option<char> {
        let ch = self.chars.next();
        if let Some(ch) = ch {
            match ch {
                '\n' => {
                    self.line += 1;
                    self.col = 1;
                }
                '\t' => self.col += 4,
                _ => self.col += 1,
            }
        }
        ch
    }

    /// Peek at the next character without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().cloned()
    }

    /// Tokenize the statement and produce a vector of tokens with locations.
    ///
    /// Whitespace and comments are discarded.
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
        let tokens = self.tokenize()?;
        Ok(tokens
            .into_iter()
            .filter(|token| !matches!(&token.token, Token::Whitespace(_)))
            .collect())
    }

    /// Tokenize the statement and produce a vector of tokens without locations.
    ///
    /// Whitespace and comments are kept.
    #[allow(dead_code)]
    fn tokenize_with_whitespace(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let tokens = self.tokenize()?;
        Ok(tokens.into_iter().map(|t| t.token).collect())
    }

    /// Tokenize the statement and produce a vector of tokens with locations.
    ///
    /// Whitespace and comments are kept.
    fn tokenize(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
        let mut tokens = Vec::new();
        while let Some(token) = self.next_token_with_location()? {
            tokens.push(token);
        }
        Ok(tokens)
    }

    /// Get the next token, if any, tagged with the location at which it starts.
    fn next_token_with_location(&mut self) -> Result<Option<TokenWithLocation>, TokenizerError> {
        let loc = Location {
            line: self.line,
            column: self.col,
        };
        self.next_token()
            .map(|t| t.map(|token| token.with_location(loc)))
    }

    /// Get the next token, or return `None` at end of input.
    fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        // The set of characters that may appear in (PostgreSQL-style) operators,
        // expressible either as a `match` pattern or as an array literal.
        macro_rules! op_chars {
            (all as_pat) => {
                '+' | '-' | '*' | '/' | '<' | '>' | '=' | op_chars!(ext as_pat)
            };
            (ext $m:ident) => {
                op_chars!($m '~' '!' '@' '#' '%' '^' '&' '|' '`' '?')
            };
            (as_arr $($c:literal)+) => {
                [ $($c),+ ]
            };
            (as_pat $($c:literal)+) => {
                $($c)|+
            };
        }

        match self.peek() {
            Some(ch) => match ch {
                ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    self.next();
                    // Treat both `\r` and `\r\n` as a single newline.
                    if let Some('\n') = self.peek() {
                        self.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                'N' => {
                    self.next(); // consume the 'N'
                    match self.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_single_quoted_string()?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word('N');
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'e' | x @ 'E' => {
                    self.next(); // consume the 'e' or 'E'
                    match self.peek() {
                        Some('\'') => {
                            // E'...' - a string literal with C-style escapes
                            let s = self.tokenize_single_quoted_string_with_escape()?;
                            Ok(Some(Token::CstyleEscapesString(s)))
                        }
                        _ => {
                            // regular identifier starting with an "e" or "E"
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'x' | x @ 'X' => {
                    self.next(); // consume the 'x' or 'X'
                    match self.peek() {
                        Some('\'') => {
                            // X'...' - a <hexadecimal character string literal>
                            let s = self.tokenize_single_quoted_string()?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "x" or "X"
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // identifier or keyword
                ch if is_identifier_start(ch) => {
                    self.next(); // consume the first char
                    let s = self.tokenize_word(ch);

                    Ok(Some(Token::make_word(&s, None)))
                }
                // string
                '\'' => {
                    let s = self.tokenize_single_quoted_string()?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if is_delimited_identifier_start(quote_start) => {
                    self.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let s = self.peeking_take_while(|ch| ch != quote_end);
                    if self.next() == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.error(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        ))
                    }
                }
                // number or period
                '0'..='9' | '.' => {
                    let mut s = self.peeking_take_while(|ch| ch.is_ascii_digit());

                    // Integer literal with an explicit radix: `0x..`, `0o..` or `0b..`.
                    if s == "0"
                        && let Some(radix) = self.peek()
                        && "xob".contains(radix.to_ascii_lowercase())
                    {
                        self.next();
                        let radix = radix.to_ascii_lowercase();
                        let base = match radix {
                            'x' => 16,
                            'o' => 8,
                            'b' => 2,
                            _ => unreachable!(),
                        };
                        let s2 = self.peeking_take_while(|ch| ch.is_digit(base));
                        if s2.is_empty() {
                            return self.error("incomplete integer literal");
                        }
                        self.reject_number_junk()?;
                        return Ok(Some(Token::Number(format!("0{radix}{s2}"))));
                    }

                    // match one period
                    if let Some('.') = self.peek() {
                        s.push('.');
                        self.next();
                    }
                    s += &self.peeking_take_while(|ch| ch.is_ascii_digit());

                    // No digits at all -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    match self.peek() {
                        // Number in scientific notation, e.g. 1e6 or 1e-6.
                        Some('e') | Some('E') => {
                            s.push('e');
                            self.next();

                            if let Some('-') = self.peek() {
                                s.push('-');
                                self.next();
                            }
                            s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
                            self.reject_number_junk()?;
                            return Ok(Some(Token::Number(s)));
                        }
                        _ => {}
                    };
                    self.reject_number_junk()?;
                    Ok(Some(Token::Number(s)))
                }
                // punctuation
                '(' => self.consume_and_return(Token::LParen),
                ')' => self.consume_and_return(Token::RParen),
                ',' => self.consume_and_return(Token::Comma),
                ':' => {
                    self.next();
                    match self.peek() {
                        Some(':') => self.consume_and_return(Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value()?)),
                ';' => self.consume_and_return(Token::SemiColon),
                '\\' => self.consume_and_return(Token::Backslash),
                '[' => self.consume_and_return(Token::LBracket),
                ']' => self.consume_and_return(Token::RBracket),
                '{' => self.consume_and_return(Token::LBrace),
                '}' => self.consume_and_return(Token::RBrace),
                // operators
                op_chars!(all as_pat) => {
                    // Greedily take the longest run of operator characters on a
                    // cloned tokenizer, then decide how much of it is the operator.
                    let mut trial = self.clone();
                    let op_taken = trial.peeking_take_while(|c| matches!(c, op_chars!(all as_pat)));
                    // `/*` and `--` inside the run start a comment, so the
                    // operator (if any) ends right before the earliest of them.
                    let slash_star = op_taken.find("/*");
                    let dash_dash = op_taken.find("--");
                    let pos = match (slash_star, dash_dash) {
                        (Some(s), Some(d)) => s.min(d),
                        (Some(s), None) => s,
                        (None, Some(d)) => d,
                        (None, None) => op_taken.len(),
                    };
                    let mut op = &op_taken[..pos];
                    if op.is_empty() {
                        // The run starts with a comment: tokenize it as whitespace.
                        match self.next() {
                            Some('-') => {
                                self.next(); // consume the second '-'
                                let comment = self.tokenize_single_line_comment();

                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }
                            Some('/') => {
                                self.next(); // consume the '*'
                                return self.tokenize_multiline_comment();
                            }
                            _ => unreachable!(),
                        }
                    };
                    // Following PostgreSQL, a multi-character operator may only end
                    // in `+` or `-` if it also contains a character from the
                    // extended set; otherwise give the trailing sign back.
                    if op.len() > 1
                        && op.ends_with(['+', '-'])
                        && !op.contains(op_chars!(ext as_arr))
                    {
                        op = op.trim_end_matches(['+', '-']);
                        if op.is_empty() {
                            op = &op_taken[..1];
                        }
                    }
                    if op.len() == op_taken.len() {
                        *self = trial;
                    } else {
                        // Consume only the characters that belong to the operator.
                        for _ in op.chars() {
                            self.next();
                        }
                    }
                    match op {
                        // Operators with dedicated token variants.
                        "+" => Ok(Some(Token::Plus)),
                        "-" => Ok(Some(Token::Minus)),
                        "*" => Ok(Some(Token::Mul)),
                        "/" => Ok(Some(Token::Div)),
                        "%" => Ok(Some(Token::Mod)),
                        "^" => Ok(Some(Token::Caret)),
                        "<" => Ok(Some(Token::Lt)),
                        ">" => Ok(Some(Token::Gt)),
                        "=" => Ok(Some(Token::Eq)),
                        // Named arguments, e.g. `FUNCTION(key => value)`.
                        "=>" => Ok(Some(Token::RArrow)),
                        "<=" => Ok(Some(Token::LtEq)),
                        ">=" => Ok(Some(Token::GtEq)),
                        "<>" => Ok(Some(Token::Neq)),
                        "!=" => Ok(Some(Token::Neq)),
                        "|" => Ok(Some(Token::Pipe)),
                        // Everything else is a generic operator, left to the parser.
                        _ => Ok(Some(Token::Op(op.to_owned()))),
                    }
                }
                other => self.consume_and_return(Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Tokenize a dollar-preceded value: either a positional parameter (`$1`)
    /// or a dollar-quoted string (`$$...$$` or `$tag$...$tag$`).
    fn tokenize_dollar_preceded_value(&mut self) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        self.next();

        if let Some('$') = self.peek() {
            self.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(ch) = self.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        self.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                self.next();
            }

            return if self.peek().is_none() && !is_terminated {
                self.error("Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            value.push_str(&self.peeking_take_while(|ch| ch.is_alphanumeric() || ch == '_'));

            if let Some('$') = self.peek() {
                self.next();
                s.push_str(&self.peeking_take_while(|ch| ch != '$'));

                match self.peek() {
                    Some('$') => {
                        self.next();
                        for c in value.chars() {
                            let next_char = self.next();
                            if Some(c) != next_char {
                                return self.error(format!(
                                    "Unterminated dollar-quoted string at or near \"{}\"",
                                    value
                                ));
                            }
                        }

                        if let Some('$') = self.peek() {
                            self.next();
                        } else {
                            return self.error("Unterminated dollar-quoted string, expected $");
                        }
                    }
                    _ => {
                        return self.error("Unterminated dollar-quoted, expected $");
                    }
                }
            } else {
                return Ok(Token::Parameter(value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
        let prefix = format!("LINE {}: ", self.line);
        let sql_line = self.sql.split('\n').nth(self.line as usize - 1).unwrap();
        let cursor = " ".repeat(prefix.len() + self.col as usize - 1);
        let context = format!("{}{}\n{}^", prefix, sql_line, cursor);
        Err(TokenizerError {
            message: message.into(),
            col: self.col,
            line: self.line,
            context,
        })
    }

    fn reject_number_junk(&mut self) -> Result<(), TokenizerError> {
        if let Some(ch) = self.peek()
            && is_identifier_start(ch)
        {
            return self.error("trailing junk after numeric literal");
        }
        Ok(())
    }

    /// Tokenize a single-line comment, up to and including the newline (if any).
    fn tokenize_single_line_comment(&mut self) -> String {
        let mut comment = self.peeking_take_while(|ch| ch != '\n');
        if let Some(ch) = self.next() {
            assert_eq!(ch, '\n');
            comment.push(ch);
        }
        comment
    }

    /// Tokenize an identifier or keyword, after the first char is already consumed.
    fn tokenize_word(&mut self, first_char: char) -> String {
        let mut s = first_char.to_string();
        s.push_str(&self.peeking_take_while(is_identifier_part));
        s
    }

    /// Read a single-quoted string, expecting the opening quote to be the next
    /// character. A doubled quote (`''`) encodes a literal quote; backslashes
    /// are kept verbatim.
    fn tokenize_single_quoted_string(&mut self) -> Result<String, TokenizerError> {
        let mut s = String::new();
        self.next(); // consume the opening quote

        // Backslash escaping of quotes is not enabled here, so this stays false.
        let mut is_escaped = false;
        while let Some(ch) = self.peek() {
            match ch {
                '\'' => {
                    self.next(); // consume
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if self.peek().map(|c| c == '\'').unwrap_or(false) {
                        s.push(ch);
                        self.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    s.push(ch);
                    self.next();
                }
                _ => {
                    self.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.error("Unterminated string literal")
    }

    /// Read a single-quoted string with C-style escapes, expecting the opening
    /// quote to be the next character. Returns both the raw text between the
    /// quotes and the unescaped value.
    fn tokenize_single_quoted_string_with_escape(
        &mut self,
    ) -> Result<CstyleEscapedString, TokenizerError> {
        let mut terminated = false;
        let mut s = String::new();
        self.next(); // consume the opening quote

        while let Some(ch) = self.peek() {
            match ch {
                '\'' => {
                    self.next(); // consume
                    // A doubled quote encodes a literal quote; normalize it to `\'`.
                    if self.peek().map(|c| c == '\'').unwrap_or(false) {
                        s.push('\\');
                        s.push(ch);
                        self.next();
                    } else {
                        terminated = true;
                        break;
                    }
                }
                '\\' => {
                    s.push(ch);
                    self.next();
                    // Keep an escaped quote or backslash together with its
                    // backslash, so it neither terminates the string nor starts
                    // a new escape.
                    if self.peek().map(|c| c == '\'' || c == '\\').unwrap_or(false) {
                        s.push(self.next().unwrap());
                    }
                }
                _ => {
                    self.next(); // consume
                    s.push(ch);
                }
            }
        }

        if !terminated {
            return self.error("Unterminated string literal");
        }

        let unescaped = match Self::unescape_c_style(&s) {
            Ok(unescaped) => unescaped,
            Err(e) => return self.error(e),
        };

        Ok(CstyleEscapedString {
            value: unescaped,
            raw: s,
        })
    }

    /// Unescape a C-style escaped string, following the rules of PostgreSQL's
    /// `E'...'` syntax: `\b`, `\f`, `\n`, `\r`, `\t`, octal (up to three octal
    /// digits), hex (`\x` with up to two hex digits) and unicode (`\uXXXX`,
    /// `\UXXXXXXXX`) escapes are recognized; any other escaped character
    /// stands for itself.
    fn unescape_c_style(s: &str) -> Result<String, String> {
        fn hex_byte_process(
            chars: &mut Peekable<Chars<'_>>,
            res: &mut String,
            len: usize,
            default_char: char,
        ) -> Result<(), String> {
            let mut unicode_seq: String = String::with_capacity(len);
            for _ in 0..len {
                if let Some(c) = chars.peek()
                    && c.is_ascii_hexdigit()
                {
                    unicode_seq.push(chars.next().unwrap());
                } else {
                    break;
                }
            }

            if unicode_seq.is_empty() && len == 2 {
                // `\x` with no hex digits is not an escape at all.
                res.push(default_char);
                return Ok(());
            } else if unicode_seq.len() < len && len != 2 {
                return Err("invalid unicode sequence: must be \\uXXXX or \\UXXXXXXXX".to_owned());
            }

            if len == 2 {
                let number = [u8::from_str_radix(&unicode_seq, 16)
                    .map_err(|e| format!("invalid unicode sequence: {}", e))?];

                // Use `core::str` so this also builds without the `std` feature.
                res.push(
                    core::str::from_utf8(&number)
                        .map_err(|err| format!("invalid unicode sequence: {}", err))?
                        .chars()
                        .next()
                        .unwrap(),
                );
            } else {
                let number = u32::from_str_radix(&unicode_seq, 16)
                    .map_err(|e| format!("invalid unicode sequence: {}", e))?;
                res.push(
                    char::from_u32(number)
                        .ok_or_else(|| format!("invalid unicode sequence: {}", unicode_seq))?,
                );
            }
            Ok(())
        }

        fn octal_byte_process(
            chars: &mut Peekable<Chars<'_>>,
            res: &mut String,
            digit: char,
        ) -> Result<(), String> {
            let mut unicode_seq: String = String::with_capacity(3);
            unicode_seq.push(digit);
            for _ in 0..2 {
                if let Some(c) = chars.peek()
                    && matches!(*c, '0'..='7')
                {
                    unicode_seq.push(chars.next().unwrap());
                } else {
                    break;
                }
            }

            let number = [u8::from_str_radix(&unicode_seq, 8)
                .map_err(|e| format!("invalid unicode sequence: {}", e))?];

            res.push(
                core::str::from_utf8(&number)
                    .map_err(|err| format!("invalid unicode sequence: {}", err))?
                    .chars()
                    .next()
                    .unwrap(),
            );
            Ok(())
        }

        let mut chars = s.chars().peekable();
        let mut res = String::with_capacity(s.len());

        while let Some(c) = chars.next() {
            if c == '\\' {
                match chars.next() {
                    None => {
                        return Err("unterminated escape sequence".to_owned());
                    }
                    Some(next_c) => match next_c {
                        'b' => res.push('\u{08}'),
                        'f' => res.push('\u{0C}'),
                        'n' => res.push('\n'),
                        'r' => res.push('\r'),
                        't' => res.push('\t'),
                        'x' => hex_byte_process(&mut chars, &mut res, 2, 'x')?,
                        'u' => hex_byte_process(&mut chars, &mut res, 4, 'u')?,
                        'U' => hex_byte_process(&mut chars, &mut res, 8, 'U')?,
                        digit @ '0'..='7' => octal_byte_process(&mut chars, &mut res, digit)?,
                        _ => res.push(next_c),
                    },
                }
            } else {
                res.push(c);
            }
        }

        Ok(res)
    }

    fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();

        let mut nested = 1;
        let mut last_ch = ' ';

        loop {
            match self.next() {
                Some(ch) => {
                    if last_ch == '/' && ch == '*' {
                        nested += 1;
                    } else if last_ch == '*' && ch == '/' {
                        nested -= 1;
                        if nested == 0 {
                            s.pop();
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        }
                    }
                    s.push(ch);
                    last_ch = ch;
                }
                None => break self.error("Unexpected EOF while in a multi-line comment"),
            }
        }
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
        self.next();
        Ok(Some(t))
    }

    /// Read from `self` until `predicate` returns `false` or EOF is hit.
    /// Return the characters read as a `String`, and keep the first
    /// non-matching char available as `self.peek()`.
    fn peeking_take_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> String {
        let mut s = String::new();
        while let Some(ch) = self.peek() {
            if predicate(ch) {
                self.next(); // consume
                s.push(ch);
            } else {
                break;
            }
        }
        s
    }
}

/// Determine if a character starts a delimited (quoted) identifier.
/// Only the ANSI double quote is accepted here; `[` (MS SQL) and `` ` ``
/// (MySQL) delimiters are not recognized as identifier starts.
fn is_delimited_identifier_start(ch: char) -> bool {
    ch == '"'
}

/// Determine if a character is a valid start of an (unquoted) identifier.
fn is_identifier_start(ch: char) -> bool {
    ch.is_ascii_alphabetic() || ch == '_'
}

/// Determine if a character is a valid unquoted identifier character.
fn is_identifier_part(ch: char) -> bool {
    ch.is_ascii_alphanumeric() || ch == '$' || ch == '_'
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            line: 1,
            col: 1,
            context: "LINE 1:".to_owned(),
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at line 1, column 1\nLINE 1:");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1")),
        ];

        compare(expected, tokens);
    }

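    // Added test (not part of the original suite): a sketch of the expected
    // behavior of the explicit-radix branch of `next_token`, which lowercases
    // the radix letter and keeps the digits as written.
    #[test]
    fn tokenize_radix_number() {
        let sql = String::from("SELECT 0x2A, 0o17, 0b101");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("0x2A")),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("0o17")),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("0b101")),
        ];

        compare(expected, tokens);
    }
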
    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1")),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::Op("||".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

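    // Added test (not part of the original suite): `N'...'` and `X'...'`
    // prefixes produce dedicated string-literal tokens; the hex content is
    // not validated by the tokenizer.
    #[test]
    fn tokenize_national_and_hex_string_literal() {
        let sql = String::from("SELECT N'foo', X'deadBEEF'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::NationalStringLiteral(String::from("foo")),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::HexStringLiteral(String::from("deadBEEF")),
        ];

        compare(expected, tokens);
    }
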
    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\nمصطفىh");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_owned())];
        compare(expected, tokens);
    }

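    // Added test (not part of the original suite): a sketch of `E'...'`
    // handling, where the token keeps the raw text alongside the value
    // produced by `unescape_c_style`.
    #[test]
    fn tokenize_cstyle_escaped_string() {
        let sql = String::from(r"SELECT E'tab\tnewline\n'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::CstyleEscapesString(CstyleEscapedString {
                value: "tab\tnewline\n".to_owned(),
                raw: r"tab\tnewline\n".to_owned(),
            }),
        ];

        compare(expected, tokens);
    }
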
    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");
        let mut tokenizer = Tokenizer::new(&sql);
        assert_eq!(
            tokenizer.tokenize_with_whitespace(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_owned(),
                line: 1,
                col: 12,
                context: "LINE 1: select 'foo\n                   ^".to_owned(),
            })
        );
    }

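    // Added test (not part of the original suite): dollar-quoted strings with
    // and without a tag, per `tokenize_dollar_preceded_value`.
    #[test]
    fn tokenize_dollar_quoted_string() {
        let sql = String::from("SELECT $$hello$$, $tag$world$tag$");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "hello".to_owned(),
                tag: None,
            }),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "world".to_owned(),
                tag: Some("tag".to_owned()),
            }),
        ];

        compare(expected, tokens);
    }
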
    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

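    // Added test (not part of the original suite): `$` followed by
    // alphanumerics but no second `$` is a positional parameter.
    #[test]
    fn tokenize_parameter() {
        let sql = String::from("SELECT $1, $2");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Parameter(String::from("1")),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Parameter(String::from("2")),
        ];

        compare(expected, tokens);
    }
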
    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

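    // Added test (not part of the original suite): `::` is tokenized as a
    // single `DoubleColon`, rather than two `Colon` tokens.
    #[test]
    fn tokenize_double_colon_cast() {
        let sql = String::from("SELECT a::int");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("a", None),
            Token::DoubleColon,
            Token::make_word("int", None),
        ];

        compare(expected, tokens);
    }
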
    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Number("0".to_owned()),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_owned(),
                comment: "this is a comment\n".to_owned(),
            }),
            Token::Number("1".to_owned()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_owned(),
            comment: "this is a comment".to_owned(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Number("0".to_owned()),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_owned(),
            )),
            Token::Number("1".to_owned()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Number("0".to_owned()),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_owned(),
            )),
            Token::Number("1".to_owned()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_owned())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");
        let mut tokenizer = Tokenizer::new(&sql);
        assert_eq!(
            tokenizer.tokenize_with_whitespace(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_owned(),
                line: 1,
                col: 5,
                context: "LINE 1: \"foo\n            ^".to_owned(),
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let mut tokenizer = Tokenizer::new(sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("~".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("~*".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("!~".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Op("!~*".to_owned()),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_array() {
        let sql = String::from("SELECT '{1, 2, 3}'");
        let mut tokenizer = Tokenizer::new(&sql);
        let tokens = tokenizer.tokenize_with_whitespace().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("{1, 2, 3}")),
        ];

        compare(expected, tokens);
    }

    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        assert_eq!(expected, actual);
    }
}