1#[cfg(not(feature = "std"))]
20use alloc::{
21 borrow::ToOwned,
22 format,
23 string::{String, ToString},
24 vec,
25 vec::Vec,
26};
27use core::fmt;
28use core::fmt::Debug;
29use core::iter::Peekable;
30use core::str::Chars;
31
32#[cfg(feature = "serde")]
33use serde::{Deserialize, Serialize};
34
35use crate::ast::{CstyleEscapedString, DollarQuotedString};
36use crate::keywords::{ALL_KEYWORDS, ALL_KEYWORDS_INDEX, Keyword};
37
/// A lexical token produced by the SQL [`Tokenizer`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Token {
    /// End-of-file marker, not a real token in the input
    EOF,
    /// A keyword or an (optionally quoted) identifier
    Word(Word),
    /// A numeric literal, kept as its source text
    Number(String),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag$string$tag$
    DollarQuotedString(DollarQuotedString),
    /// String with C-style escapes: i.e: E'string'
    CstyleEscapesString(CstyleEscapedString),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e: X'deadbeef'
    HexStringLiteral(String),
    /// Parameter placeholder: `$1` (the text after `$`)
    Parameter(String),
    /// Comma `,`
    Comma,
    /// Whitespace (space, tab, newline, or a comment)
    Whitespace(Whitespace),
    /// Double equals `==`
    DoubleEq,
    /// Equals `=`
    Eq,
    /// Not-equals `<>` (also produced for `!=`)
    Neq,
    /// Less-than `<`
    Lt,
    /// Greater-than `>`
    Gt,
    /// Less-than-or-equals `<=`
    LtEq,
    /// Greater-than-or-equals `>=`
    GtEq,
    /// Spaceship `<=>`
    Spaceship,
    /// Plus `+`
    Plus,
    /// Minus `-`
    Minus,
    /// Asterisk `*` (multiplication or wildcard)
    Mul,
    /// Slash `/`
    Div,
    /// Percent `%`
    Mod,
    /// String concatenation `||`
    Concat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period `.` (member access or decimal point)
    Period,
    /// Colon `:`
    Colon,
    /// Double colon `::` (cast)
    DoubleColon,
    /// Semicolon `;`
    SemiColon,
    /// Backslash `\`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Vertical bar `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Prefix-match `^@`
    Prefix,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Fat arrow `=>`
    RArrow,
    /// Sharp `#`
    Sharp,
    /// Tilde `~`
    Tilde,
    /// `~*`
    TildeAsterisk,
    /// `!~`
    ExclamationMarkTilde,
    /// `!~*`
    ExclamationMarkTildeAsterisk,
    /// `~~`
    DoubleTilde,
    /// `~~*`
    DoubleTildeAsterisk,
    /// `!~~`
    ExclamationMarkDoubleTilde,
    /// `!~~*`
    ExclamationMarkDoubleTildeAsterisk,
    /// Shift-left `<<`
    ShiftLeft,
    /// Shift-right `>>`
    ShiftRight,
    /// Exclamation mark `!`
    ExclamationMark,
    /// Double exclamation mark `!!`
    DoubleExclamationMark,
    /// At sign `@`
    AtSign,
    /// `|/`
    PGSquareRoot,
    /// `||/`
    PGCubeRoot,
    /// `->`
    Arrow,
    /// `->>`
    LongArrow,
    /// `#>`
    HashArrow,
    /// `#>>`
    HashLongArrow,
    /// `#-`
    HashMinus,
    /// `@>`
    AtArrow,
    /// `<@`
    ArrowAt,
    /// `?`
    QuestionMark,
    /// `?|`
    QuestionMarkPipe,
    /// `?&`
    QuestionMarkAmpersand,
    /// `@?`
    AtQuestionMark,
    /// `@@`
    AtAt,
}
184
185impl fmt::Display for Token {
186 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
187 match self {
188 Token::EOF => f.write_str("EOF"),
189 Token::Word(w) => write!(f, "{}", w),
190 Token::Number(n) => write!(f, "{}", n),
191 Token::Char(c) => write!(f, "{}", c),
192 Token::SingleQuotedString(s) => write!(f, "'{}'", s),
193 Token::DollarQuotedString(s) => write!(f, "{}", s),
194 Token::NationalStringLiteral(s) => write!(f, "N'{}'", s),
195 Token::HexStringLiteral(s) => write!(f, "X'{}'", s),
196 Token::CstyleEscapesString(s) => write!(f, "E'{}'", s),
197 Token::Parameter(s) => write!(f, "${}", s),
198 Token::Comma => f.write_str(","),
199 Token::Whitespace(ws) => write!(f, "{}", ws),
200 Token::DoubleEq => f.write_str("=="),
201 Token::Spaceship => f.write_str("<=>"),
202 Token::Eq => f.write_str("="),
203 Token::Neq => f.write_str("<>"),
204 Token::Lt => f.write_str("<"),
205 Token::Gt => f.write_str(">"),
206 Token::LtEq => f.write_str("<="),
207 Token::GtEq => f.write_str(">="),
208 Token::Plus => f.write_str("+"),
209 Token::Minus => f.write_str("-"),
210 Token::Mul => f.write_str("*"),
211 Token::Div => f.write_str("/"),
212 Token::Concat => f.write_str("||"),
213 Token::Mod => f.write_str("%"),
214 Token::LParen => f.write_str("("),
215 Token::RParen => f.write_str(")"),
216 Token::Period => f.write_str("."),
217 Token::Colon => f.write_str(":"),
218 Token::DoubleColon => f.write_str("::"),
219 Token::SemiColon => f.write_str(";"),
220 Token::Backslash => f.write_str("\\"),
221 Token::LBracket => f.write_str("["),
222 Token::RBracket => f.write_str("]"),
223 Token::Ampersand => f.write_str("&"),
224 Token::Caret => f.write_str("^"),
225 Token::Prefix => f.write_str("^@"),
226 Token::Pipe => f.write_str("|"),
227 Token::LBrace => f.write_str("{"),
228 Token::RBrace => f.write_str("}"),
229 Token::RArrow => f.write_str("=>"),
230 Token::Sharp => f.write_str("#"),
231 Token::ExclamationMark => f.write_str("!"),
232 Token::DoubleExclamationMark => f.write_str("!!"),
233 Token::Tilde => f.write_str("~"),
234 Token::TildeAsterisk => f.write_str("~*"),
235 Token::ExclamationMarkTilde => f.write_str("!~"),
236 Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
237 Token::DoubleTilde => f.write_str("~~"),
238 Token::DoubleTildeAsterisk => f.write_str("~~*"),
239 Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
240 Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
241 Token::AtSign => f.write_str("@"),
242 Token::ShiftLeft => f.write_str("<<"),
243 Token::ShiftRight => f.write_str(">>"),
244 Token::PGSquareRoot => f.write_str("|/"),
245 Token::PGCubeRoot => f.write_str("||/"),
246 Token::Arrow => f.write_str("->"),
247 Token::LongArrow => f.write_str("->>"),
248 Token::HashArrow => f.write_str("#>"),
249 Token::HashLongArrow => f.write_str("#>>"),
250 Token::HashMinus => f.write_str("#-"),
251 Token::AtArrow => f.write_str("@>"),
252 Token::ArrowAt => f.write_str("<@"),
253 Token::QuestionMark => f.write_str("?"),
254 Token::QuestionMarkPipe => f.write_str("?|"),
255 Token::QuestionMarkAmpersand => f.write_str("?&"),
256 Token::AtQuestionMark => f.write_str("@?"),
257 Token::AtAt => f.write_str("@@"),
258 }
259 }
260}
261
262impl Token {
263 pub fn make_keyword(keyword: &str) -> Self {
264 Token::make_word(keyword, None)
265 }
266
267 pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
268 let word_uppercase = word.to_uppercase();
269 Token::Word(Word {
270 value: word.to_owned(),
271 quote_style,
272 keyword: if quote_style.is_none() {
273 let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
274 keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
275 } else {
276 Keyword::NoKeyword
277 },
278 })
279 }
280
281 pub fn with_location(self, location: Location) -> TokenWithLocation {
282 TokenWithLocation::new(self, location.line, location.column)
283 }
284}
285
/// A keyword or an identifier, possibly quoted.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Word {
    /// The word's text, without any surrounding quotes.
    pub value: String,
    /// The opening quote character (`"`, `[` or `` ` ``) if the word was
    /// quoted; `None` for a bare word.
    pub quote_style: Option<char>,
    /// The keyword this word matches, or `Keyword::NoKeyword`. Quoted words
    /// are always `NoKeyword` (see `Token::make_word`).
    pub keyword: Keyword,
}
301
302impl fmt::Display for Word {
303 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
304 match self.quote_style {
305 Some(s) if s == '"' || s == '[' || s == '`' => {
306 write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
307 }
308 None => f.write_str(&self.value),
309 _ => panic!("Unexpected quote_style!"),
310 }
311 }
312}
313
314impl Word {
315 fn matching_end_quote(ch: char) -> char {
316 match ch {
317 '"' => '"', '[' => ']', '`' => '`', _ => panic!("unexpected quoting style!"),
321 }
322 }
323}
324
/// A run of whitespace or a comment, preserved so source text can be
/// reproduced by `Display`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Whitespace {
    /// A single space character
    Space,
    /// A newline (also produced for `\r` and `\r\n`)
    Newline,
    /// A tab character
    Tab,
    /// A line comment; `prefix` is the marker that introduced it (e.g. "--")
    /// and `comment` the remainder of the line
    SingleLineComment { comment: String, prefix: String },
    /// The body of a `/* ... */` comment, without the delimiters
    MultiLineComment(String),
}
334
335impl fmt::Display for Whitespace {
336 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
337 match self {
338 Whitespace::Space => f.write_str(" "),
339 Whitespace::Newline => f.write_str("\n"),
340 Whitespace::Tab => f.write_str("\t"),
341 Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
342 Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
343 }
344 }
345}
346
/// A position in the source text (both coordinates are 1-based; the
/// tokenizer starts at line 1, column 1).
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Location {
    /// Line number, starting at 1.
    pub line: u64,
    /// Column number, starting at 1.
    pub column: u64,
}
355
/// A [`Token`] paired with the [`Location`] where it starts.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    /// The token itself.
    pub token: Token,
    /// Where the token begins in the source text.
    pub location: Location,
}
362
363impl TokenWithLocation {
364 pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
365 TokenWithLocation {
366 token,
367 location: Location { line, column },
368 }
369 }
370
371 pub fn eof() -> TokenWithLocation {
372 TokenWithLocation::new(Token::EOF, 0, 0)
373 }
374}
375
376impl PartialEq<Token> for TokenWithLocation {
377 fn eq(&self, other: &Token) -> bool {
378 &self.token == other
379 }
380}
381
382impl PartialEq<TokenWithLocation> for Token {
383 fn eq(&self, other: &TokenWithLocation) -> bool {
384 self == &other.token
385 }
386}
387
388impl fmt::Display for TokenWithLocation {
389 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
390 if self.token == Token::EOF {
391 write!(f, "end of input")
392 } else {
393 write!(
394 f,
395 "{} at line {}, column {}",
396 self.token, self.location.line, self.location.column
397 )
398 }
399 }
400}
401
/// Tokenizer error with the failing position and a rendered context line
/// (see `Tokenizer::error` for the `context` format).
#[derive(Debug, PartialEq)]
pub struct TokenizerError {
    /// What went wrong.
    pub message: String,
    /// 1-based line of the failure.
    pub line: u64,
    /// 1-based column of the failure.
    pub col: u64,
    /// A "LINE n: ..." excerpt with a caret under the failing column.
    pub context: String,
}
410
411impl fmt::Display for TokenizerError {
412 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
413 write!(
414 f,
415 "{} at line {}, column {}\n{}",
416 self.message, self.line, self.col, self.context
417 )
418 }
419}
420
// `std::error::Error` is only implementable when building with `std`;
// `Display` + `Debug` above satisfy the trait's requirements.
#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}
423
/// SQL tokenizer: turns a query string into a stream of [`Token`]s,
/// tracking line/column positions as it goes.
pub struct Tokenizer<'a> {
    /// The full query text, kept so errors can quote the offending line.
    sql: &'a str,
    /// Remaining characters to lex.
    chars: Peekable<Chars<'a>>,
    /// Current line, 1-based; incremented on every consumed '\n'.
    line: u64,
    /// Current column, 1-based; a tab advances it by 4.
    col: u64,
}
431
432impl<'a> Tokenizer<'a> {
433 pub fn new(query: &'a str) -> Self {
435 Self {
436 sql: query,
437 chars: query.chars().peekable(),
438 line: 1,
439 col: 1,
440 }
441 }
442
443 fn next(&mut self) -> Option<char> {
445 let ch = self.chars.next();
446 if let Some(ch) = ch {
447 match ch {
448 '\n' => {
449 self.line += 1;
450 self.col = 1;
451 }
452 '\t' => self.col += 4,
453 _ => self.col += 1,
454 }
455 }
456 ch
457 }
458
459 fn peek(&mut self) -> Option<char> {
461 self.chars.peek().cloned()
462 }
463
464 pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
468 let tokens = self.tokenize()?;
469 Ok(tokens
470 .into_iter()
471 .filter(|token| !matches!(&token.token, Token::Whitespace(_)))
472 .collect())
473 }
474
475 #[allow(dead_code)]
479 fn tokenize_with_whitespace(&mut self) -> Result<Vec<Token>, TokenizerError> {
480 let tokens = self.tokenize()?;
481 Ok(tokens.into_iter().map(|t| t.token).collect())
482 }
483
484 fn tokenize(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
488 let mut tokens = Vec::new();
489 while let Some(token) = self.next_token_with_location()? {
490 tokens.push(token);
491 }
492 Ok(tokens)
493 }
494
495 fn next_token_with_location(&mut self) -> Result<Option<TokenWithLocation>, TokenizerError> {
497 let loc = Location {
498 line: self.line,
499 column: self.col,
500 };
501 self.next_token()
502 .map(|t| t.map(|token| token.with_location(loc)))
503 }
504
    /// Lex one token from the current position.
    ///
    /// Returns `Ok(None)` at end of input. Whitespace and comments are
    /// returned as `Token::Whitespace` so callers can choose to keep or
    /// drop them.
    fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        match self.peek() {
            Some(ch) => match ch {
                ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    self.next();
                    // Fold \r\n into a single Newline token.
                    if let Some('\n') = self.peek() {
                        self.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // N'...' is a national string literal; a bare N starts a word.
                'N' => {
                    self.next();
                    match self.peek() {
                        Some('\'') => {
                            let s = self.tokenize_single_quoted_string()?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word('N');
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // E'...' is a string with C-style escapes; a bare e/E starts a word.
                x @ 'e' | x @ 'E' => {
                    self.next();
                    match self.peek() {
                        Some('\'') => {
                            let s = self.tokenize_single_quoted_string_with_escape()?;
                            Ok(Some(Token::CstyleEscapesString(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // X'...' is a hex string literal; a bare x/X starts a word.
                x @ 'x' | x @ 'X' => {
                    self.next();
                    match self.peek() {
                        Some('\'') => {
                            let s = self.tokenize_single_quoted_string()?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                ch if is_identifier_start(ch) => {
                    self.next();
                    let s = self.tokenize_word(ch);

                    Ok(Some(Token::make_word(&s, None)))
                }
                '\'' => {
                    let s = self.tokenize_single_quoted_string()?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // Delimited (quoted) identifier, e.g. "name".
                quote_start if is_delimited_identifier_start(quote_start) => {
                    self.next();
                    let quote_end = Word::matching_end_quote(quote_start);
                    let s = self.peeking_take_while(|ch| ch != quote_end);
                    if self.next() == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.error(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        ))
                    }
                }
                // Numeric literal, or a lone '.' (Period).
                '0'..='9' | '.' => {
                    let mut s = self.peeking_take_while(|ch| ch.is_ascii_digit());

                    // Radix-prefixed integers: 0x…, 0o…, 0b…
                    if s == "0"
                        && let Some(radix) = self.peek()
                        && "xob".contains(radix.to_ascii_lowercase())
                    {
                        self.next();
                        let radix = radix.to_ascii_lowercase();
                        let base = match radix {
                            'x' => 16,
                            'o' => 8,
                            'b' => 2,
                            _ => unreachable!(),
                        };
                        let s2 = self.peeking_take_while(|ch| ch.is_digit(base));
                        if s2.is_empty() {
                            return self.error("incomplete integer literal");
                        }
                        self.reject_number_junk()?;
                        // Normalized as 0<radix><digits>, radix lowercased.
                        return Ok(Some(Token::Number(format!("0{radix}{s2}"))));
                    }

                    // Optional fractional part.
                    if let Some('.') = self.peek() {
                        s.push('.');
                        self.next();
                    }
                    s += &self.peeking_take_while(|ch| ch.is_ascii_digit());

                    // A lone '.' with no digits on either side is the period
                    // operator, not a number.
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Optional exponent, normalized to a lowercase 'e'; only
                    // a '-' sign is accepted here.
                    match self.peek() {
                        Some('e') | Some('E') => {
                            s.push('e');
                            self.next();

                            if let Some('-') = self.peek() {
                                s.push('-');
                                self.next();
                            }
                            s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
                            self.reject_number_junk()?;
                            return Ok(Some(Token::Number(s)));
                        }
                        _ => {}
                    };
                    self.reject_number_junk()?;
                    Ok(Some(Token::Number(s)))
                }
                // Punctuation and operators. Multi-character operators peek
                // ahead greedily, so e.g. "->>" wins over "->" and "-".
                '(' => self.consume_and_return(Token::LParen),
                ')' => self.consume_and_return(Token::RParen),
                ',' => self.consume_and_return(Token::Comma),
                '-' => {
                    self.next();
                    match self.peek() {
                        Some('-') => {
                            // "--" starts a single-line comment.
                            self.next();
                            let comment = self.tokenize_single_line_comment();
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        Some('>') => {
                            self.next();
                            match self.peek() {
                                Some('>') => {
                                    self.next();
                                    Ok(Some(Token::LongArrow))
                                }
                                _ => Ok(Some(Token::Arrow)),
                            }
                        }
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    self.next();
                    match self.peek() {
                        Some('*') => {
                            // "/*" starts a (possibly nested) multi-line comment.
                            self.next();
                            self.tokenize_multiline_comment()
                        }
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(Token::Plus),
                '*' => self.consume_and_return(Token::Mul),
                '%' => self.consume_and_return(Token::Mod),
                '|' => {
                    self.next();
                    match self.peek() {
                        Some('/') => self.consume_and_return(Token::PGSquareRoot),
                        Some('|') => {
                            self.next();
                            match self.peek() {
                                Some('/') => self.consume_and_return(Token::PGCubeRoot),
                                _ => Ok(Some(Token::Concat)),
                            }
                        }
                        _ => Ok(Some(Token::Pipe)),
                    }
                }
                '=' => {
                    self.next();
                    match self.peek() {
                        Some('>') => self.consume_and_return(Token::RArrow),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    self.next();
                    match self.peek() {
                        // "!=" is lexed as the same token as "<>".
                        Some('=') => self.consume_and_return(Token::Neq),
                        Some('!') => self.consume_and_return(Token::DoubleExclamationMark),
                        Some('~') => {
                            self.next();
                            match self.peek() {
                                Some('~') => {
                                    self.next();
                                    match self.peek() {
                                        Some('*') => self.consume_and_return(
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                Some('*') => {
                                    self.consume_and_return(Token::ExclamationMarkTildeAsterisk)
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    self.next();
                    match self.peek() {
                        Some('=') => {
                            self.next();
                            match self.peek() {
                                Some('>') => self.consume_and_return(Token::Spaceship),
                                _ => Ok(Some(Token::LtEq)),
                            }
                        }
                        Some('>') => self.consume_and_return(Token::Neq),
                        Some('<') => self.consume_and_return(Token::ShiftLeft),
                        Some('@') => self.consume_and_return(Token::ArrowAt),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    self.next();
                    match self.peek() {
                        Some('=') => self.consume_and_return(Token::GtEq),
                        Some('>') => self.consume_and_return(Token::ShiftRight),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    self.next();
                    match self.peek() {
                        Some(':') => self.consume_and_return(Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                // '$' starts either a parameter or a dollar-quoted string.
                '$' => Ok(Some(self.tokenize_dollar_preceded_value()?)),
                ';' => self.consume_and_return(Token::SemiColon),
                '\\' => self.consume_and_return(Token::Backslash),
                '[' => self.consume_and_return(Token::LBracket),
                ']' => self.consume_and_return(Token::RBracket),
                '&' => self.consume_and_return(Token::Ampersand),
                '^' => {
                    self.next();
                    match self.peek() {
                        Some('@') => self.consume_and_return(Token::Prefix),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(Token::LBrace),
                '}' => self.consume_and_return(Token::RBrace),
                '~' => {
                    self.next();
                    match self.peek() {
                        Some('~') => {
                            self.next();
                            match self.peek() {
                                Some('*') => self.consume_and_return(Token::DoubleTildeAsterisk),
                                _ => Ok(Some(Token::DoubleTilde)),
                            }
                        }
                        Some('*') => self.consume_and_return(Token::TildeAsterisk),
                        _ => Ok(Some(Token::Tilde)),
                    }
                }
                '#' => {
                    self.next();
                    match self.peek() {
                        Some('-') => self.consume_and_return(Token::HashMinus),
                        Some('>') => {
                            self.next();
                            match self.peek() {
                                Some('>') => {
                                    self.next();
                                    Ok(Some(Token::HashLongArrow))
                                }
                                _ => Ok(Some(Token::HashArrow)),
                            }
                        }
                        _ => Ok(Some(Token::Sharp)),
                    }
                }
                '@' => {
                    self.next();
                    match self.peek() {
                        Some('>') => self.consume_and_return(Token::AtArrow),
                        Some('?') => self.consume_and_return(Token::AtQuestionMark),
                        Some('@') => self.consume_and_return(Token::AtAt),
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                '?' => {
                    self.next();
                    match self.peek() {
                        Some('|') => self.consume_and_return(Token::QuestionMarkPipe),
                        Some('&') => self.consume_and_return(Token::QuestionMarkAmpersand),
                        _ => Ok(Some(Token::QuestionMark)),
                    }
                }
                // Anything unrecognized is passed through as a Char token.
                other => self.consume_and_return(Token::Char(other)),
            },
            None => Ok(None),
        }
    }
845
    /// Lex a token that starts with `$` (the `$` has not been consumed yet):
    /// either a dollar-quoted string (`$$…$$` or `$tag$…$tag$`) or a
    /// parameter placeholder (`$name`).
    fn tokenize_dollar_preceded_value(&mut self) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        self.next(); // consume the leading '$'

        if let Some('$') = self.peek() {
            // Tagless form: $$ … $$
            self.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(ch) = self.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        // "$$" closes the string.
                        self.next();
                        is_terminated = true;
                        break;
                    } else {
                        // The pending '$' was content after all.
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                self.next();
            }

            return if self.peek().is_none() && !is_terminated {
                self.error("Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Read the identifier after '$': either a parameter name or a
            // dollar-quote tag, depending on what follows.
            value.push_str(&self.peeking_take_while(|ch| ch.is_alphanumeric() || ch == '_'));

            if let Some('$') = self.peek() {
                // Tagged form: $tag$ … $tag$ — collect content up to the
                // next '$', then the closing tag must match `value` exactly.
                self.next();
                s.push_str(&self.peeking_take_while(|ch| ch != '$'));

                match self.peek() {
                    Some('$') => {
                        self.next();
                        for c in value.chars() {
                            let next_char = self.next();
                            if Some(c) != next_char {
                                return self.error(format!(
                                    "Unterminated dollar-quoted string at or near \"{}\"",
                                    value
                                ));
                            }
                        }

                        // The tag must be followed by its closing '$'.
                        if let Some('$') = self.peek() {
                            self.next();
                        } else {
                            return self.error("Unterminated dollar-quoted string, expected $");
                        }
                    }
                    _ => {
                        return self.error("Unterminated dollar-quoted, expected $");
                    }
                }
            } else {
                // No second '$': a plain parameter like $1.
                return Ok(Token::Parameter(value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }
925
926 fn error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
927 let prefix = format!("LINE {}: ", self.line);
928 let sql_line = self.sql.split('\n').nth(self.line as usize - 1).unwrap();
929 let cursor = " ".repeat(prefix.len() + self.col as usize - 1);
930 let context = format!("{}{}\n{}^", prefix, sql_line, cursor);
931 Err(TokenizerError {
932 message: message.into(),
933 col: self.col,
934 line: self.line,
935 context,
936 })
937 }
938
939 fn reject_number_junk(&mut self) -> Result<(), TokenizerError> {
940 if let Some(ch) = self.peek()
941 && is_identifier_start(ch)
942 {
943 return self.error("trailing junk after numeric literal");
944 }
945 Ok(())
946 }
947
948 fn tokenize_single_line_comment(&mut self) -> String {
950 let mut comment = self.peeking_take_while(|ch| ch != '\n');
951 if let Some(ch) = self.next() {
952 assert_eq!(ch, '\n');
953 comment.push(ch);
954 }
955 comment
956 }
957
958 fn tokenize_word(&mut self, first_char: char) -> String {
960 let mut s = first_char.to_string();
961 s.push_str(&self.peeking_take_while(is_identifier_part));
962 s
963 }
964
    /// Read the body of a '…' string literal (opening quote not yet
    /// consumed). A doubled quote `''` becomes a single `'` in the result;
    /// backslashes are kept verbatim.
    fn tokenize_single_quoted_string(&mut self) -> Result<String, TokenizerError> {
        let mut s = String::new();
        self.next(); // consume the opening quote
        // NOTE(review): `is_escaped` is never set to true anywhere in this
        // function, so the escaped-quote branch below looks unreachable —
        // confirm whether backslash-escaping was meant to be supported here.
        let mut is_escaped = false;
        while let Some(ch) = self.peek() {
            match ch {
                '\'' => {
                    self.next(); // consume the quote
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if self.peek().map(|c| c == '\'').unwrap_or(false) {
                        // '' — escaped quote; keep one quote and skip the other.
                        s.push(ch);
                        self.next();
                    } else {
                        // Closing quote: the literal ends here.
                        return Ok(s);
                    }
                }
                '\\' => {
                    // Backslash is ordinary content in a plain quoted string.
                    s.push(ch);
                    self.next();
                }
                _ => {
                    self.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.error("Unterminated string literal")
    }
998
    /// Read the body of an `E'…'` string (opening quote not yet consumed),
    /// returning both the raw text and its C-style-unescaped value.
    fn tokenize_single_quoted_string_with_escape(
        &mut self,
    ) -> Result<CstyleEscapedString, TokenizerError> {
        let mut terminated = false;
        let mut s = String::new();
        self.next(); // consume the opening quote
        while let Some(ch) = self.peek() {
            match ch {
                '\'' => {
                    self.next(); // consume the quote
                    if self.peek().map(|c| c == '\'').unwrap_or(false) {
                        // '' inside the literal: store it as \' in the raw
                        // text so unescaping yields a single quote.
                        s.push('\\');
                        s.push(ch);
                        self.next();
                    } else {
                        // Closing quote.
                        terminated = true;
                        break;
                    }
                }
                '\\' => {
                    // Keep escape sequences intact; for \' and \\ also grab
                    // the escaped character so it is not re-interpreted.
                    s.push(ch);
                    self.next();
                    if self.peek().map(|c| c == '\'' || c == '\\').unwrap_or(false) {
                        s.push(self.next().unwrap());
                    }
                }
                _ => {
                    self.next(); // consume
                    s.push(ch);
                }
            }
        }

        if !terminated {
            return self.error("Unterminated string literal");
        }

        // Decode the raw text; any unescape failure is reported as a
        // tokenizer error at the current position.
        let unescaped = match Self::unescape_c_style(&s) {
            Ok(unescaped) => unescaped,
            Err(e) => return self.error(e),
        };

        Ok(CstyleEscapedString {
            value: unescaped,
            raw: s,
        })
    }
1048
1049 fn unescape_c_style(s: &str) -> Result<String, String> {
1055 fn hex_byte_process(
1056 chars: &mut Peekable<Chars<'_>>,
1057 res: &mut String,
1058 len: usize,
1059 default_char: char,
1060 ) -> Result<(), String> {
1061 let mut unicode_seq: String = String::with_capacity(len);
1062 for _ in 0..len {
1063 if let Some(c) = chars.peek()
1064 && c.is_ascii_hexdigit()
1065 {
1066 unicode_seq.push(chars.next().unwrap());
1067 } else {
1068 break;
1069 }
1070 }
1071
1072 if unicode_seq.is_empty() && len == 2 {
1073 res.push(default_char);
1074 return Ok(());
1075 } else if unicode_seq.len() < len && len != 2 {
1076 return Err("invalid unicode sequence: must be \\uXXXX or \\UXXXXXXXX".to_owned());
1077 }
1078
1079 if len == 2 {
1080 let number = [u8::from_str_radix(&unicode_seq, 16)
1081 .map_err(|e| format!("invalid unicode sequence: {}", e))?];
1082
1083 res.push(
1084 std::str::from_utf8(&number)
1085 .map_err(|err| format!("invalid unicode sequence: {}", err))?
1086 .chars()
1087 .next()
1088 .unwrap(),
1089 );
1090 } else {
1091 let number = u32::from_str_radix(&unicode_seq, 16)
1092 .map_err(|e| format!("invalid unicode sequence: {}", e))?;
1093 res.push(
1094 char::from_u32(number)
1095 .ok_or_else(|| format!("invalid unicode sequence: {}", unicode_seq))?,
1096 );
1097 }
1098 Ok(())
1099 }
1100
1101 fn octal_byte_process(
1102 chars: &mut Peekable<Chars<'_>>,
1103 res: &mut String,
1104 digit: char,
1105 ) -> Result<(), String> {
1106 let mut unicode_seq: String = String::with_capacity(3);
1107 unicode_seq.push(digit);
1108 for _ in 0..2 {
1109 if let Some(c) = chars.peek()
1110 && matches!(*c, '0'..='7')
1111 {
1112 unicode_seq.push(chars.next().unwrap());
1113 } else {
1114 break;
1115 }
1116 }
1117
1118 let number = [u8::from_str_radix(&unicode_seq, 8)
1119 .map_err(|e| format!("invalid unicode sequence: {}", e))?];
1120
1121 res.push(
1122 std::str::from_utf8(&number)
1123 .map_err(|err| format!("invalid unicode sequence: {}", err))?
1124 .chars()
1125 .next()
1126 .unwrap(),
1127 );
1128 Ok(())
1129 }
1130
1131 let mut chars = s.chars().peekable();
1132 let mut res = String::with_capacity(s.len());
1133
1134 while let Some(c) = chars.next() {
1135 if c == '\\' {
1136 match chars.next() {
1137 None => {
1138 return Err("unterminated escape sequence".to_owned());
1139 }
1140 Some(next_c) => match next_c {
1141 'b' => res.push('\u{08}'),
1142 'f' => res.push('\u{0C}'),
1143 'n' => res.push('\n'),
1144 'r' => res.push('\r'),
1145 't' => res.push('\t'),
1146 'x' => hex_byte_process(&mut chars, &mut res, 2, 'x')?,
1147 'u' => hex_byte_process(&mut chars, &mut res, 4, 'u')?,
1148 'U' => hex_byte_process(&mut chars, &mut res, 8, 'U')?,
1149 digit @ '0'..='7' => octal_byte_process(&mut chars, &mut res, digit)?,
1150 _ => res.push(next_c),
1151 },
1152 }
1153 } else {
1154 res.push(c);
1155 }
1156 }
1157
1158 Ok(res)
1159 }
1160
    /// Consume the body of a `/* … */` comment (the opening `/*` has already
    /// been consumed), honoring nested comments.
    fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();

        // Nesting depth: the comment we are already inside counts as 1.
        let mut nested = 1;
        let mut last_ch = ' ';

        loop {
            match self.next() {
                Some(ch) => {
                    if last_ch == '/' && ch == '*' {
                        nested += 1;
                    } else if last_ch == '*' && ch == '/' {
                        nested -= 1;
                        if nested == 0 {
                            // Drop the '*' of the closing "*/" that was
                            // already pushed on the previous iteration.
                            s.pop();
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        }
                    }
                    s.push(ch);
                    last_ch = ch;
                }
                None => break self.error("Unexpected EOF while in a multi-line comment"),
            }
        }
    }
1186
    /// Consume exactly one character and emit `t` as the resulting token
    /// (helper for single- and fixed-length operator tokens).
    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
        self.next();
        Ok(Some(t))
    }
1192
1193 fn peeking_take_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> String {
1197 let mut s = String::new();
1198 while let Some(ch) = self.peek() {
1199 if predicate(ch) {
1200 self.next(); s.push(ch);
1202 } else {
1203 break;
1204 }
1205 }
1206 s
1207 }
1208}
1209
/// True when `ch` can open a delimited (quoted) identifier; only the double
/// quote is accepted here.
fn is_delimited_identifier_start(ch: char) -> bool {
    matches!(ch, '"')
}
1218
/// True when `ch` may begin an unquoted identifier: an ASCII letter or `_`.
fn is_identifier_start(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
}
1226
/// True when `ch` may appear after the first character of an unquoted
/// identifier: an ASCII alphanumeric, `$`, or `_`.
fn is_identifier_part(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '$' | '_')
}
1231
1232#[cfg(test)]
1233mod tests {
1234 use super::*;
1235
1236 #[test]
1237 fn tokenizer_error_impl() {
1238 let err = TokenizerError {
1239 message: "test".into(),
1240 line: 1,
1241 col: 1,
1242 context: "LINE 1:".to_owned(),
1243 };
1244 #[cfg(feature = "std")]
1245 {
1246 use std::error::Error;
1247 assert!(err.source().is_none());
1248 }
1249 assert_eq!(err.to_string(), "test at line 1, column 1\nLINE 1:");
1250 }
1251
1252 #[test]
1253 fn tokenize_select_1() {
1254 let sql = String::from("SELECT 1");
1255 let mut tokenizer = Tokenizer::new(&sql);
1256 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1257
1258 let expected = vec![
1259 Token::make_keyword("SELECT"),
1260 Token::Whitespace(Whitespace::Space),
1261 Token::Number(String::from("1")),
1262 ];
1263
1264 compare(expected, tokens);
1265 }
1266
1267 #[test]
1268 fn tokenize_select_float() {
1269 let sql = String::from("SELECT .1");
1270 let mut tokenizer = Tokenizer::new(&sql);
1271 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1272
1273 let expected = vec![
1274 Token::make_keyword("SELECT"),
1275 Token::Whitespace(Whitespace::Space),
1276 Token::Number(String::from(".1")),
1277 ];
1278
1279 compare(expected, tokens);
1280 }
1281
1282 #[test]
1283 fn tokenize_scalar_function() {
1284 let sql = String::from("SELECT sqrt(1)");
1285 let mut tokenizer = Tokenizer::new(&sql);
1286 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1287
1288 let expected = vec![
1289 Token::make_keyword("SELECT"),
1290 Token::Whitespace(Whitespace::Space),
1291 Token::make_word("sqrt", None),
1292 Token::LParen,
1293 Token::Number(String::from("1")),
1294 Token::RParen,
1295 ];
1296
1297 compare(expected, tokens);
1298 }
1299
1300 #[test]
1301 fn tokenize_string_string_concat() {
1302 let sql = String::from("SELECT 'a' || 'b'");
1303 let mut tokenizer = Tokenizer::new(&sql);
1304 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1305
1306 let expected = vec![
1307 Token::make_keyword("SELECT"),
1308 Token::Whitespace(Whitespace::Space),
1309 Token::SingleQuotedString(String::from("a")),
1310 Token::Whitespace(Whitespace::Space),
1311 Token::Concat,
1312 Token::Whitespace(Whitespace::Space),
1313 Token::SingleQuotedString(String::from("b")),
1314 ];
1315
1316 compare(expected, tokens);
1317 }
1318
1319 #[test]
1320 fn tokenize_bitwise_op() {
1321 let sql = String::from("SELECT one | two ^ three");
1322 let mut tokenizer = Tokenizer::new(&sql);
1323 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1324
1325 let expected = vec![
1326 Token::make_keyword("SELECT"),
1327 Token::Whitespace(Whitespace::Space),
1328 Token::make_word("one", None),
1329 Token::Whitespace(Whitespace::Space),
1330 Token::Pipe,
1331 Token::Whitespace(Whitespace::Space),
1332 Token::make_word("two", None),
1333 Token::Whitespace(Whitespace::Space),
1334 Token::Caret,
1335 Token::Whitespace(Whitespace::Space),
1336 Token::make_word("three", None),
1337 ];
1338 compare(expected, tokens);
1339 }
1340
1341 #[test]
1342 fn tokenize_logical_xor() {
1343 let sql =
1344 String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
1345 let mut tokenizer = Tokenizer::new(&sql);
1346 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1347
1348 let expected = vec![
1349 Token::make_keyword("SELECT"),
1350 Token::Whitespace(Whitespace::Space),
1351 Token::make_keyword("true"),
1352 Token::Whitespace(Whitespace::Space),
1353 Token::make_keyword("XOR"),
1354 Token::Whitespace(Whitespace::Space),
1355 Token::make_keyword("true"),
1356 Token::Comma,
1357 Token::Whitespace(Whitespace::Space),
1358 Token::make_keyword("false"),
1359 Token::Whitespace(Whitespace::Space),
1360 Token::make_keyword("XOR"),
1361 Token::Whitespace(Whitespace::Space),
1362 Token::make_keyword("false"),
1363 Token::Comma,
1364 Token::Whitespace(Whitespace::Space),
1365 Token::make_keyword("true"),
1366 Token::Whitespace(Whitespace::Space),
1367 Token::make_keyword("XOR"),
1368 Token::Whitespace(Whitespace::Space),
1369 Token::make_keyword("false"),
1370 Token::Comma,
1371 Token::Whitespace(Whitespace::Space),
1372 Token::make_keyword("false"),
1373 Token::Whitespace(Whitespace::Space),
1374 Token::make_keyword("XOR"),
1375 Token::Whitespace(Whitespace::Space),
1376 Token::make_keyword("true"),
1377 ];
1378 compare(expected, tokens);
1379 }
1380
1381 #[test]
1382 fn tokenize_simple_select() {
1383 let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
1384 let mut tokenizer = Tokenizer::new(&sql);
1385 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1386
1387 let expected = vec![
1388 Token::make_keyword("SELECT"),
1389 Token::Whitespace(Whitespace::Space),
1390 Token::Mul,
1391 Token::Whitespace(Whitespace::Space),
1392 Token::make_keyword("FROM"),
1393 Token::Whitespace(Whitespace::Space),
1394 Token::make_word("customer", None),
1395 Token::Whitespace(Whitespace::Space),
1396 Token::make_keyword("WHERE"),
1397 Token::Whitespace(Whitespace::Space),
1398 Token::make_word("id", None),
1399 Token::Whitespace(Whitespace::Space),
1400 Token::Eq,
1401 Token::Whitespace(Whitespace::Space),
1402 Token::Number(String::from("1")),
1403 Token::Whitespace(Whitespace::Space),
1404 Token::make_keyword("LIMIT"),
1405 Token::Whitespace(Whitespace::Space),
1406 Token::Number(String::from("5")),
1407 ];
1408
1409 compare(expected, tokens);
1410 }
1411
1412 #[test]
1413 fn tokenize_explain_select() {
1414 let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
1415 let mut tokenizer = Tokenizer::new(&sql);
1416 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1417
1418 let expected = vec![
1419 Token::make_keyword("EXPLAIN"),
1420 Token::Whitespace(Whitespace::Space),
1421 Token::make_keyword("SELECT"),
1422 Token::Whitespace(Whitespace::Space),
1423 Token::Mul,
1424 Token::Whitespace(Whitespace::Space),
1425 Token::make_keyword("FROM"),
1426 Token::Whitespace(Whitespace::Space),
1427 Token::make_word("customer", None),
1428 Token::Whitespace(Whitespace::Space),
1429 Token::make_keyword("WHERE"),
1430 Token::Whitespace(Whitespace::Space),
1431 Token::make_word("id", None),
1432 Token::Whitespace(Whitespace::Space),
1433 Token::Eq,
1434 Token::Whitespace(Whitespace::Space),
1435 Token::Number(String::from("1")),
1436 ];
1437
1438 compare(expected, tokens);
1439 }
1440
1441 #[test]
1442 fn tokenize_explain_analyze_select() {
1443 let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
1444 let mut tokenizer = Tokenizer::new(&sql);
1445 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1446
1447 let expected = vec![
1448 Token::make_keyword("EXPLAIN"),
1449 Token::Whitespace(Whitespace::Space),
1450 Token::make_keyword("ANALYZE"),
1451 Token::Whitespace(Whitespace::Space),
1452 Token::make_keyword("SELECT"),
1453 Token::Whitespace(Whitespace::Space),
1454 Token::Mul,
1455 Token::Whitespace(Whitespace::Space),
1456 Token::make_keyword("FROM"),
1457 Token::Whitespace(Whitespace::Space),
1458 Token::make_word("customer", None),
1459 Token::Whitespace(Whitespace::Space),
1460 Token::make_keyword("WHERE"),
1461 Token::Whitespace(Whitespace::Space),
1462 Token::make_word("id", None),
1463 Token::Whitespace(Whitespace::Space),
1464 Token::Eq,
1465 Token::Whitespace(Whitespace::Space),
1466 Token::Number(String::from("1")),
1467 ];
1468
1469 compare(expected, tokens);
1470 }
1471
1472 #[test]
1473 fn tokenize_string_predicate() {
1474 let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
1475 let mut tokenizer = Tokenizer::new(&sql);
1476 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1477
1478 let expected = vec![
1479 Token::make_keyword("SELECT"),
1480 Token::Whitespace(Whitespace::Space),
1481 Token::Mul,
1482 Token::Whitespace(Whitespace::Space),
1483 Token::make_keyword("FROM"),
1484 Token::Whitespace(Whitespace::Space),
1485 Token::make_word("customer", None),
1486 Token::Whitespace(Whitespace::Space),
1487 Token::make_keyword("WHERE"),
1488 Token::Whitespace(Whitespace::Space),
1489 Token::make_word("salary", None),
1490 Token::Whitespace(Whitespace::Space),
1491 Token::Neq,
1492 Token::Whitespace(Whitespace::Space),
1493 Token::SingleQuotedString(String::from("Not Provided")),
1494 ];
1495
1496 compare(expected, tokens);
1497 }
1498
1499 #[test]
1500 fn tokenize_invalid_string() {
1501 let sql = String::from("\nمصطفىh");
1502 let mut tokenizer = Tokenizer::new(&sql);
1503 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1504 let expected = vec![
1506 Token::Whitespace(Whitespace::Newline),
1507 Token::Char('م'),
1508 Token::Char('ص'),
1509 Token::Char('ط'),
1510 Token::Char('ف'),
1511 Token::Char('ى'),
1512 Token::make_word("h", None),
1513 ];
1514 compare(expected, tokens);
1515 }
1516
1517 #[test]
1518 fn tokenize_newline_in_string_literal() {
1519 let sql = String::from("'foo\r\nbar\nbaz'");
1520 let mut tokenizer = Tokenizer::new(&sql);
1521 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1522 let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_owned())];
1523 compare(expected, tokens);
1524 }
1525
    #[test]
    fn tokenize_unterminated_string_literal() {
        // A single-quoted literal that never closes must fail with an error
        // pointing one past the end of the input (col 12 here), and the
        // context line's caret must align under that column.
        let sql = String::from("select 'foo");
        let mut tokenizer = Tokenizer::new(&sql);
        assert_eq!(
            tokenizer.tokenize_with_whitespace(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_owned(),
                line: 1,
                col: 12,
                context: "LINE 1: select 'foo\n                   ^".to_owned(),
            })
        );
    }
1540
1541 #[test]
1542 fn tokenize_invalid_string_cols() {
1543 let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
1544 let mut tokenizer = Tokenizer::new(&sql);
1545 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1546 let expected = vec![
1548 Token::Whitespace(Whitespace::Newline),
1549 Token::Whitespace(Whitespace::Newline),
1550 Token::make_keyword("SELECT"),
1551 Token::Whitespace(Whitespace::Space),
1552 Token::Mul,
1553 Token::Whitespace(Whitespace::Space),
1554 Token::make_keyword("FROM"),
1555 Token::Whitespace(Whitespace::Space),
1556 Token::make_keyword("table"),
1557 Token::Whitespace(Whitespace::Tab),
1558 Token::Char('م'),
1559 Token::Char('ص'),
1560 Token::Char('ط'),
1561 Token::Char('ف'),
1562 Token::Char('ى'),
1563 Token::make_word("h", None),
1564 ];
1565 compare(expected, tokens);
1566 }
1567
1568 #[test]
1569 fn tokenize_right_arrow() {
1570 let sql = String::from("FUNCTION(key=>value)");
1571 let mut tokenizer = Tokenizer::new(&sql);
1572 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1573 let expected = vec![
1574 Token::make_word("FUNCTION", None),
1575 Token::LParen,
1576 Token::make_word("key", None),
1577 Token::RArrow,
1578 Token::make_word("value", None),
1579 Token::RParen,
1580 ];
1581 compare(expected, tokens);
1582 }
1583
1584 #[test]
1585 fn tokenize_is_null() {
1586 let sql = String::from("a IS NULL");
1587 let mut tokenizer = Tokenizer::new(&sql);
1588 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1589
1590 let expected = vec![
1591 Token::make_word("a", None),
1592 Token::Whitespace(Whitespace::Space),
1593 Token::make_keyword("IS"),
1594 Token::Whitespace(Whitespace::Space),
1595 Token::make_keyword("NULL"),
1596 ];
1597
1598 compare(expected, tokens);
1599 }
1600
1601 #[test]
1602 fn tokenize_comment() {
1603 let sql = String::from("0--this is a comment\n1");
1604 let mut tokenizer = Tokenizer::new(&sql);
1605 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1606 let expected = vec![
1607 Token::Number("0".to_owned()),
1608 Token::Whitespace(Whitespace::SingleLineComment {
1609 prefix: "--".to_owned(),
1610 comment: "this is a comment\n".to_owned(),
1611 }),
1612 Token::Number("1".to_owned()),
1613 ];
1614 compare(expected, tokens);
1615 }
1616
1617 #[test]
1618 fn tokenize_comment_at_eof() {
1619 let sql = String::from("--this is a comment");
1620 let mut tokenizer = Tokenizer::new(&sql);
1621 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1622 let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
1623 prefix: "--".to_owned(),
1624 comment: "this is a comment".to_owned(),
1625 })];
1626 compare(expected, tokens);
1627 }
1628
1629 #[test]
1630 fn tokenize_multiline_comment() {
1631 let sql = String::from("0/*multi-line\n* /comment*/1");
1632 let mut tokenizer = Tokenizer::new(&sql);
1633 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1634 let expected = vec![
1635 Token::Number("0".to_owned()),
1636 Token::Whitespace(Whitespace::MultiLineComment(
1637 "multi-line\n* /comment".to_owned(),
1638 )),
1639 Token::Number("1".to_owned()),
1640 ];
1641 compare(expected, tokens);
1642 }
1643
1644 #[test]
1645 fn tokenize_nested_multiline_comment() {
1646 let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
1647 let mut tokenizer = Tokenizer::new(&sql);
1648 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1649 let expected = vec![
1650 Token::Number("0".to_owned()),
1651 Token::Whitespace(Whitespace::MultiLineComment(
1652 "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_owned(),
1653 )),
1654 Token::Number("1".to_owned()),
1655 ];
1656 compare(expected, tokens);
1657 }
1658
1659 #[test]
1660 fn tokenize_multiline_comment_with_even_asterisks() {
1661 let sql = String::from("\n/** Comment **/\n");
1662 let mut tokenizer = Tokenizer::new(&sql);
1663 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1664 let expected = vec![
1665 Token::Whitespace(Whitespace::Newline),
1666 Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_owned())),
1667 Token::Whitespace(Whitespace::Newline),
1668 ];
1669 compare(expected, tokens);
1670 }
1671
    #[test]
    fn tokenize_mismatched_quotes() {
        // An unclosed double-quoted (delimited) identifier errors at EOF
        // (col 5, one past the 4-char input) with a caret-aligned context.
        let sql = String::from("\"foo");
        let mut tokenizer = Tokenizer::new(&sql);
        assert_eq!(
            tokenizer.tokenize_with_whitespace(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_owned(),
                line: 1,
                col: 5,
                context: "LINE 1: \"foo\n            ^".to_owned(),
            })
        );
    }
1686
1687 #[test]
1688 fn tokenize_newlines() {
1689 let sql = String::from("line1\nline2\rline3\r\nline4\r");
1690 let mut tokenizer = Tokenizer::new(&sql);
1691 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1692 let expected = vec![
1693 Token::make_word("line1", None),
1694 Token::Whitespace(Whitespace::Newline),
1695 Token::make_word("line2", None),
1696 Token::Whitespace(Whitespace::Newline),
1697 Token::make_word("line3", None),
1698 Token::Whitespace(Whitespace::Newline),
1699 Token::make_word("line4", None),
1700 Token::Whitespace(Whitespace::Newline),
1701 ];
1702 compare(expected, tokens);
1703 }
1704
1705 #[test]
1706 fn tokenize_pg_regex_match() {
1707 let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
1708 let mut tokenizer = Tokenizer::new(sql);
1709 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1710 let expected = vec![
1711 Token::make_keyword("SELECT"),
1712 Token::Whitespace(Whitespace::Space),
1713 Token::make_word("col", None),
1714 Token::Whitespace(Whitespace::Space),
1715 Token::Tilde,
1716 Token::Whitespace(Whitespace::Space),
1717 Token::SingleQuotedString("^a".into()),
1718 Token::Comma,
1719 Token::Whitespace(Whitespace::Space),
1720 Token::make_word("col", None),
1721 Token::Whitespace(Whitespace::Space),
1722 Token::TildeAsterisk,
1723 Token::Whitespace(Whitespace::Space),
1724 Token::SingleQuotedString("^a".into()),
1725 Token::Comma,
1726 Token::Whitespace(Whitespace::Space),
1727 Token::make_word("col", None),
1728 Token::Whitespace(Whitespace::Space),
1729 Token::ExclamationMarkTilde,
1730 Token::Whitespace(Whitespace::Space),
1731 Token::SingleQuotedString("^a".into()),
1732 Token::Comma,
1733 Token::Whitespace(Whitespace::Space),
1734 Token::make_word("col", None),
1735 Token::Whitespace(Whitespace::Space),
1736 Token::ExclamationMarkTildeAsterisk,
1737 Token::Whitespace(Whitespace::Space),
1738 Token::SingleQuotedString("^a".into()),
1739 ];
1740 compare(expected, tokens);
1741 }
1742
1743 #[test]
1744 fn tokenize_select_array() {
1745 let sql = String::from("SELECT '{1, 2, 3}'");
1746 let mut tokenizer = Tokenizer::new(&sql);
1747 let tokens = tokenizer.tokenize_with_whitespace().unwrap();
1748
1749 let expected = vec![
1750 Token::make_keyword("SELECT"),
1751 Token::Whitespace(Whitespace::Space),
1752 Token::SingleQuotedString(String::from("{1, 2, 3}")),
1753 ];
1754
1755 compare(expected, tokens);
1756 }
1757
    /// Asserts that the tokenizer output matches the expected token stream
    /// exactly (same tokens, same order, same length).
    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        assert_eq!(expected, actual);
    }
1765}