1use std::fmt;
20use std::fmt::Debug;
21use std::iter::Peekable;
22use std::str::Chars;
23
24use crate::ast::{CstyleEscapedString, DollarQuotedString};
25use crate::keywords::{ALL_KEYWORDS, ALL_KEYWORDS_INDEX, Keyword};
26
/// A lexical token produced by the tokenizer.
///
/// The `Display` implementation renders each token back to SQL text.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    /// End-of-input marker; displayed as `EOF`.
    EOF,
    /// A keyword or identifier, optionally quoted (see [`Word`]).
    Word(Word),
    /// A numeric literal, stored as text.
    Number(String),
    /// Any single character that no other tokenizer rule recognises.
    Char(char),
    /// The contents of a `'...'` string literal, without the surrounding quotes.
    SingleQuotedString(String),
    /// A dollar-quoted string such as `$$...$$` or `$tag$...$tag$`.
    DollarQuotedString(DollarQuotedString),
    /// A C-style escaped string literal, written `E'...'`.
    CstyleEscapesString(CstyleEscapedString),
    /// A national string literal, written `N'...'`.
    NationalStringLiteral(String),
    /// A hexadecimal string literal, written `X'...'`.
    HexStringLiteral(String),
    /// A `$`-prefixed parameter; the payload excludes the leading `$`.
    Parameter(String),
    /// `,`
    Comma,
    /// Whitespace or a comment (see [`Whitespace`]).
    Whitespace(Whitespace),
    /// Any operator without a dedicated variant, stored as its source text (e.g. `||`).
    Op(String),
    /// `=`
    Eq,
    /// Not-equals; produced for both `<>` and `!=`, displayed as `<>`.
    Neq,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `<=`
    LtEq,
    /// `>=`
    GtEq,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Mul,
    /// `/`
    Div,
    /// `%`
    Mod,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `.`
    Period,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `;`
    SemiColon,
    /// `\`
    Backslash,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `|`
    Pipe,
    /// `^`
    Caret,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `=>`
    RArrow,
}
107
108impl fmt::Display for Token {
109 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
110 match self {
111 Token::EOF => f.write_str("EOF"),
112 Token::Word(w) => write!(f, "{}", w),
113 Token::Number(n) => write!(f, "{}", n),
114 Token::Char(c) => write!(f, "{}", c),
115 Token::SingleQuotedString(s) => write!(f, "'{}'", s),
116 Token::DollarQuotedString(s) => write!(f, "{}", s),
117 Token::NationalStringLiteral(s) => write!(f, "N'{}'", s),
118 Token::HexStringLiteral(s) => write!(f, "X'{}'", s),
119 Token::CstyleEscapesString(s) => write!(f, "E'{}'", s),
120 Token::Parameter(s) => write!(f, "${}", s),
121 Token::Comma => f.write_str(","),
122 Token::Whitespace(ws) => write!(f, "{}", ws),
123 Token::Op(op) => write!(f, "{}", op),
124 Token::Eq => f.write_str("="),
125 Token::Neq => f.write_str("<>"),
126 Token::Lt => f.write_str("<"),
127 Token::Gt => f.write_str(">"),
128 Token::LtEq => f.write_str("<="),
129 Token::GtEq => f.write_str(">="),
130 Token::Plus => f.write_str("+"),
131 Token::Minus => f.write_str("-"),
132 Token::Mul => f.write_str("*"),
133 Token::Div => f.write_str("/"),
134 Token::Mod => f.write_str("%"),
135 Token::LParen => f.write_str("("),
136 Token::RParen => f.write_str(")"),
137 Token::Period => f.write_str("."),
138 Token::Colon => f.write_str(":"),
139 Token::DoubleColon => f.write_str("::"),
140 Token::SemiColon => f.write_str(";"),
141 Token::Backslash => f.write_str("\\"),
142 Token::LBracket => f.write_str("["),
143 Token::RBracket => f.write_str("]"),
144 Token::Caret => f.write_str("^"),
145 Token::Pipe => f.write_str("|"),
146 Token::LBrace => f.write_str("{"),
147 Token::RBrace => f.write_str("}"),
148 Token::RArrow => f.write_str("=>"),
149 }
150 }
151}
152
153impl Token {
154 pub fn make_keyword(keyword: &str) -> Self {
155 Token::make_word(keyword, None)
156 }
157
158 pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
159 let word_uppercase = word.to_uppercase();
160 Token::Word(Word {
161 value: word.to_owned(),
162 quote_style,
163 keyword: if quote_style.is_none() {
164 let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
165 keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
166 } else {
167 Keyword::NoKeyword
168 },
169 })
170 }
171
172 pub fn with_location(self, location: Location) -> TokenWithLocation {
173 TokenWithLocation::new(self, location.line, location.column)
174 }
175}
176
/// A keyword or an (optionally quoted) identifier.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Word {
    /// The text of the word, without any surrounding quote characters.
    pub value: String,
    /// The opening quote character, or `None` for an unquoted word.
    /// `Display` handles `"`, `[` and `` ` `` and panics on anything else.
    pub quote_style: Option<char>,
    /// The keyword this word matches; `Token::make_word` sets
    /// `Keyword::NoKeyword` for quoted words and for non-keywords.
    pub keyword: Keyword,
}
191
192impl fmt::Display for Word {
193 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
194 match self.quote_style {
195 Some(s) if s == '[' || s == '`' => {
196 write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
197 }
198 Some('"') => write!(f, "\"{}\"", self.value.replace('"', "\"\"")),
199 None => f.write_str(&self.value),
200 _ => panic!("Unexpected quote_style!"),
201 }
202 }
203}
204
205impl Word {
206 fn matching_end_quote(ch: char) -> char {
207 match ch {
208 '"' => '"', '[' => ']', '`' => '`', _ => panic!("unexpected quoting style!"),
212 }
213 }
214}
215
/// Whitespace and comments, which the tokenizer emits as tokens of their own.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Whitespace {
    /// A single space character.
    Space,
    /// A line break; displayed as `\n`.
    Newline,
    /// A tab character.
    Tab,
    /// A single-line comment: `prefix` is the comment introducer (e.g. `--`)
    /// and `comment` is the text that follows it.
    SingleLineComment { comment: String, prefix: String },
    /// The body of a `/* ... */` comment, without the outer delimiters.
    MultiLineComment(String),
}
224
225impl fmt::Display for Whitespace {
226 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
227 match self {
228 Whitespace::Space => f.write_str(" "),
229 Whitespace::Newline => f.write_str("\n"),
230 Whitespace::Tab => f.write_str("\t"),
231 Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
232 Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
233 }
234 }
235}
236
/// A position in the source text.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Location {
    /// Line number; the tokenizer starts counting at 1.
    pub line: u64,
    /// Column number; the tokenizer starts counting at 1.
    pub column: u64,
}
245
/// A [`Token`] together with the [`Location`] at which it starts.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    /// The token itself.
    pub token: Token,
    /// Where the token begins in the source text.
    pub location: Location,
}
252
253impl TokenWithLocation {
254 pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
255 TokenWithLocation {
256 token,
257 location: Location { line, column },
258 }
259 }
260
261 pub fn eof() -> TokenWithLocation {
262 TokenWithLocation::new(Token::EOF, 0, 0)
263 }
264}
265
266impl PartialEq<Token> for TokenWithLocation {
267 fn eq(&self, other: &Token) -> bool {
268 &self.token == other
269 }
270}
271
272impl PartialEq<TokenWithLocation> for Token {
273 fn eq(&self, other: &TokenWithLocation) -> bool {
274 self == &other.token
275 }
276}
277
278impl fmt::Display for TokenWithLocation {
279 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
280 if self.token == Token::EOF {
281 write!(f, "end of input")
282 } else {
283 write!(
284 f,
285 "{} at line {}, column {}",
286 self.token, self.location.line, self.location.column
287 )
288 }
289 }
290}
291
/// An error raised while tokenizing, with the position at which it occurred.
#[derive(Debug, PartialEq)]
pub struct TokenizerError {
    /// Description of what went wrong.
    pub message: String,
    /// Line the tokenizer was on when the error was raised.
    pub line: u64,
    /// Column the tokenizer was on when the error was raised.
    pub col: u64,
    /// Multi-line context: `Tokenizer::error` fills it with the offending
    /// source line followed by a `^` marker line.
    pub context: String,
}
300
301impl fmt::Display for TokenizerError {
302 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
303 write!(
304 f,
305 "{} at line {}, column {}\n{}",
306 self.message, self.line, self.col, self.context
307 )
308 }
309}
310
// Marks `TokenizerError` as a standard error type. The impl body is empty, so
// all `Error` methods keep their defaults (in particular, no `source`).
impl std::error::Error for TokenizerError {}
312
/// SQL tokenizer: turns a query string into a sequence of [`Token`]s.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The full query text; used to build the context shown in error messages.
    sql: &'a str,
    /// Cursor over the characters that have not been consumed yet.
    chars: Peekable<Chars<'a>>,
    /// Current line, starting at 1.
    line: u64,
    /// Current column, starting at 1.
    col: u64,
}
321
322impl<'a> Tokenizer<'a> {
323 pub fn new(query: &'a str) -> Self {
325 Self {
326 sql: query,
327 chars: query.chars().peekable(),
328 line: 1,
329 col: 1,
330 }
331 }
332
333 fn next(&mut self) -> Option<char> {
335 let ch = self.chars.next();
336 if let Some(ch) = ch {
337 match ch {
338 '\n' => {
339 self.line += 1;
340 self.col = 1;
341 }
342 '\t' => self.col += 4,
343 _ => self.col += 1,
344 }
345 }
346 ch
347 }
348
349 fn peek(&mut self) -> Option<char> {
351 self.chars.peek().cloned()
352 }
353
354 pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
358 let tokens = self.tokenize()?;
359 Ok(tokens
360 .into_iter()
361 .filter(|token| !matches!(&token.token, Token::Whitespace(_)))
362 .collect())
363 }
364
365 #[allow(dead_code)]
369 fn tokenize_with_whitespace(&mut self) -> Result<Vec<Token>, TokenizerError> {
370 let tokens = self.tokenize()?;
371 Ok(tokens.into_iter().map(|t| t.token).collect())
372 }
373
374 fn tokenize(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
378 let mut tokens = Vec::new();
379 while let Some(token) = self.next_token_with_location()? {
380 tokens.push(token);
381 }
382 Ok(tokens)
383 }
384
385 fn next_token_with_location(&mut self) -> Result<Option<TokenWithLocation>, TokenizerError> {
387 let loc = Location {
388 line: self.line,
389 column: self.col,
390 };
391 self.next_token()
392 .map(|t| t.map(|token| token.with_location(loc)))
393 }
394
395 fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
397 macro_rules! op_chars {
398 (all as_pat) => {
400 '+' | '-' | '*' | '/' | '<' | '>' | '=' | op_chars!(ext as_pat)
401 };
402 (ext $m:ident) => {
403 op_chars!($m '~' '!' '@' '#' '%' '^' '&' '|' '`' '?')
404 };
405 (as_arr $($c:literal)+) => {
406 [ $($c),+ ]
407 };
408 (as_pat $($c:literal)+) => {
409 $($c)|+
410 };
411 }
412
413 match self.peek() {
414 Some(ch) => match ch {
415 ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
416 '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
417 '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
418 '\r' => {
419 self.next();
421 if let Some('\n') = self.peek() {
422 self.next();
423 }
424 Ok(Some(Token::Whitespace(Whitespace::Newline)))
425 }
426 'N' => {
427 self.next(); match self.peek() {
429 Some('\'') => {
430 let s = self.tokenize_single_quoted_string()?;
432 Ok(Some(Token::NationalStringLiteral(s)))
433 }
434 _ => {
435 let s = self.tokenize_word('N');
437 Ok(Some(Token::make_word(&s, None)))
438 }
439 }
440 }
441 x @ 'e' | x @ 'E' => {
442 self.next(); match self.peek() {
444 Some('\'') => {
445 let s = self.tokenize_single_quoted_string_with_escape()?;
447 Ok(Some(Token::CstyleEscapesString(s)))
448 }
449 _ => {
450 let s = self.tokenize_word(x);
452 Ok(Some(Token::make_word(&s, None)))
453 }
454 }
455 }
456 x @ 'x' | x @ 'X' => {
459 self.next(); match self.peek() {
461 Some('\'') => {
462 let s = self.tokenize_single_quoted_string()?;
464 Ok(Some(Token::HexStringLiteral(s)))
465 }
466 _ => {
467 let s = self.tokenize_word(x);
469 Ok(Some(Token::make_word(&s, None)))
470 }
471 }
472 }
473 ch if is_identifier_start(ch) => {
475 self.next(); let s = self.tokenize_word(ch);
477
478 Ok(Some(Token::make_word(&s, None)))
479 }
480 '\'' => {
482 let s = self.tokenize_single_quoted_string()?;
483
484 Ok(Some(Token::SingleQuotedString(s)))
485 }
486 quote_start if is_delimited_identifier_start(quote_start) => {
488 let s = self.tokenize_delimited_identifier(quote_start)?;
489 Ok(Some(Token::make_word(&s, Some(quote_start))))
490 }
491 '0'..='9' | '.' => {
493 let mut s = self.peeking_take_while(|ch| ch.is_ascii_digit());
494
495 if s == "0"
497 && let Some(radix) = self.peek()
498 && "xob".contains(radix.to_ascii_lowercase())
499 {
500 self.next();
501 let radix = radix.to_ascii_lowercase();
502 let base = match radix {
503 'x' => 16,
504 'o' => 8,
505 'b' => 2,
506 _ => unreachable!(),
507 };
508 let s2 = self.peeking_take_while(|ch| ch.is_digit(base));
509 if s2.is_empty() {
510 return self.error("incomplete integer literal");
511 }
512 self.reject_number_junk()?;
513 return Ok(Some(Token::Number(format!("0{radix}{s2}"))));
514 }
515
516 if let Some('.') = self.peek() {
518 s.push('.');
519 self.next();
520 }
521 s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
522
523 if s == "." {
525 return Ok(Some(Token::Period));
526 }
527
528 match self.peek() {
529 Some('e') | Some('E') => {
531 s.push('e');
532 self.next();
533
534 if let Some('-') = self.peek() {
535 s.push('-');
536 self.next();
537 }
538 s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
539 self.reject_number_junk()?;
540 return Ok(Some(Token::Number(s)));
541 }
542 _ => {}
544 };
545 self.reject_number_junk()?;
546 Ok(Some(Token::Number(s)))
547 }
548 '(' => self.consume_and_return(Token::LParen),
550 ')' => self.consume_and_return(Token::RParen),
551 ',' => self.consume_and_return(Token::Comma),
552 ':' => {
553 self.next();
554 match self.peek() {
555 Some(':') => self.consume_and_return(Token::DoubleColon),
556 _ => Ok(Some(Token::Colon)),
557 }
558 }
559 '$' => Ok(Some(self.tokenize_dollar_preceded_value()?)),
560 ';' => self.consume_and_return(Token::SemiColon),
561 '\\' => self.consume_and_return(Token::Backslash),
562 '[' => self.consume_and_return(Token::LBracket),
563 ']' => self.consume_and_return(Token::RBracket),
564 '{' => self.consume_and_return(Token::LBrace),
565 '}' => self.consume_and_return(Token::RBrace),
566 op_chars!(all as_pat) => {
568 let mut trial = self.clone();
569 let op_taken = trial.peeking_take_while(|c| matches!(c, op_chars!(all as_pat)));
570 let slash_star = op_taken.find("/*");
575 let dash_dash = op_taken.find("--");
576 let pos = match (slash_star, dash_dash) {
577 (Some(s), Some(d)) => s.min(d),
578 (Some(s), None) => s,
579 (None, Some(d)) => d,
580 (None, None) => op_taken.len(),
581 };
582 let mut op = &op_taken[..pos];
583 if op.is_empty() {
584 match self.next() {
585 Some('-') => {
586 self.next(); let comment = self.tokenize_single_line_comment();
588
589 return Ok(Some(Token::Whitespace(
590 Whitespace::SingleLineComment {
591 prefix: "--".to_owned(),
592 comment,
593 },
594 )));
595 }
596 Some('/') => {
597 self.next(); return self.tokenize_multiline_comment();
599 }
600 _ => unreachable!(),
601 }
602 };
603 if op.len() > 1
604 && op.ends_with(['+', '-'])
605 && !op.contains(op_chars!(ext as_arr))
606 {
607 op = op.trim_end_matches(['+', '-']);
608 if op.is_empty() {
609 op = &op_taken[..1];
610 }
611 }
612 if op.len() == op_taken.len() {
613 *self = trial;
614 } else {
615 for _ in op.chars() {
616 self.next();
617 }
618 }
619 match op {
620 "+" => Ok(Some(Token::Plus)),
622 "-" => Ok(Some(Token::Minus)),
623 "*" => Ok(Some(Token::Mul)),
624 "/" => Ok(Some(Token::Div)),
625 "%" => Ok(Some(Token::Mod)),
626 "^" => Ok(Some(Token::Caret)),
627 "<" => Ok(Some(Token::Lt)),
628 ">" => Ok(Some(Token::Gt)),
629 "=" => Ok(Some(Token::Eq)),
630 "=>" => Ok(Some(Token::RArrow)),
632 "<=" => Ok(Some(Token::LtEq)),
633 ">=" => Ok(Some(Token::GtEq)),
634 "<>" => Ok(Some(Token::Neq)),
635 "!=" => Ok(Some(Token::Neq)),
636 "|" => Ok(Some(Token::Pipe)),
641 _ => Ok(Some(Token::Op(op.to_owned()))),
642 }
643 }
644 other => self.consume_and_return(Token::Char(other)),
645 },
646 None => Ok(None),
647 }
648 }
649
650 fn tokenize_delimited_identifier(
651 &mut self,
652 quote_start: char,
653 ) -> Result<String, TokenizerError> {
654 let quote_end = Word::matching_end_quote(quote_start);
655 let mut s = String::new();
656
657 self.next(); while let Some(ch) = self.peek() {
660 self.next(); if ch == quote_end {
663 if self.peek() == Some(quote_end) {
664 self.next(); s.push(quote_end);
666 } else {
667 return Ok(s);
668 }
669 } else {
670 s.push(ch);
671 }
672 }
673
674 self.error(format!(
675 "Expected close delimiter '{}' before EOF.",
676 quote_end
677 ))
678 }
679
680 fn tokenize_dollar_preceded_value(&mut self) -> Result<Token, TokenizerError> {
682 let mut s = String::new();
683 let mut value = String::new();
684
685 self.next();
686
687 if let Some('$') = self.peek() {
688 self.next();
690
691 let delimiter = "$$";
692 while self.peek().is_some() {
693 if self.starts_with(delimiter) {
694 for _ in delimiter.chars() {
695 self.next();
696 }
697 return Ok(Token::DollarQuotedString(DollarQuotedString {
698 value: s,
699 tag: None,
700 }));
701 }
702 s.push(self.next().unwrap());
703 }
704
705 self.error("Unterminated dollar-quoted string")
706 } else {
707 value.push_str(&self.peeking_take_while(|ch| ch.is_ascii_alphanumeric() || ch == '_'));
709
710 if let Some('$') = self.peek() {
711 if !is_valid_dollar_quote_tag(&value) {
712 return self.error(format!("Invalid dollar-quoted string tag \"{}\"", value));
713 }
714
715 self.next();
716
717 let delimiter = format!("${}$", value);
718 while self.peek().is_some() {
719 if self.starts_with(&delimiter) {
720 for _ in delimiter.chars() {
721 self.next();
722 }
723 return Ok(Token::DollarQuotedString(DollarQuotedString {
724 value: s,
725 tag: Some(value),
726 }));
727 }
728 s.push(self.next().unwrap());
729 }
730
731 self.error(format!(
732 "Unterminated dollar-quoted string at or near \"{}\"",
733 value
734 ))
735 } else {
736 Ok(Token::Parameter(value))
737 }
738 }
739 }
740
741 fn error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
742 let prefix = format!("LINE {}: ", self.line);
743 let sql_line = self.sql.split('\n').nth(self.line as usize - 1).unwrap();
744 let cursor = " ".repeat(prefix.len() + self.col as usize - 1);
745 let context = format!("{}{}\n{}^", prefix, sql_line, cursor);
746 Err(TokenizerError {
747 message: message.into(),
748 col: self.col,
749 line: self.line,
750 context,
751 })
752 }
753
754 fn reject_number_junk(&mut self) -> Result<(), TokenizerError> {
755 if let Some(ch) = self.peek()
756 && is_identifier_start(ch)
757 {
758 return self.error("trailing junk after numeric literal");
759 }
760 Ok(())
761 }
762
763 fn tokenize_single_line_comment(&mut self) -> String {
765 let mut comment = self.peeking_take_while(|ch| ch != '\n');
766 if let Some(ch) = self.next() {
767 assert_eq!(ch, '\n');
768 comment.push(ch);
769 }
770 comment
771 }
772
773 fn tokenize_word(&mut self, first_char: char) -> String {
775 let mut s = first_char.to_string();
776 s.push_str(&self.peeking_take_while(is_identifier_part));
777 s
778 }
779
780 fn tokenize_single_quoted_string(&mut self) -> Result<String, TokenizerError> {
782 let mut s = String::new();
783 self.next(); let mut is_escaped = false;
787 while let Some(ch) = self.peek() {
788 match ch {
789 '\'' => {
790 self.next(); if is_escaped {
792 s.push(ch);
793 is_escaped = false;
794 } else if self.peek().map(|c| c == '\'').unwrap_or(false) {
795 s.push(ch);
796 self.next();
797 } else {
798 return Ok(s);
799 }
800 }
801 '\\' => {
802 s.push(ch);
803 self.next();
804 }
805 _ => {
806 self.next(); s.push(ch);
808 }
809 }
810 }
811 self.error("Unterminated string literal")
812 }
813
814 fn tokenize_single_quoted_string_with_escape(
816 &mut self,
817 ) -> Result<CstyleEscapedString, TokenizerError> {
818 let mut terminated = false;
819 let mut s = String::new();
820 self.next(); while let Some(ch) = self.peek() {
823 match ch {
824 '\'' => {
825 self.next(); if self.peek().map(|c| c == '\'').unwrap_or(false) {
827 s.push('\\');
828 s.push(ch);
829 self.next();
830 } else {
831 terminated = true;
832 break;
833 }
834 }
835 '\\' => {
836 s.push(ch);
837 self.next();
838 if self.peek().map(|c| c == '\'' || c == '\\').unwrap_or(false) {
839 s.push(self.next().unwrap());
840 }
841 }
842 _ => {
843 self.next(); s.push(ch);
845 }
846 }
847 }
848
849 if !terminated {
850 return self.error("Unterminated string literal");
851 }
852
853 let unescaped = match Self::unescape_c_style(&s) {
854 Ok(unescaped) => unescaped,
855 Err(e) => return self.error(e),
856 };
857
858 Ok(CstyleEscapedString {
859 value: unescaped,
860 raw: s,
861 })
862 }
863
/// Resolves the backslash escapes of a C-style (`E'...'`) string body.
///
/// Supported escapes: `\b`, `\f`, `\n`, `\r`, `\t`, `\x` with one or two hex
/// digits, `\u` with four hex digits, `\U` with eight hex digits, and one to
/// three octal digits. A backslash before any other character yields that
/// character; `\x` with no hex digit yields a literal `x`.
///
/// # Errors
///
/// Returns a message when the input ends right after a backslash, when a
/// `\u`/`\U` escape has too few digits or names an invalid code point, or when
/// a byte escape does not form valid UTF-8 on its own.
fn unescape_c_style(s: &str) -> Result<String, String> {
    /// Consumes at most `max` leading characters accepted by `accept`.
    fn take_up_to(
        chars: &mut Peekable<Chars<'_>>,
        max: usize,
        accept: impl Fn(char) -> bool,
    ) -> String {
        let mut taken = String::with_capacity(max);
        // Accepted characters are ASCII, so byte length equals character count.
        while taken.len() < max {
            match chars.peek() {
                Some(&c) if accept(c) => {
                    taken.push(c);
                    chars.next();
                }
                _ => break,
            }
        }
        taken
    }

    /// Parses `digits` in `radix` as one byte and appends it as a character.
    fn push_byte(res: &mut String, digits: &str, radix: u32) -> Result<(), String> {
        let bytes = [u8::from_str_radix(digits, radix)
            .map_err(|e| format!("invalid unicode sequence: {}", e))?];
        let text = std::str::from_utf8(&bytes)
            .map_err(|err| format!("invalid unicode sequence: {}", err))?;
        res.push_str(text);
        Ok(())
    }

    /// Handles `\uXXXX` / `\UXXXXXXXX`, which need exactly `len` hex digits.
    fn push_code_point(
        chars: &mut Peekable<Chars<'_>>,
        res: &mut String,
        len: usize,
    ) -> Result<(), String> {
        let digits = take_up_to(chars, len, |c| c.is_ascii_hexdigit());
        if digits.len() < len {
            return Err("invalid unicode sequence: must be \\uXXXX or \\UXXXXXXXX".to_owned());
        }
        let code = u32::from_str_radix(&digits, 16)
            .map_err(|e| format!("invalid unicode sequence: {}", e))?;
        match char::from_u32(code) {
            Some(ch) => {
                res.push(ch);
                Ok(())
            }
            None => Err(format!("invalid unicode sequence: {}", digits)),
        }
    }

    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }
        let escaped = match chars.next() {
            Some(e) => e,
            None => return Err("unterminated escape sequence".to_owned()),
        };
        match escaped {
            'b' => out.push('\u{08}'),
            'f' => out.push('\u{0C}'),
            'n' => out.push('\n'),
            'r' => out.push('\r'),
            't' => out.push('\t'),
            'x' => {
                let digits = take_up_to(&mut chars, 2, |c| c.is_ascii_hexdigit());
                if digits.is_empty() {
                    // `\x` without hex digits is just the letter `x`.
                    out.push('x');
                } else {
                    push_byte(&mut out, &digits, 16)?;
                }
            }
            'u' => push_code_point(&mut chars, &mut out, 4)?,
            'U' => push_code_point(&mut chars, &mut out, 8)?,
            first @ '0'..='7' => {
                let mut digits = String::with_capacity(3);
                digits.push(first);
                digits.push_str(&take_up_to(&mut chars, 2, |c| matches!(c, '0'..='7')));
                push_byte(&mut out, &digits, 8)?;
            }
            other => out.push(other),
        }
    }

    Ok(out)
}
975
976 fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
977 let mut s = String::new();
978
979 let mut nested = 1;
980 let mut last_ch = ' ';
981
982 loop {
983 match self.next() {
984 Some(ch) => {
985 if last_ch == '/' && ch == '*' {
986 nested += 1;
987 } else if last_ch == '*' && ch == '/' {
988 nested -= 1;
989 if nested == 0 {
990 s.pop();
991 break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
992 }
993 }
994 s.push(ch);
995 last_ch = ch;
996 }
997 None => break self.error("Unexpected EOF while in a multi-line comment"),
998 }
999 }
1000 }
1001
1002 #[expect(clippy::unnecessary_wraps)]
1003 fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
1004 self.next();
1005 Ok(Some(t))
1006 }
1007
1008 fn starts_with(&self, expected: &str) -> bool {
1009 let mut chars = self.chars.clone();
1010 for expected_char in expected.chars() {
1011 if chars.next() != Some(expected_char) {
1012 return false;
1013 }
1014 }
1015 true
1016 }
1017
1018 fn peeking_take_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> String {
1022 let mut s = String::new();
1023 while let Some(ch) = self.peek() {
1024 if predicate(ch) {
1025 self.next(); s.push(ch);
1027 } else {
1028 break;
1029 }
1030 }
1031 s
1032 }
1033}
1034
/// Returns `true` if `ch` opens a delimited (quoted) identifier; only `"` does.
fn is_delimited_identifier_start(ch: char) -> bool {
    matches!(ch, '"')
}
1043
/// Returns `true` if `ch` may begin an unquoted identifier: an ASCII letter or `_`.
fn is_identifier_start(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
}
1051
/// Returns `true` if `ch` may continue an unquoted identifier: an ASCII letter
/// or digit, `$`, or `_`.
fn is_identifier_part(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '$' | '_')
}
1056
/// Returns `true` if `tag` is a valid dollar-quote tag: non-empty, starting
/// with an ASCII letter or `_`, and continuing with ASCII letters, digits or `_`.
fn is_valid_dollar_quote_tag(tag: &str) -> bool {
    let mut rest = tag.chars();
    match rest.next() {
        Some(first) if first == '_' || first.is_ascii_alphabetic() => {
            rest.all(|ch| ch == '_' || ch.is_ascii_alphanumeric())
        }
        _ => false,
    }
}
1062
1063#[cfg(test)]
1064mod tests {
1065 use super::*;
1066
/// `TokenizerError` has no source error and displays its message, position and context.
#[test]
fn tokenizer_error_impl() {
    let err = TokenizerError {
        message: String::from("test"),
        line: 1,
        col: 1,
        context: String::from("LINE 1:"),
    };

    assert!(std::error::Error::source(&err).is_none());
    assert_eq!(format!("{}", err), "test at line 1, column 1\nLINE 1:");
}
1081
/// A keyword followed by an integer literal.
#[test]
fn tokenize_select_1() {
    let tokens = Tokenizer::new("SELECT 1")
        .tokenize_with_whitespace()
        .unwrap();

    compare(
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_owned()),
        ],
        tokens,
    );
}
1096
/// A number that starts with the decimal point is a single `Number` token.
#[test]
fn tokenize_select_float() {
    let tokens = Tokenizer::new("SELECT .1")
        .tokenize_with_whitespace()
        .unwrap();

    compare(
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(".1".to_owned()),
        ],
        tokens,
    );
}
1111
/// A function call splits into word, parentheses and argument.
#[test]
fn tokenize_scalar_function() {
    let tokens = Tokenizer::new("SELECT sqrt(1)")
        .tokenize_with_whitespace()
        .unwrap();

    compare(
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number("1".to_owned()),
            Token::RParen,
        ],
        tokens,
    );
}
1129
/// `||` has no dedicated variant and comes back as a generic `Op`.
#[test]
fn tokenize_string_string_concat() {
    let space = || Token::Whitespace(Whitespace::Space);
    let tokens = Tokenizer::new("SELECT 'a' || 'b'")
        .tokenize_with_whitespace()
        .unwrap();

    compare(
        vec![
            Token::make_keyword("SELECT"),
            space(),
            Token::SingleQuotedString("a".to_owned()),
            space(),
            Token::Op(String::from("||")),
            space(),
            Token::SingleQuotedString("b".to_owned()),
        ],
        tokens,
    );
}
1148
/// Doubled double-quotes inside a delimited identifier collapse to one quote each.
#[test]
fn tokenize_escaped_double_quote_in_delimited_identifier() {
    let tokens = Tokenizer::new(r###"SELECT "a""b", "x""""y""###)
        .tokenize_with_whitespace()
        .unwrap();

    compare(
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("a\"b", Some('"')),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("x\"\"y", Some('"')),
        ],
        tokens,
    );
}
1166
/// Displaying the tokens reproduces the original text, quote doubling included.
#[test]
fn display_escaped_double_quote_in_delimited_identifier() {
    let sql = r###"SELECT "a""b", "x""""y""###;
    let tokens = Tokenizer::new(sql).tokenize_with_whitespace().unwrap();

    let rendered: String = tokens.iter().map(ToString::to_string).collect();
    assert_eq!(rendered, sql);
}
1178
/// `|` and `^` have dedicated token variants.
#[test]
fn tokenize_bitwise_op() {
    let space = || Token::Whitespace(Whitespace::Space);
    let word = |text: &str| Token::make_word(text, None);
    let tokens = Tokenizer::new("SELECT one | two ^ three")
        .tokenize_with_whitespace()
        .unwrap();

    compare(
        vec![
            Token::make_keyword("SELECT"),
            space(),
            word("one"),
            space(),
            Token::Pipe,
            space(),
            word("two"),
            space(),
            Token::Caret,
            space(),
            word("three"),
        ],
        tokens,
    );
}
1200
/// A differently tagged dollar quote inside the body is plain content.
#[test]
fn tokenize_tagged_dollar_quoted_string_with_inner_different_tag() {
    let tokens = Tokenizer::new("SELECT $foo$the content with $bar$nested$bar$ usage$foo$")
        .tokenize_with_whitespace()
        .unwrap();

    let quoted = DollarQuotedString {
        tag: Some(String::from("foo")),
        value: String::from("the content with $bar$nested$bar$ usage"),
    };
    compare(
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(quoted),
        ],
        tokens,
    );
}
1218
/// Tags may contain underscores and, after the first character, digits.
#[test]
fn tokenize_tagged_dollar_quoted_string_with_identifier_tag() {
    let tokens = Tokenizer::new("SELECT $_tag_1$hello$_tag_1$")
        .tokenize_with_whitespace()
        .unwrap();

    let quoted = DollarQuotedString {
        tag: Some(String::from("_tag_1")),
        value: String::from("hello"),
    };
    compare(
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(quoted),
        ],
        tokens,
    );
}
1236
/// A tag that starts with a digit is rejected with a descriptive error.
#[test]
fn tokenize_dollar_quoted_string_with_invalid_tag() {
    let message = Tokenizer::new("SELECT $1tag$hello$1tag$")
        .tokenize_with_whitespace()
        .unwrap_err()
        .to_string();

    assert!(message.contains("Invalid dollar-quoted string tag \"1tag\""));
}
1249
/// Text right after the closing delimiter is a separate word, even if it
/// contains `$`.
#[test]
fn tokenize_tagged_dollar_quoted_string_followed_by_alias_with_dollar() {
    let tokens = Tokenizer::new("SELECT $go$o$not nesting just $ sign$go$o$")
        .tokenize_with_whitespace()
        .unwrap();

    let quoted = DollarQuotedString {
        tag: Some(String::from("go")),
        value: String::from("o$not nesting just $ sign"),
    };
    compare(
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(quoted),
            Token::make_word("o$", None),
        ],
        tokens,
    );
}
1268
/// A comma-separated list of `<bool> XOR <bool>` expressions.
#[test]
fn tokenize_logical_xor() {
    let space = || Token::Whitespace(Whitespace::Space);
    let kw = Token::make_keyword;
    let tokens = Tokenizer::new(
        "SELECT true XOR true, false XOR false, true XOR false, false XOR true",
    )
    .tokenize_with_whitespace()
    .unwrap();

    // Every operand pair contributes ` <lhs> XOR <rhs>`; pairs after the first
    // are preceded by a comma.
    let operand_pairs = [
        ("true", "true"),
        ("false", "false"),
        ("true", "false"),
        ("false", "true"),
    ];
    let mut expected = vec![kw("SELECT")];
    for (index, (lhs, rhs)) in operand_pairs.iter().enumerate() {
        if index > 0 {
            expected.push(Token::Comma);
        }
        expected.extend([space(), kw(lhs), space(), kw("XOR"), space(), kw(rhs)]);
    }
    compare(expected, tokens);
}
1308
/// A plain `SELECT ... WHERE ... LIMIT` statement.
#[test]
fn tokenize_simple_select() {
    let space = || Token::Whitespace(Whitespace::Space);
    let kw = Token::make_keyword;
    let word = |text: &str| Token::make_word(text, None);
    let number = |digits: &str| Token::Number(digits.to_owned());
    let tokens = Tokenizer::new("SELECT * FROM customer WHERE id = 1 LIMIT 5")
        .tokenize_with_whitespace()
        .unwrap();

    compare(
        vec![
            kw("SELECT"),
            space(),
            Token::Mul,
            space(),
            kw("FROM"),
            space(),
            word("customer"),
            space(),
            kw("WHERE"),
            space(),
            word("id"),
            space(),
            Token::Eq,
            space(),
            number("1"),
            space(),
            kw("LIMIT"),
            space(),
            number("5"),
        ],
        tokens,
    );
}
1339
/// `EXPLAIN` in front of a `SELECT` statement.
#[test]
fn tokenize_explain_select() {
    let space = || Token::Whitespace(Whitespace::Space);
    let kw = Token::make_keyword;
    let word = |text: &str| Token::make_word(text, None);
    let tokens = Tokenizer::new("EXPLAIN SELECT * FROM customer WHERE id = 1")
        .tokenize_with_whitespace()
        .unwrap();

    compare(
        vec![
            kw("EXPLAIN"),
            space(),
            kw("SELECT"),
            space(),
            Token::Mul,
            space(),
            kw("FROM"),
            space(),
            word("customer"),
            space(),
            kw("WHERE"),
            space(),
            word("id"),
            space(),
            Token::Eq,
            space(),
            Token::Number("1".to_owned()),
        ],
        tokens,
    );
}
1368
/// `EXPLAIN ANALYZE` must produce two separate keyword tokens ahead of the
/// tokens of the `SELECT` statement.
#[test]
fn tokenize_explain_analyze_select() {
    let kw = Token::make_keyword;
    let ident = |name: &str| Token::make_word(name, None);
    let sp = || Token::Whitespace(Whitespace::Space);

    let query = "EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1";
    let mut tokenizer = Tokenizer::new(query);
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let expected = vec![
        kw("EXPLAIN"),
        sp(),
        kw("ANALYZE"),
        sp(),
        kw("SELECT"),
        sp(),
        Token::Mul,
        sp(),
        kw("FROM"),
        sp(),
        ident("customer"),
        sp(),
        kw("WHERE"),
        sp(),
        ident("id"),
        sp(),
        Token::Eq,
        sp(),
        Token::Number("1".to_owned()),
    ];

    compare(expected, actual);
}
1399
/// A `!=` comparison against a quoted literal must yield `Token::Neq`
/// followed by a `SingleQuotedString` holding the text without its quotes.
#[test]
fn tokenize_string_predicate() {
    let kw = Token::make_keyword;
    let ident = |name: &str| Token::make_word(name, None);
    let sp = || Token::Whitespace(Whitespace::Space);

    let query = "SELECT * FROM customer WHERE salary != 'Not Provided'";
    let mut tokenizer = Tokenizer::new(query);
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let expected = vec![
        kw("SELECT"),
        sp(),
        Token::Mul,
        sp(),
        kw("FROM"),
        sp(),
        ident("customer"),
        sp(),
        kw("WHERE"),
        sp(),
        ident("salary"),
        sp(),
        Token::Neq,
        sp(),
        Token::SingleQuotedString("Not Provided".to_owned()),
    ];

    compare(expected, actual);
}
1426
/// Arabic letters after a newline are each expected as an individual
/// `Token::Char`, and the trailing ASCII `h` as a word.
#[test]
fn tokenize_invalid_string() {
    let query = "\nمصطفىh";
    let mut tokenizer = Tokenizer::new(query);
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    // Leading newline, one `Char` token per Arabic letter, then the word.
    let mut expected = vec![Token::Whitespace(Whitespace::Newline)];
    expected.extend("مصطفى".chars().map(Token::Char));
    expected.push(Token::make_word("h", None));

    compare(expected, actual);
}
1444
/// Line breaks (`\r\n` and `\n`) inside a single-quoted literal must be
/// kept verbatim in the resulting `SingleQuotedString`.
#[test]
fn tokenize_newline_in_string_literal() {
    let mut tokenizer = Tokenizer::new("'foo\r\nbar\nbaz'");
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let literal = Token::SingleQuotedString(String::from("foo\r\nbar\nbaz"));
    compare(vec![literal], actual);
}
1453
/// A single-quoted literal that is never closed must make tokenization fail.
///
/// The whole `TokenizerError` is compared for equality, so the message, the
/// reported `line`/`col`, and the `context` text (including its whitespace)
/// all have to match exactly.
#[test]
fn tokenize_unterminated_string_literal() {
    let sql = String::from("select 'foo");
    let mut tokenizer = Tokenizer::new(&sql);
    assert_eq!(
        tokenizer.tokenize_with_whitespace(),
        Err(TokenizerError {
            message: "Unterminated string literal".to_owned(),
            line: 1,
            col: 12,
            // NOTE(review): the spacing before `^` is significant — confirm it
            // matches the tokenizer's caret alignment before editing.
            context: "LINE 1: select 'foo\n ^".to_owned(),
        })
    );
}
1468
/// Same Arabic-letter input as `tokenize_invalid_string`, but placed after two
/// newlines, a `SELECT * FROM table` prefix, and a tab.
#[test]
fn tokenize_invalid_string_cols() {
    let kw = Token::make_keyword;
    let sp = || Token::Whitespace(Whitespace::Space);
    let newline = || Token::Whitespace(Whitespace::Newline);

    let query = "\n\nSELECT * FROM table\tمصطفىh";
    let mut tokenizer = Tokenizer::new(query);
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let mut expected = vec![
        newline(),
        newline(),
        kw("SELECT"),
        sp(),
        Token::Mul,
        sp(),
        kw("FROM"),
        sp(),
        kw("table"),
        Token::Whitespace(Whitespace::Tab),
    ];
    // One `Char` token per Arabic letter, then the trailing ASCII word.
    expected.extend("مصطفى".chars().map(Token::Char));
    expected.push(Token::make_word("h", None));

    compare(expected, actual);
}
1495
/// `=>` between two words inside a call must come out as one `RArrow` token.
#[test]
fn tokenize_right_arrow() {
    let ident = |name: &str| Token::make_word(name, None);

    let query = "FUNCTION(key=>value)";
    let mut tokenizer = Tokenizer::new(query);
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let expected = vec![
        ident("FUNCTION"),
        Token::LParen,
        ident("key"),
        Token::RArrow,
        ident("value"),
        Token::RParen,
    ];

    compare(expected, actual);
}
1511
/// `a IS NULL` must tokenize as a word followed by the `IS` and `NULL`
/// keywords, with a space token between each.
#[test]
fn tokenize_is_null() {
    let sp = || Token::Whitespace(Whitespace::Space);

    let mut tokenizer = Tokenizer::new("a IS NULL");
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let expected = vec![
        Token::make_word("a", None),
        sp(),
        Token::make_keyword("IS"),
        sp(),
        Token::make_keyword("NULL"),
    ];

    compare(expected, actual);
}
1528
/// A `--` comment between two numbers is expected as a `SingleLineComment`
/// whose text includes the terminating newline.
#[test]
fn tokenize_comment() {
    let mut tokenizer = Tokenizer::new("0--this is a comment\n1");
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let comment = Whitespace::SingleLineComment {
        prefix: String::from("--"),
        comment: String::from("this is a comment\n"),
    };
    let expected = vec![
        Token::Number(String::from("0")),
        Token::Whitespace(comment),
        Token::Number(String::from("1")),
    ];

    compare(expected, actual);
}
1544
/// A `--` comment that runs to the end of input (no trailing newline) must
/// still produce a single `SingleLineComment` token.
#[test]
fn tokenize_comment_at_eof() {
    let mut tokenizer = Tokenizer::new("--this is a comment");
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let comment = Whitespace::SingleLineComment {
        prefix: String::from("--"),
        comment: String::from("this is a comment"),
    };

    compare(vec![Token::Whitespace(comment)], actual);
}
1556
/// A `/* ... */` comment spanning a line break is expected as one
/// `MultiLineComment` holding the text between the delimiters.
#[test]
fn tokenize_multiline_comment() {
    let mut tokenizer = Tokenizer::new("0/*multi-line\n* /comment*/1");
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let comment = Whitespace::MultiLineComment(String::from("multi-line\n* /comment"));
    let expected = vec![
        Token::Number(String::from("0")),
        Token::Whitespace(comment),
        Token::Number(String::from("1")),
    ];

    compare(expected, actual);
}
1571
/// Nested `/* ... */` pairs must stay inside the outer comment: the expected
/// `MultiLineComment` text keeps the inner delimiters and ends at the final
/// closing `*/`.
#[test]
fn tokenize_nested_multiline_comment() {
    let query = "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1";
    let mut tokenizer = Tokenizer::new(query);
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let comment = Whitespace::MultiLineComment(String::from(
        "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment",
    ));
    let expected = vec![
        Token::Number(String::from("0")),
        Token::Whitespace(comment),
        Token::Number(String::from("1")),
    ];

    compare(expected, actual);
}
1586
/// A comment written as `/** ... **/` must keep the extra asterisks as part
/// of the comment text: `* Comment *`.
#[test]
fn tokenize_multiline_comment_with_even_asterisks() {
    let newline = || Token::Whitespace(Whitespace::Newline);

    let mut tokenizer = Tokenizer::new("\n/** Comment **/\n");
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let comment = Whitespace::MultiLineComment(String::from("* Comment *"));
    let expected = vec![newline(), Token::Whitespace(comment), newline()];

    compare(expected, actual);
}
1599
/// A double-quoted identifier that is never closed must make tokenization
/// fail.
///
/// The whole `TokenizerError` is compared for equality, so the message, the
/// reported `line`/`col`, and the `context` text (including its whitespace)
/// all have to match exactly.
#[test]
fn tokenize_mismatched_quotes() {
    let sql = String::from("\"foo");
    let mut tokenizer = Tokenizer::new(&sql);
    assert_eq!(
        tokenizer.tokenize_with_whitespace(),
        Err(TokenizerError {
            message: "Expected close delimiter '\"' before EOF.".to_owned(),
            line: 1,
            col: 5,
            // NOTE(review): the spacing before `^` is significant — confirm it
            // matches the tokenizer's caret alignment before editing.
            context: "LINE 1: \"foo\n ^".to_owned(),
        })
    );
}
1614
/// `\n`, a lone `\r`, `\r\n`, and a trailing `\r` must each be reported as
/// exactly one `Newline` token.
#[test]
fn tokenize_newlines() {
    let query = "line1\nline2\rline3\r\nline4\r";
    let mut tokenizer = Tokenizer::new(query);
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    // Every word in the input is followed by exactly one line terminator.
    let mut expected = Vec::new();
    for name in &["line1", "line2", "line3", "line4"] {
        expected.push(Token::make_word(name, None));
        expected.push(Token::Whitespace(Whitespace::Newline));
    }

    compare(expected, actual);
}
1632
/// The regex-match operators `~`, `~*`, `!~`, and `!~*` must each be emitted
/// as a single `Token::Op` carrying the operator text.
#[test]
fn tokenize_pg_regex_match() {
    let query = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
    let mut tokenizer = Tokenizer::new(query);
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let sp = || Token::Whitespace(Whitespace::Space);

    // The select list repeats ` col <op> '^a'` once per operator, with a
    // comma between consecutive items.
    let mut expected = vec![Token::make_keyword("SELECT")];
    for (idx, op) in ["~", "~*", "!~", "!~*"].iter().enumerate() {
        if idx > 0 {
            expected.push(Token::Comma);
        }
        expected.push(sp());
        expected.push(Token::make_word("col", None));
        expected.push(sp());
        expected.push(Token::Op((*op).to_owned()));
        expected.push(sp());
        expected.push(Token::SingleQuotedString("^a".into()));
    }

    compare(expected, actual);
}
1670
/// Braces and commas inside a single-quoted literal are expected to stay
/// part of the `SingleQuotedString` rather than become separate tokens.
#[test]
fn tokenize_select_array() {
    let mut tokenizer = Tokenizer::new("SELECT '{1, 2, 3}'");
    let actual = tokenizer.tokenize_with_whitespace().unwrap();

    let expected = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        Token::SingleQuotedString("{1, 2, 3}".to_owned()),
    ];

    compare(expected, actual);
}
1685
/// Shared assertion helper for the tokenizer tests.
///
/// Panics via `assert_eq!` when the `expected` and `actual` token lists
/// differ. Both vectors are taken by value.
fn compare(expected: Vec<Token>, actual: Vec<Token>) {
    assert_eq!(expected, actual);
}
1693}