#[cfg(not(feature = "std"))]
use alloc::{
borrow::ToOwned,
format,
string::{String, ToString},
vec,
vec::Vec,
};
use core::fmt;
use core::fmt::Debug;
use core::iter::Peekable;
use core::str::Chars;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use crate::ast::{CstyleEscapedString, DollarQuotedString};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
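/// A token produced by [`Tokenizer`]: SQL keywords and identifiers,
/// literals, punctuation, and the PostgreSQL-style operators below.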
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Token {
EOF,
Word(Word),
Number(String),
Char(char),
SingleQuotedString(String),
DollarQuotedString(DollarQuotedString),
CstyleEscapesString(CstyleEscapedString),
NationalStringLiteral(String),
HexStringLiteral(String),
Parameter(String),
Comma,
Whitespace(Whitespace),
DoubleEq,
Eq,
Neq,
Lt,
Gt,
LtEq,
GtEq,
Spaceship,
Plus,
Minus,
Mul,
Div,
Mod,
Concat,
LParen,
RParen,
Period,
Colon,
DoubleColon,
SemiColon,
Backslash,
LBracket,
RBracket,
Ampersand,
Pipe,
Caret,
Prefix,
LBrace,
RBrace,
RArrow,
Sharp,
Tilde,
TildeAsterisk,
ExclamationMarkTilde,
ExclamationMarkTildeAsterisk,
DoubleTilde,
DoubleTildeAsterisk,
ExclamationMarkDoubleTilde,
ExclamationMarkDoubleTildeAsterisk,
ShiftLeft,
ShiftRight,
ExclamationMark,
DoubleExclamationMark,
AtSign,
PGSquareRoot,
PGCubeRoot,
Arrow,
LongArrow,
HashArrow,
HashLongArrow,
HashMinus,
AtArrow,
ArrowAt,
QuestionMark,
QuestionMarkPipe,
QuestionMarkAmpersand,
AtQuestionMark,
AtAt,
}
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Token::EOF => f.write_str("EOF"),
Token::Word(ref w) => write!(f, "{}", w),
Token::Number(ref n) => write!(f, "{}", n),
Token::Char(ref c) => write!(f, "{}", c),
Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
Token::DollarQuotedString(ref s) => write!(f, "{}", s),
Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
Token::CstyleEscapesString(ref s) => write!(f, "E'{}'", s),
Token::Parameter(ref s) => write!(f, "${}", s),
Token::Comma => f.write_str(","),
Token::Whitespace(ws) => write!(f, "{}", ws),
Token::DoubleEq => f.write_str("=="),
Token::Spaceship => f.write_str("<=>"),
Token::Eq => f.write_str("="),
Token::Neq => f.write_str("<>"),
Token::Lt => f.write_str("<"),
Token::Gt => f.write_str(">"),
Token::LtEq => f.write_str("<="),
Token::GtEq => f.write_str(">="),
Token::Plus => f.write_str("+"),
Token::Minus => f.write_str("-"),
Token::Mul => f.write_str("*"),
Token::Div => f.write_str("/"),
Token::Concat => f.write_str("||"),
Token::Mod => f.write_str("%"),
Token::LParen => f.write_str("("),
Token::RParen => f.write_str(")"),
Token::Period => f.write_str("."),
Token::Colon => f.write_str(":"),
Token::DoubleColon => f.write_str("::"),
Token::SemiColon => f.write_str(";"),
Token::Backslash => f.write_str("\\"),
Token::LBracket => f.write_str("["),
Token::RBracket => f.write_str("]"),
Token::Ampersand => f.write_str("&"),
Token::Caret => f.write_str("^"),
Token::Prefix => f.write_str("^@"),
Token::Pipe => f.write_str("|"),
Token::LBrace => f.write_str("{"),
Token::RBrace => f.write_str("}"),
Token::RArrow => f.write_str("=>"),
Token::Sharp => f.write_str("#"),
Token::ExclamationMark => f.write_str("!"),
Token::DoubleExclamationMark => f.write_str("!!"),
Token::Tilde => f.write_str("~"),
Token::TildeAsterisk => f.write_str("~*"),
Token::ExclamationMarkTilde => f.write_str("!~"),
Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
Token::DoubleTilde => f.write_str("~~"),
Token::DoubleTildeAsterisk => f.write_str("~~*"),
Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
Token::AtSign => f.write_str("@"),
Token::ShiftLeft => f.write_str("<<"),
Token::ShiftRight => f.write_str(">>"),
Token::PGSquareRoot => f.write_str("|/"),
Token::PGCubeRoot => f.write_str("||/"),
Token::Arrow => f.write_str("->"),
Token::LongArrow => f.write_str("->>"),
Token::HashArrow => f.write_str("#>"),
Token::HashLongArrow => f.write_str("#>>"),
Token::HashMinus => f.write_str("#-"),
Token::AtArrow => f.write_str("@>"),
Token::ArrowAt => f.write_str("<@"),
Token::QuestionMark => f.write_str("?"),
Token::QuestionMarkPipe => f.write_str("?|"),
Token::QuestionMarkAmpersand => f.write_str("?&"),
Token::AtQuestionMark => f.write_str("@?"),
Token::AtAt => f.write_str("@@"),
}
}
}
impl Token {
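/// Makes a keyword token from its textual form; the actual keyword
/// lookup happens inside [`Token::make_word`].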
pub fn make_keyword(keyword: &str) -> Self {
Token::make_word(keyword, None)
}
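/// Makes a `Word` token. An unquoted word is matched (case-insensitively,
/// via binary search) against `ALL_KEYWORDS`; a quoted word is never a
/// keyword.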
pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
let word_uppercase = word.to_uppercase();
Token::Word(Word {
value: word.to_string(),
quote_style,
keyword: if quote_style.is_none() {
let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
} else {
Keyword::NoKeyword
},
})
}
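/// Attaches a source location to this token.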
pub fn with_location(self, location: Location) -> TokenWithLocation {
TokenWithLocation::new(self, location.line, location.column)
}
}
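/// A keyword (e.g. `SELECT`) or an optionally quoted SQL identifier.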
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Word {
pub value: String,
pub quote_style: Option<char>,
pub keyword: Keyword,
}
impl fmt::Display for Word {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self.quote_style {
Some(s) if s == '"' || s == '[' || s == '`' => {
write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
}
None => f.write_str(&self.value),
_ => panic!("Unexpected quote_style!"),
}
}
}
impl Word {
fn matching_end_quote(ch: char) -> char {
match ch {
'"' => '"',
'[' => ']',
'`' => '`',
_ => panic!("unexpected quoting style!"),
}
}
}
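/// Whitespace and comments, kept as tokens so that callers which need
/// the original spacing or the comment text can recover them.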
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Whitespace {
Space,
Newline,
Tab,
SingleLineComment { comment: String, prefix: String },
MultiLineComment(String),
}
impl fmt::Display for Whitespace {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Whitespace::Space => f.write_str(" "),
Whitespace::Newline => f.write_str("\n"),
Whitespace::Tab => f.write_str("\t"),
Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
}
}
}
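/// A 1-based line/column position in the source SQL. The tokenizer
/// starts counting at line 1, column 1; `TokenWithLocation::eof` uses
/// 0/0 as a sentinel.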
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Location {
pub line: u64,
pub column: u64,
}
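/// A token annotated with the location at which it starts.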
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
pub token: Token,
pub location: Location,
}
impl TokenWithLocation {
pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
TokenWithLocation {
token,
location: Location { line, column },
}
}
pub fn eof() -> TokenWithLocation {
TokenWithLocation::new(Token::EOF, 0, 0)
}
}
impl PartialEq<Token> for TokenWithLocation {
fn eq(&self, other: &Token) -> bool {
&self.token == other
}
}
impl PartialEq<TokenWithLocation> for Token {
fn eq(&self, other: &TokenWithLocation) -> bool {
self == &other.token
}
}
impl fmt::Display for TokenWithLocation {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.token == Token::EOF {
write!(f, "end of input")
} else {
write!(
f,
"{} at line {}, column {}",
self.token, self.location.line, self.location.column
)
}
}
}
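/// A tokenizer error carrying the position and a pre-rendered `context`
/// string: the offending source line plus a caret under the error column.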
#[derive(Debug, PartialEq)]
pub struct TokenizerError {
pub message: String,
pub line: u64,
pub col: u64,
pub context: String,
}
impl fmt::Display for TokenizerError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"{} at line {}, column {}\n{}",
self.message, self.line, self.col, self.context
)
}
}
#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}
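/// SQL tokenizer over a borrowed query string, tracking line and column
/// while scanning.
///
/// A minimal usage sketch (the `use` path is an assumption; adjust it to
/// this crate's actual module layout):
///
/// ```ignore
/// use crate::tokenizer::{Token, Tokenizer};
///
/// let mut tokenizer = Tokenizer::new("SELECT 1");
/// let tokens = tokenizer.tokenize_with_location()?;
/// // Whitespace is filtered out, so the number is the second token.
/// assert_eq!(tokens[1].token, Token::Number("1".to_string()));
/// ```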
pub struct Tokenizer<'a> {
sql: &'a str,
chars: Peekable<Chars<'a>>,
line: u64,
col: u64,
}
impl<'a> Tokenizer<'a> {
pub fn new(query: &'a str) -> Self {
Self {
sql: query,
chars: query.chars().peekable(),
line: 1,
col: 1,
}
}
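/// Consumes the next character, updating `line` and `col` for error
/// reporting.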
fn next(&mut self) -> Option<char> {
let ch = self.chars.next();
if let Some(ch) = ch {
match ch {
'\n' => {
self.line += 1;
self.col = 1;
}
// A tab advances the reported column by a fixed four positions.
'\t' => self.col += 4,
_ => self.col += 1,
}
}
ch
}
fn peek(&mut self) -> Option<char> {
self.chars.peek().cloned()
}
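/// Tokenizes the whole query, dropping whitespace tokens and attaching a
/// `Location` to each remaining token.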
pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
let tokens = self.tokenize()?;
Ok(tokens
.into_iter()
.filter(|token| !matches!(&token.token, Token::Whitespace(_)))
.collect())
}
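/// Tokenizes the whole query, keeping whitespace tokens; exercised by the
/// unit tests below.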
#[allow(dead_code)]
fn tokenize_with_whitespace(&mut self) -> Result<Vec<Token>, TokenizerError> {
let tokens = self.tokenize()?;
Ok(tokens.into_iter().map(|t| t.token).collect())
}
fn tokenize(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
let mut tokens = Vec::new();
while let Some(token) = self.next_token_with_location()? {
tokens.push(token);
}
Ok(tokens)
}
fn next_token_with_location(&mut self) -> Result<Option<TokenWithLocation>, TokenizerError> {
let loc = Location {
line: self.line,
column: self.col,
};
self.next_token()
.map(|t| t.map(|token| token.with_location(loc)))
}
fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
match self.peek() {
Some(ch) => match ch {
' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
'\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
'\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
'\r' => {
self.next();
if let Some('\n') = self.peek() {
self.next();
}
Ok(Some(Token::Whitespace(Whitespace::Newline)))
}
'N' => {
self.next(); // consume the 'N'
match self.peek() {
Some('\'') => {
let s = self.tokenize_single_quoted_string()?;
Ok(Some(Token::NationalStringLiteral(s)))
}
_ => {
let s = self.tokenize_word('N');
Ok(Some(Token::make_word(&s, None)))
}
}
}
x @ 'e' | x @ 'E' => {
self.next(); // consume the 'e'/'E'
match self.peek() {
Some('\'') => {
let s = self.tokenize_single_quoted_string_with_escape()?;
Ok(Some(Token::CstyleEscapesString(s)))
}
_ => {
let s = self.tokenize_word(x);
Ok(Some(Token::make_word(&s, None)))
}
}
}
x @ 'x' | x @ 'X' => {
self.next(); // consume the 'x'/'X'
match self.peek() {
Some('\'') => {
let s = self.tokenize_single_quoted_string()?;
Ok(Some(Token::HexStringLiteral(s)))
}
_ => {
let s = self.tokenize_word(x);
Ok(Some(Token::make_word(&s, None)))
}
}
}
ch if is_identifier_start(ch) => {
self.next(); // consume the first identifier char
let s = self.tokenize_word(ch);
Ok(Some(Token::make_word(&s, None)))
}
'\'' => {
let s = self.tokenize_single_quoted_string()?;
Ok(Some(Token::SingleQuotedString(s)))
}
quote_start if is_delimited_identifier_start(quote_start) => {
self.next(); // consume the opening quote
let quote_end = Word::matching_end_quote(quote_start);
let s = self.peeking_take_while(|ch| ch != quote_end);
if self.next() == Some(quote_end) {
Ok(Some(Token::make_word(&s, Some(quote_start))))
} else {
self.error(format!(
"Expected close delimiter '{}' before EOF.",
quote_end
))
}
}
'0'..='9' | '.' => {
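// Numeric literal: either `0` plus a radix prefix (0x/0o/0b) and digits
// in that base, or decimal digits with an optional fractional part and
// exponent. A lone '.' falls through to `Token::Period`.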
let mut s = self.peeking_take_while(|ch| ch.is_ascii_digit());
if s == "0"
&& let Some(radix) = self.peek()
&& "xob".contains(radix.to_ascii_lowercase())
{
self.next();
let radix = radix.to_ascii_lowercase();
let base = match radix {
'x' => 16,
'o' => 8,
'b' => 2,
_ => unreachable!(),
};
let s2 = self.peeking_take_while(|ch| ch.is_digit(base));
if s2.is_empty() {
return self.error("incomplete integer literal");
}
self.reject_number_junk()?;
return Ok(Some(Token::Number(format!("0{radix}{s2}"))));
}
if let Some('.') = self.peek() {
s.push('.');
self.next();
}
s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
if s == "." {
return Ok(Some(Token::Period));
}
match self.peek() {
Some('e') | Some('E') => {
s.push('e');
self.next();
// accept an optional exponent sign
if let Some(ch) = self.peek()
&& (ch == '-' || ch == '+')
{
s.push(ch);
self.next();
}
s += &self.peeking_take_while(|ch| ch.is_ascii_digit());
self.reject_number_junk()?;
return Ok(Some(Token::Number(s)));
}
_ => {}
};
self.reject_number_junk()?;
Ok(Some(Token::Number(s)))
}
'(' => self.consume_and_return(Token::LParen),
')' => self.consume_and_return(Token::RParen),
',' => self.consume_and_return(Token::Comma),
'-' => {
self.next(); // consume the '-'
match self.peek() {
Some('-') => {
self.next(); // consume the second '-'
let comment = self.tokenize_single_line_comment();
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_owned(),
comment,
})))
}
Some('>') => {
self.next(); // consume the '>'
match self.peek() {
Some('>') => {
self.next(); // consume the second '>'
Ok(Some(Token::LongArrow))
}
_ => Ok(Some(Token::Arrow)),
}
}
_ => Ok(Some(Token::Minus)),
}
}
'/' => {
self.next(); // consume the '/'
match self.peek() {
Some('*') => {
self.next(); // consume the '*'
self.tokenize_multiline_comment()
}
_ => Ok(Some(Token::Div)),
}
}
'+' => self.consume_and_return(Token::Plus),
'*' => self.consume_and_return(Token::Mul),
'%' => self.consume_and_return(Token::Mod),
'|' => {
self.next(); // consume the '|'
match self.peek() {
Some('/') => self.consume_and_return(Token::PGSquareRoot),
Some('|') => {
self.next(); // consume the second '|'
match self.peek() {
Some('/') => self.consume_and_return(Token::PGCubeRoot),
_ => Ok(Some(Token::Concat)),
}
}
_ => Ok(Some(Token::Pipe)),
}
}
'=' => {
self.next(); // consume the '='
match self.peek() {
Some('>') => self.consume_and_return(Token::RArrow),
_ => Ok(Some(Token::Eq)),
}
}
'!' => {
self.next(); // consume the '!'
match self.peek() {
Some('=') => self.consume_and_return(Token::Neq),
Some('!') => self.consume_and_return(Token::DoubleExclamationMark),
Some('~') => {
self.next();
match self.peek() {
Some('~') => {
self.next();
match self.peek() {
Some('*') => self.consume_and_return(
Token::ExclamationMarkDoubleTildeAsterisk,
),
_ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
}
}
Some('*') => {
self.consume_and_return(Token::ExclamationMarkTildeAsterisk)
}
_ => Ok(Some(Token::ExclamationMarkTilde)),
}
}
_ => Ok(Some(Token::ExclamationMark)),
}
}
'<' => {
self.next(); // consume the '<'
match self.peek() {
Some('=') => {
self.next();
match self.peek() {
Some('>') => self.consume_and_return(Token::Spaceship),
_ => Ok(Some(Token::LtEq)),
}
}
Some('>') => self.consume_and_return(Token::Neq),
Some('<') => self.consume_and_return(Token::ShiftLeft),
Some('@') => self.consume_and_return(Token::ArrowAt),
_ => Ok(Some(Token::Lt)),
}
}
'>' => {
self.next(); // consume the '>'
match self.peek() {
Some('=') => self.consume_and_return(Token::GtEq),
Some('>') => self.consume_and_return(Token::ShiftRight),
_ => Ok(Some(Token::Gt)),
}
}
':' => {
self.next();
match self.peek() {
Some(':') => self.consume_and_return(Token::DoubleColon),
_ => Ok(Some(Token::Colon)),
}
}
'$' => Ok(Some(self.tokenize_dollar_preceded_value()?)),
';' => self.consume_and_return(Token::SemiColon),
'\\' => self.consume_and_return(Token::Backslash),
'[' => self.consume_and_return(Token::LBracket),
']' => self.consume_and_return(Token::RBracket),
'&' => self.consume_and_return(Token::Ampersand),
'^' => {
self.next();
match self.peek() {
Some('@') => self.consume_and_return(Token::Prefix),
_ => Ok(Some(Token::Caret)),
}
}
'{' => self.consume_and_return(Token::LBrace),
'}' => self.consume_and_return(Token::RBrace),
'~' => {
self.next(); // consume the '~'
match self.peek() {
Some('~') => {
self.next();
match self.peek() {
Some('*') => self.consume_and_return(Token::DoubleTildeAsterisk),
_ => Ok(Some(Token::DoubleTilde)),
}
}
Some('*') => self.consume_and_return(Token::TildeAsterisk),
_ => Ok(Some(Token::Tilde)),
}
}
'#' => {
self.next(); // consume the '#'
match self.peek() {
Some('-') => self.consume_and_return(Token::HashMinus),
Some('>') => {
self.next(); // consume the '>'
match self.peek() {
Some('>') => {
self.next(); // consume the second '>'
Ok(Some(Token::HashLongArrow))
}
_ => Ok(Some(Token::HashArrow)),
}
}
_ => Ok(Some(Token::Sharp)),
}
}
'@' => {
self.next(); // consume the '@'
match self.peek() {
Some('>') => self.consume_and_return(Token::AtArrow),
Some('?') => self.consume_and_return(Token::AtQuestionMark),
Some('@') => self.consume_and_return(Token::AtAt),
_ => Ok(Some(Token::AtSign)),
}
}
'?' => {
self.next(); // consume the '?'
match self.peek() {
Some('|') => self.consume_and_return(Token::QuestionMarkPipe),
Some('&') => self.consume_and_return(Token::QuestionMarkAmpersand),
_ => Ok(Some(Token::QuestionMark)),
}
}
other => self.consume_and_return(Token::Char(other)),
},
None => Ok(None),
}
}
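/// Tokenizes the forms that begin with '$': parameters (`$1`, `$name`),
/// anonymous dollar-quoted strings (`$$ ... $$`), and tagged ones
/// (`$tag$ ... $tag$`).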
fn tokenize_dollar_preceded_value(&mut self) -> Result<Token, TokenizerError> {
let mut s = String::new();
let mut value = String::new();
self.next();
if let Some('$') = self.peek() {
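// Anonymous dollar-quoted string: $$ ... $$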
self.next();
let mut is_terminated = false;
let mut prev: Option<char> = None;
while let Some(ch) = self.peek() {
if prev == Some('$') {
if ch == '$' {
self.next();
is_terminated = true;
break;
} else {
s.push('$');
s.push(ch);
}
} else if ch != '$' {
s.push(ch);
}
prev = Some(ch);
self.next();
}
return if self.peek().is_none() && !is_terminated {
self.error("Unterminated dollar-quoted string")
} else {
Ok(Token::DollarQuotedString(DollarQuotedString {
value: s,
tag: None,
}))
};
} else {
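// Either a tagged dollar-quoted string ($tag$ ... $tag$) or a plain
// parameter such as $1.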
value.push_str(&self.peeking_take_while(|ch| ch.is_alphanumeric() || ch == '_'));
if let Some('$') = self.peek() {
self.next();
s.push_str(&self.peeking_take_while(|ch| ch != '$'));
match self.peek() {
Some('$') => {
self.next();
for c in value.chars() {
let next_char = self.next();
if Some(c) != next_char {
return self.error(format!(
"Unterminated dollar-quoted string at or near \"{}\"",
value
));
}
}
if let Some('$') = self.peek() {
self.next();
} else {
return self.error("Unterminated dollar-quoted string, expected $");
}
}
_ => {
return self.error("Unterminated dollar-quoted string, expected $");
}
}
} else {
return Ok(Token::Parameter(value));
}
}
Ok(Token::DollarQuotedString(DollarQuotedString {
value: s,
tag: if value.is_empty() { None } else { Some(value) },
}))
}
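/// Builds a `TokenizerError` at the current position, rendering the
/// offending source line with a caret under the error column.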
fn error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
let prefix = format!("LINE {}: ", self.line);
let sql_line = self.sql.split('\n').nth(self.line as usize - 1).unwrap();
let cursor = " ".repeat(prefix.len() + self.col as usize - 1);
let context = format!("{}{}\n{}^", prefix, sql_line, cursor);
Err(TokenizerError {
message: message.into(),
col: self.col,
line: self.line,
context,
})
}
fn reject_number_junk(&mut self) -> Result<(), TokenizerError> {
if let Some(ch) = self.peek()
&& is_identifier_start(ch)
{
return self.error("trailing junk after numeric literal");
}
Ok(())
}
fn tokenize_single_line_comment(&mut self) -> String {
let mut comment = self.peeking_take_while(|ch| ch != '\n');
if let Some(ch) = self.next() {
assert_eq!(ch, '\n');
comment.push(ch);
}
comment
}
fn tokenize_word(&mut self, first_char: char) -> String {
let mut s = first_char.to_string();
s.push_str(&self.peeking_take_while(is_identifier_part));
s
}
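/// Reads a `'...'` string whose opening quote is still unconsumed.
/// Doubled quotes (`''`) collapse to a single quote; a backslash is
/// kept verbatim and does not escape the closing quote.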
fn tokenize_single_quoted_string(&mut self) -> Result<String, TokenizerError> {
let mut s = String::new();
self.next(); // consume the opening quote
let mut is_escaped = false;
while let Some(ch) = self.peek() {
match ch {
'\'' => {
self.next(); // consume the quote
if is_escaped {
s.push(ch);
is_escaped = false;
} else if self.peek().map(|c| c == '\'').unwrap_or(false) {
s.push(ch);
self.next();
} else {
return Ok(s);
}
}
'\\' => {
s.push(ch);
self.next();
}
_ => {
self.next();
s.push(ch);
}
}
}
self.error("Unterminated string literal")
}
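/// Reads an `E'...'` string body and decodes its C-style escapes,
/// returning both the decoded value and the raw text.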
fn tokenize_single_quoted_string_with_escape(
&mut self,
) -> Result<CstyleEscapedString, TokenizerError> {
let mut terminated = false;
let mut s = String::new();
self.next(); // consume the opening quote
while let Some(ch) = self.peek() {
match ch {
'\'' => {
self.next(); // consume the quote
if self.peek().map(|c| c == '\'').unwrap_or(false) {
s.push('\\');
s.push(ch);
self.next();
} else {
terminated = true;
break;
}
}
'\\' => {
s.push(ch);
self.next();
if self.peek().map(|c| c == '\'' || c == '\\').unwrap_or(false) {
s.push(self.next().unwrap());
}
}
_ => {
self.next();
s.push(ch);
}
}
}
if !terminated {
return self.error("Unterminated string literal");
}
let unescaped = match Self::unescape_c_style(&s) {
Ok(unescaped) => unescaped,
Err(e) => return self.error(e),
};
Ok(CstyleEscapedString {
value: unescaped,
raw: s,
})
}
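/// Decodes C-style escapes: \b, \f, \n, \r, \t, \x followed by one or
/// two hex digits (a bare \x yields a literal 'x'), \uXXXX, \UXXXXXXXX,
/// and one- to three-digit octal sequences; any other escaped character
/// stands for itself.
///
/// A sketch of the expected behavior (`ignore`d here since the function
/// is private):
///
/// ```ignore
/// assert_eq!(
///     Tokenizer::unescape_c_style(r"a\tb\x41"),
///     Ok("a\tbA".to_string())
/// );
/// ```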
fn unescape_c_style(s: &str) -> Result<String, String> {
fn hex_byte_process(
chars: &mut Peekable<Chars<'_>>,
res: &mut String,
len: usize,
default_char: char,
) -> Result<(), String> {
let mut unicode_seq: String = String::with_capacity(len);
for _ in 0..len {
if let Some(c) = chars.peek()
&& c.is_ascii_hexdigit()
{
unicode_seq.push(chars.next().unwrap());
} else {
break;
}
}
if unicode_seq.is_empty() && len == 2 {
res.push(default_char);
return Ok(());
} else if unicode_seq.len() < len && len != 2 {
return Err("invalid unicode sequence: must be \\uXXXX or \\UXXXXXXXX".to_string());
}
if len == 2 {
let number = [u8::from_str_radix(&unicode_seq, 16)
.map_err(|e| format!("invalid unicode sequence: {}", e))?];
res.push(
core::str::from_utf8(&number)
.map_err(|err| format!("invalid unicode sequence: {}", err))?
.chars()
.next()
.unwrap(),
);
} else {
let number = u32::from_str_radix(&unicode_seq, 16)
.map_err(|e| format!("invalid unicode sequence: {}", e))?;
res.push(
char::from_u32(number)
.ok_or_else(|| format!("invalid unicode sequence: {}", unicode_seq))?,
);
}
Ok(())
}
fn octal_byte_process(
chars: &mut Peekable<Chars<'_>>,
res: &mut String,
digit: char,
) -> Result<(), String> {
let mut unicode_seq: String = String::with_capacity(3);
unicode_seq.push(digit);
for _ in 0..2 {
if let Some(c) = chars.peek()
&& matches!(*c, '0'..='7')
{
unicode_seq.push(chars.next().unwrap());
} else {
break;
}
}
let number = [u8::from_str_radix(&unicode_seq, 8)
.map_err(|e| format!("invalid unicode sequence: {}", e))?];
res.push(
core::str::from_utf8(&number)
.map_err(|err| format!("invalid unicode sequence: {}", err))?
.chars()
.next()
.unwrap(),
);
Ok(())
}
let mut chars = s.chars().peekable();
let mut res = String::with_capacity(s.len());
while let Some(c) = chars.next() {
if c == '\\' {
match chars.next() {
None => {
return Err("unterminated escape sequence".to_string());
}
Some(next_c) => match next_c {
'b' => res.push('\u{08}'),
'f' => res.push('\u{0C}'),
'n' => res.push('\n'),
'r' => res.push('\r'),
't' => res.push('\t'),
'x' => hex_byte_process(&mut chars, &mut res, 2, 'x')?,
'u' => hex_byte_process(&mut chars, &mut res, 4, 'u')?,
'U' => hex_byte_process(&mut chars, &mut res, 8, 'U')?,
digit @ '0'..='7' => octal_byte_process(&mut chars, &mut res, digit)?,
_ => res.push(next_c),
},
}
} else {
res.push(c);
}
}
Ok(res)
}
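/// Consumes the body of a `/* ... */` comment whose opening `/*` has
/// already been consumed, honoring nested comments.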
fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
let mut s = String::new();
let mut nested = 1;
let mut last_ch = ' ';
loop {
match self.next() {
Some(ch) => {
if last_ch == '/' && ch == '*' {
nested += 1;
} else if last_ch == '*' && ch == '/' {
nested -= 1;
if nested == 0 {
s.pop();
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
}
}
s.push(ch);
last_ch = ch;
}
None => break self.error("Unexpected EOF while in a multi-line comment"),
}
}
}
#[allow(clippy::unnecessary_wraps)]
fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
self.next();
Ok(Some(t))
}
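/// Consumes characters while `predicate` holds, returning them and
/// leaving the first non-matching character unconsumed.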
fn peeking_take_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> String {
let mut s = String::new();
while let Some(ch) = self.peek() {
if predicate(ch) {
self.next();
s.push(ch);
} else {
break;
}
}
s
}
}
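// Only a double quote opens a delimited identifier in this tokenizer;
// '[' and '`' are only understood by `Word` when rendering quoted words.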
fn is_delimited_identifier_start(ch: char) -> bool {
ch == '"'
}
fn is_identifier_start(ch: char) -> bool {
ch.is_ascii_alphabetic() || ch == '_'
}
fn is_identifier_part(ch: char) -> bool {
ch.is_ascii_alphanumeric() || ch == '$' || ch == '_'
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn tokenizer_error_impl() {
let err = TokenizerError {
message: "test".into(),
line: 1,
col: 1,
context: "LINE 1:".to_string(),
};
#[cfg(feature = "std")]
{
use std::error::Error;
assert!(err.source().is_none());
}
assert_eq!(err.to_string(), "test at line 1, column 1\nLINE 1:");
}
#[test]
fn tokenize_select_1() {
let sql = String::from("SELECT 1");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_select_float() {
let sql = String::from("SELECT .1");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from(".1")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_scalar_function() {
let sql = String::from("SELECT sqrt(1)");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("sqrt", None),
Token::LParen,
Token::Number(String::from("1")),
Token::RParen,
];
compare(expected, tokens);
}
#[test]
fn tokenize_string_string_concat() {
let sql = String::from("SELECT 'a' || 'b'");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("a")),
Token::Whitespace(Whitespace::Space),
Token::Concat,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("b")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_bitwise_op() {
let sql = String::from("SELECT one | two ^ three");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("one", None),
Token::Whitespace(Whitespace::Space),
Token::Pipe,
Token::Whitespace(Whitespace::Space),
Token::make_word("two", None),
Token::Whitespace(Whitespace::Space),
Token::Caret,
Token::Whitespace(Whitespace::Space),
Token::make_word("three", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_logical_xor() {
let sql =
String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
];
compare(expected, tokens);
}
#[test]
fn tokenize_simple_select() {
let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("LIMIT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("5")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_explain_select() {
let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("EXPLAIN"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_explain_analyze_select() {
let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("EXPLAIN"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("ANALYZE"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_string_predicate() {
let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("salary", None),
Token::Whitespace(Whitespace::Space),
Token::Neq,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("Not Provided")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_invalid_string() {
let sql = String::from("\nمصطفىh");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Char('م'),
Token::Char('ص'),
Token::Char('ط'),
Token::Char('ف'),
Token::Char('ى'),
Token::make_word("h", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_newline_in_string_literal() {
let sql = String::from("'foo\r\nbar\nbaz'");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
compare(expected, tokens);
}
#[test]
fn tokenize_unterminated_string_literal() {
let sql = String::from("select 'foo");
let mut tokenizer = Tokenizer::new(&sql);
assert_eq!(
tokenizer.tokenize_with_whitespace(),
Err(TokenizerError {
message: "Unterminated string literal".to_string(),
line: 1,
col: 12,
context: "LINE 1: select 'foo\n ^".to_string(),
})
);
}
#[test]
fn tokenize_invalid_string_cols() {
let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Whitespace(Whitespace::Newline),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("table"),
Token::Whitespace(Whitespace::Tab),
Token::Char('م'),
Token::Char('ص'),
Token::Char('ط'),
Token::Char('ف'),
Token::Char('ى'),
Token::make_word("h", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_right_arrow() {
let sql = String::from("FUNCTION(key=>value)");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_word("FUNCTION", None),
Token::LParen,
Token::make_word("key", None),
Token::RArrow,
Token::make_word("value", None),
Token::RParen,
];
compare(expected, tokens);
}
#[test]
fn tokenize_is_null() {
let sql = String::from("a IS NULL");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_word("a", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("IS"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("NULL"),
];
compare(expected, tokens);
}
#[test]
fn tokenize_comment() {
let sql = String::from("0--this is a comment\n1");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::Number("0".to_string()),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment\n".to_string(),
}),
Token::Number("1".to_string()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_comment_at_eof() {
let sql = String::from("--this is a comment");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment".to_string(),
})];
compare(expected, tokens);
}
#[test]
fn tokenize_multiline_comment() {
let sql = String::from("0/*multi-line\n* /comment*/1");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::Number("0".to_string()),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* /comment".to_string(),
)),
Token::Number("1".to_string()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_nested_multiline_comment() {
let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::Number("0".to_string()),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
)),
Token::Number("1".to_string()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_multiline_comment_with_even_asterisks() {
let sql = String::from("\n/** Comment **/\n");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
Token::Whitespace(Whitespace::Newline),
];
compare(expected, tokens);
}
#[test]
fn tokenize_mismatched_quotes() {
let sql = String::from("\"foo");
let mut tokenizer = Tokenizer::new(&sql);
assert_eq!(
tokenizer.tokenize_with_whitespace(),
Err(TokenizerError {
message: "Expected close delimiter '\"' before EOF.".to_string(),
line: 1,
col: 5,
context: "LINE 1: \"foo\n ^".to_string(),
})
);
}
#[test]
fn tokenize_newlines() {
let sql = String::from("line1\nline2\rline3\r\nline4\r");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_word("line1", None),
Token::Whitespace(Whitespace::Newline),
Token::make_word("line2", None),
Token::Whitespace(Whitespace::Newline),
Token::make_word("line3", None),
Token::Whitespace(Whitespace::Newline),
Token::make_word("line4", None),
Token::Whitespace(Whitespace::Newline),
];
compare(expected, tokens);
}
#[test]
fn tokenize_pg_regex_match() {
let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
let mut tokenizer = Tokenizer::new(sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::Tilde,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::TildeAsterisk,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::ExclamationMarkTilde,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::ExclamationMarkTildeAsterisk,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_select_array() {
let sql = String::from("SELECT '{1, 2, 3}'");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize_with_whitespace().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("{1, 2, 3}")),
];
compare(expected, tokens);
}
fn compare(expected: Vec<Token>, actual: Vec<Token>) {
assert_eq!(expected, actual);
}
}