risingwave_expr_impl/scalar/
similar_to_escape.rs

1// Copyright 2025 RisingWave Labs
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::fmt::Write;
16
17use risingwave_expr::{ExprError, Result, function};
18
19// escape `similar-to` pattern to POSIX regex pattern
20// Adapted from:
21// https://github.com/postgres/postgres/blob/db4f21e4a34b1d5a3f7123e28e77f575d1a971ea/src/backend/utils/adt/regexp.c#L768
22fn similar_escape_internal(
23    pat: &str,
24    esc_text: Option<char>,
25    writer: &mut impl Write,
26) -> std::result::Result<(), ExprError> {
27    macro_rules! write_ {
28        ($s:expr) => {
29            write!(writer, "{}", $s).unwrap()
30        };
31    }
32
33    write_!("^(?:");
34
35    let mut nquotes = 0;
36    let mut afterescape = false;
37    let mut incharclass = false;
38
39    for chr in pat.chars() {
40        match chr {
41            c if afterescape => {
42                if c == '"' && !incharclass {
43                    match nquotes {
44                        0 => write_!("){1,1}?("),
45                        1 => write_!("){1,1}(?:"),
46                        _ => {
47                            return Err(ExprError::InvalidParam {
48                                name: "pat",
49                                reason: "SQL regular expression may not contain more than two escape-double-quote separators".into()
50                            });
51                        }
52                    }
53                    nquotes += 1;
54                } else {
55                    write_!('\\');
56                    write_!(c);
57                }
58
59                afterescape = false;
60            }
61            c if esc_text.is_some_and(|t| t == c) => {
62                afterescape = true;
63            }
64            c if incharclass => {
65                if c == '\\' {
66                    write_!('\\');
67                }
68                write_!(c);
69
70                if c == ']' {
71                    incharclass = false;
72                }
73            }
74            c @ '[' => {
75                write_!(c);
76                incharclass = true;
77            }
78            '%' => {
79                write_!(".*");
80            }
81            '_' => {
82                write_!('.');
83            }
84            '(' => {
85                // convert to non-capturing parenthesis
86                write_!("(?:");
87            }
88            c @ ('\\' | '.' | '^' | '$') => {
89                write_!('\\');
90                write_!(c);
91            }
92            c => {
93                write_!(c);
94            }
95        }
96    }
97
98    write_!(")$");
99
100    Ok(())
101}
102
103#[function(
104    // x SIMILAR TO y -> x ~ similar_to_escape(y)
105    "similar_to_escape(varchar) -> varchar",
106)]
107fn similar_to_escape_default(pat: &str, writer: &mut impl Write) -> Result<()> {
108    similar_escape_internal(pat, Some('\\'), writer)
109}
110
111#[function(
112    // x SIMILAR TO y ESCAPE z -> x ~ similar_to_escape(y, z)
113    "similar_to_escape(varchar, varchar) -> varchar"
114)]
115fn similar_to_escape_with_escape_text(
116    pat: &str,
117    esc_text: &str,
118    writer: &mut impl Write,
119) -> Result<()> {
120    if esc_text.chars().nth(1).is_some() {
121        return Err(ExprError::InvalidParam {
122            name: "escape string",
123            reason: format!(
124                "Invalid escape string: `{}`, must be empty or one character",
125                esc_text
126            )
127            .into(),
128        });
129    }
130
131    similar_escape_internal(pat, esc_text.chars().next(), writer)
132}
133
#[cfg(test)]
mod tests {
    use super::{similar_to_escape_default, similar_to_escape_with_escape_text};

    /// Translates every pattern in `cases` and checks both that the conversion succeeds and
    /// that the output equals the expected regex.
    ///
    /// `esc_text` selects the function under test: `None` uses `similar_to_escape_default`,
    /// `Some(esc)` uses `similar_to_escape_with_escape_text` with `esc` as the escape string.
    /// The result is asserted explicitly so that an unexpected error is reported as such,
    /// together with the offending pattern, instead of being silently discarded.
    fn assert_translations(esc_text: Option<&str>, cases: &[(&str, &str)]) {
        for &(pat, escaped) in cases {
            let mut writer = String::new();
            let res = match esc_text {
                Some(esc) => similar_to_escape_with_escape_text(pat, esc, &mut writer),
                None => similar_to_escape_default(pat, &mut writer),
            };
            assert!(
                res.is_ok(),
                "pattern {:?} with escape {:?} should be translated without error",
                pat,
                esc_text
            );
            assert_eq!(
                writer, escaped,
                "unexpected translation of pattern {:?} with escape {:?}",
                pat, esc_text
            );
        }
    }

    #[test]
    fn test_default_escape() {
        assert_translations(
            None,
            &[
                ("", "^(?:)$"),
                ("_bcd%", r#"^(?:.bcd.*)$"#),
                ("bcd%", r#"^(?:bcd.*)$"#),
                (r#"_bcd\%"#, r#"^(?:.bcd\%)$"#),
                ("bcd[]ee", "^(?:bcd[]ee)$"),
                (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#),
                ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"),
                ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#),
                ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#),
                (r#"%\"o_b\"%"#, "^(?:.*){1,1}?(o.b){1,1}(?:.*)$"),
            ],
        );

        // may not contain more than two escape-double-quote separators
        // 3 double quotes (> 2)
        let pat = r#"one\"two\"three\"four"#;
        let mut writer = String::new();
        let res = similar_to_escape_default(pat, &mut writer);
        assert!(res.is_err());
    }

    #[test]
    fn test_escape_with_escape_text() {
        assert_translations(
            Some("#"),
            &[
                ("", "^(?:)$"),
                ("_bcd%", "^(?:.bcd.*)$"),
                ("bcd%", "^(?:bcd.*)$"),
                (r#"_bcd\%"#, r#"^(?:.bcd\\.*)$"#),
                ("bcd[]ee", "^(?:bcd[]ee)$"),
                (r#"bcd[]ee"""#, r#"^(?:bcd[]ee"")$"#),
                (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#),
                ("bcd[pp]ee", "^(?:bcd[pp]ee)$"),
                ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"),
                ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#),
                ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#),
                (r#"%#"o_b#"%"#, "^(?:.*){1,1}?(o.b){1,1}(?:.*)$"),
            ],
        );

        // An escape string longer than one character is rejected.
        let pat = "xxx";
        let mut writer = String::new();
        let res = similar_to_escape_with_escape_text(pat, "##", &mut writer);
        assert!(res.is_err())
    }

    #[test]
    fn test_escape_with_escape_unicode() {
        assert_translations(
            Some("💅"),
            &[
                ("", "^(?:)$"),
                ("_bcd%", "^(?:.bcd.*)$"),
                ("bcd%", "^(?:bcd.*)$"),
                (r#"_bcd\%"#, r#"^(?:.bcd\\.*)$"#),
                ("bcd[]ee", "^(?:bcd[]ee)$"),
                (r#"bcd[]ee"""#, r#"^(?:bcd[]ee"")$"#),
                (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#),
                ("bcd[pp]ee", "^(?:bcd[pp]ee)$"),
                ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"),
                ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#),
                ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#),
                (r#"%💅"o_b💅"%"#, "^(?:.*){1,1}?(o.b){1,1}(?:.*)$"),
            ],
        );

        // The length limit is counted in characters, so two multi-byte characters are rejected.
        let pat = "xxx";
        let mut writer = String::new();
        let res = similar_to_escape_with_escape_text(pat, "💅💅", &mut writer);
        assert!(res.is_err())
    }

    #[test]
    fn test_escape_with_escape_disabled() {
        // An empty escape string disables escaping, so `\"` is not treated as a separator.
        assert_translations(
            Some(""),
            &[
                ("", "^(?:)$"),
                ("_bcd%", "^(?:.bcd.*)$"),
                ("bcd%", "^(?:bcd.*)$"),
                (r#"_bcd\%"#, r#"^(?:.bcd\\.*)$"#),
                ("bcd[]ee", "^(?:bcd[]ee)$"),
                (r#"bcd[]ee"""#, r#"^(?:bcd[]ee"")$"#),
                (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#),
                ("bcd[pp]ee", "^(?:bcd[pp]ee)$"),
                ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"),
                ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#),
                ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#),
                (r#"%\"o_b\"%"#, r#"^(?:.*\\"o.b\\".*)$"#),
            ],
        );
    }
}