risingwave_expr_impl/scalar/
similar_to_escape.rs

1// Copyright 2025 RisingWave Labs
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use risingwave_expr::{ExprError, Result, function};
16
17// escape `similar-to` pattern to POSIX regex pattern
18// Adapted from:
19// https://github.com/postgres/postgres/blob/db4f21e4a34b1d5a3f7123e28e77f575d1a971ea/src/backend/utils/adt/regexp.c#L768
20fn similar_escape_internal(
21    pat: &str,
22    esc_text: Option<char>,
23    writer: &mut impl std::fmt::Write,
24) -> std::result::Result<(), ExprError> {
25    macro_rules! write_ {
26        ($s:expr) => {
27            write!(writer, "{}", $s).unwrap()
28        };
29    }
30
31    write_!("^(?:");
32
33    let mut nquotes = 0;
34    let mut afterescape = false;
35    let mut incharclass = false;
36
37    for chr in pat.chars() {
38        match chr {
39            c if afterescape => {
40                if c == '"' && !incharclass {
41                    match nquotes {
42                        0 => write_!("){1,1}?("),
43                        1 => write_!("){1,1}(?:"),
44                        _ => {
45                            return Err(ExprError::InvalidParam {
46                                name: "pat",
47                                reason: "SQL regular expression may not contain more than two escape-double-quote separators".into()
48                            });
49                        }
50                    }
51                    nquotes += 1;
52                } else {
53                    write_!('\\');
54                    write_!(c);
55                }
56
57                afterescape = false;
58            }
59            c if esc_text.is_some_and(|t| t == c) => {
60                afterescape = true;
61            }
62            c if incharclass => {
63                if c == '\\' {
64                    write_!('\\');
65                }
66                write_!(c);
67
68                if c == ']' {
69                    incharclass = false;
70                }
71            }
72            c @ '[' => {
73                write_!(c);
74                incharclass = true;
75            }
76            '%' => {
77                write_!(".*");
78            }
79            '_' => {
80                write_!('.');
81            }
82            '(' => {
83                // convert to non-capturing parenthesis
84                write_!("(?:");
85            }
86            c @ ('\\' | '.' | '^' | '$') => {
87                write_!('\\');
88                write_!(c);
89            }
90            c => {
91                write_!(c);
92            }
93        }
94    }
95
96    write_!(")$");
97
98    Ok(())
99}
100
101#[function(
102    // x SIMILAR TO y -> x ~ similar_to_escape(y)
103    "similar_to_escape(varchar) -> varchar",
104)]
105fn similar_to_escape_default(pat: &str, writer: &mut impl std::fmt::Write) -> Result<()> {
106    similar_escape_internal(pat, Some('\\'), writer)
107}
108
109#[function(
110    // x SIMILAR TO y ESCAPE z -> x ~ similar_to_escape(y, z)
111    "similar_to_escape(varchar, varchar) -> varchar"
112)]
113fn similar_to_escape_with_escape_text(
114    pat: &str,
115    esc_text: &str,
116    writer: &mut impl std::fmt::Write,
117) -> Result<()> {
118    if esc_text.chars().nth(1).is_some() {
119        return Err(ExprError::InvalidParam {
120            name: "escape string",
121            reason: format!(
122                "Invalid escape string: `{}`, must be empty or one character",
123                esc_text
124            )
125            .into(),
126        });
127    }
128
129    similar_escape_internal(pat, esc_text.chars().next(), writer)
130}
131
#[cfg(test)]
mod tests {
    use super::{similar_to_escape_default, similar_to_escape_with_escape_text};

    /// `(pattern, expected regex)` pairs whose expected output is the same for the
    /// `#`, `💅` and disabled escape strings: none of the patterns contains `#` or
    /// `💅`, so the backslash in them is an ordinary character in all three runs.
    const SHARED_CASES: &[(&str, &str)] = &[
        ("", "^(?:)$"),
        ("_bcd%", "^(?:.bcd.*)$"),
        ("bcd%", "^(?:bcd.*)$"),
        (r#"_bcd\%"#, r#"^(?:.bcd\\.*)$"#),
        ("bcd[]ee", "^(?:bcd[]ee)$"),
        (r#"bcd[]ee"""#, r#"^(?:bcd[]ee"")$"#),
        (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#),
        ("bcd[pp]ee", "^(?:bcd[pp]ee)$"),
        ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"),
        ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#),
        ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#),
    ];

    /// Translates `pat` with the default escape and checks that the call succeeds
    /// and produces `expected`.
    fn check_default(pat: &str, expected: &str) {
        let mut writer = String::new();
        let res = similar_to_escape_default(pat, &mut writer);
        // Assert success explicitly: discarding the result would let an unexpected
        // error go unreported.
        assert!(res.is_ok(), "escaping {:?} failed", pat);
        assert_eq!(writer, expected, "pattern: {:?}", pat);
    }

    /// Translates `pat` with the escape string `esc_text` and checks that the call
    /// succeeds and produces `expected`.
    fn check_with_escape(pat: &str, esc_text: &str, expected: &str) {
        let mut writer = String::new();
        let res = similar_to_escape_with_escape_text(pat, esc_text, &mut writer);
        assert!(
            res.is_ok(),
            "escaping {:?} with escape {:?} failed",
            pat,
            esc_text
        );
        assert_eq!(
            writer, expected,
            "pattern: {:?}, escape: {:?}",
            pat, esc_text
        );
    }

    #[test]
    fn test_default_escape() {
        let cases = [
            ("", "^(?:)$"),
            ("_bcd%", r#"^(?:.bcd.*)$"#),
            ("bcd%", r#"^(?:bcd.*)$"#),
            (r#"_bcd\%"#, r#"^(?:.bcd\%)$"#),
            ("bcd[]ee", "^(?:bcd[]ee)$"),
            (r#"bcd[]"ee""#, r#"^(?:bcd[]"ee")$"#),
            ("bcd[pp_%.]ee", "^(?:bcd[pp_%.]ee)$"),
            ("bcd[pp_%.]ee_%.", r#"^(?:bcd[pp_%.]ee..*\.)$"#),
            ("bcd[pp_%.](ee_%.)", r#"^(?:bcd[pp_%.](?:ee..*\.))$"#),
            (r#"%\"o_b\"%"#, "^(?:.*){1,1}?(o.b){1,1}(?:.*)$"),
        ];

        for (pat, escaped) in cases {
            check_default(pat, escaped);
        }

        // may not contain more than two escape-double-quote separators
        // 3 double quotes (> 2)
        let pat = r#"one\"two\"three\"four"#;
        let mut writer = String::new();
        let res = similar_to_escape_default(pat, &mut writer);
        assert!(res.is_err());
    }

    #[test]
    fn test_escape_with_escape_text() {
        for &(pat, escaped) in SHARED_CASES {
            check_with_escape(pat, "#", escaped);
        }
        // `#"` acts as the escape-double-quote separator.
        check_with_escape(r#"%#"o_b#"%"#, "#", "^(?:.*){1,1}?(o.b){1,1}(?:.*)$");

        // An escape string longer than one character is rejected.
        let pat = "xxx";
        let mut writer = String::new();
        let res = similar_to_escape_with_escape_text(pat, "##", &mut writer);
        assert!(res.is_err())
    }

    #[test]
    fn test_escape_with_escape_unicode() {
        for &(pat, escaped) in SHARED_CASES {
            check_with_escape(pat, "💅", escaped);
        }
        // A multi-byte character works as the escape character too.
        check_with_escape(r#"%💅"o_b💅"%"#, "💅", "^(?:.*){1,1}?(o.b){1,1}(?:.*)$");

        // Two characters are rejected even though each is a single `char`.
        let pat = "xxx";
        let mut writer = String::new();
        let res = similar_to_escape_with_escape_text(pat, "💅💅", &mut writer);
        assert!(res.is_err())
    }

    #[test]
    fn test_escape_with_escape_disabled() {
        for &(pat, escaped) in SHARED_CASES {
            check_with_escape(pat, "", escaped);
        }
        // With escaping disabled, `\"` is a literal backslash followed by a quote.
        check_with_escape(r#"%\"o_b\"%"#, "", r#"^(?:.*\\"o.b\\".*)$"#);
    }
}