risingwave_sqlsmith/sql_gen/
expr.rs

1// Copyright 2025 RisingWave Labs
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use itertools::Itertools;
16use rand::Rng;
17use rand::seq::IndexedRandom;
18use risingwave_common::types::{DataType, DataTypeName, StructType};
19use risingwave_expr::sig::FUNCTION_REGISTRY;
20use risingwave_frontend::expr::cast_sigs;
21use risingwave_sqlparser::ast::{Expr, Ident, OrderByExpr, Value};
22
23use crate::sql_gen::types::data_type_to_ast_data_type;
24use crate::sql_gen::{SqlGenerator, SqlGeneratorContext};
25
26static STRUCT_FIELD_NAMES: [&str; 26] = [
27    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
28    "t", "u", "v", "w", "x", "y", "z",
29];
30
31impl<R: Rng> SqlGenerator<'_, R> {
32    /// In generating expression, there are two execution modes:
33    /// 1) Can have Aggregate expressions (`can_agg` = true)
34    ///    We can have aggregate of all bound columns (those present in GROUP BY and otherwise).
35    ///    Not all GROUP BY columns need to be aggregated.
36    /// 2) Can't have Aggregate expressions (`can_agg` = false)
37    ///    Only columns present in GROUP BY can be selected.
38    ///
39    /// `inside_agg` indicates if we are calling `gen_expr` inside an aggregate.
40    pub(crate) fn gen_expr(&mut self, typ: &DataType, context: SqlGeneratorContext) -> Expr {
41        if !self.can_recurse() {
42            // Stop recursion with a simple scalar or column.
43            // Weight it more towards columns, scalar has much higher chance of being generated,
44            // since it is usually used as fail-safe expression.
45            return match self.rng.random_bool(0.1) {
46                true => self.gen_simple_scalar(typ),
47                false => self.gen_col(typ, context),
48            };
49        }
50
51        if *typ == DataType::Boolean && self.rng.random_bool(0.05) {
52            return match self.rng.random_bool(0.5) {
53                true => {
54                    let (ty, expr) = self.gen_arbitrary_expr(context);
55                    let n = self.rng.random_range(1..=10);
56                    Expr::InList {
57                        expr: Box::new(Expr::Nested(Box::new(expr))),
58                        list: self.gen_n_exprs_with_type(n, &ty, context),
59                        negated: self.flip_coin(),
60                    }
61                }
62                false => {
63                    // TODO: InSubquery expression may not be always bound in all context.
64                    // Parts labelled workaround can be removed or
65                    // generalized if it is bound in all contexts.
66                    // https://github.com/risingwavelabs/risingwave/issues/1343
67                    let old_ctxt = self.new_local_context(); // WORKAROUND
68                    let (query, column) = self.gen_single_item_query();
69                    let ty = column.data_type;
70                    let expr = self.gen_simple_scalar(&ty); // WORKAROUND
71                    let in_subquery_expr = Expr::InSubquery {
72                        expr: Box::new(Expr::Nested(Box::new(expr))),
73                        subquery: Box::new(query),
74                        negated: self.flip_coin(),
75                    };
76                    self.restore_context(old_ctxt); // WORKAROUND
77                    in_subquery_expr
78                }
79            };
80        }
81
82        // NOTE:
83        // We generate AST first, then use its `Display` trait
84        // to generate an sql string.
85        // That may erase nesting context.
86        // For instance `IN(a, b)` is `a IN b`.
87        // this can lead to ambiguity, if `a` is an
88        // INFIX/POSTFIX compound expression too:
89        // - `a1 IN a2 IN b`
90        // - `a1 >= a2 IN b`
91        // ...
92        // We just nest compound expressions to avoid this.
93        let range = if context.can_gen_agg() { 100 } else { 50 };
94        match self.rng.random_range(0..=range) {
95            0..=35 => Expr::Nested(Box::new(self.gen_func(typ, context))),
96            36..=40 => self.gen_exists(typ, context),
97            41..=50 => self.gen_explicit_cast(typ, context),
98            51..=100 => self.gen_agg(typ),
99            _ => unreachable!(),
100        }
101    }
102
103    fn gen_data_type(&mut self) -> DataType {
104        // Depth of struct/list nesting
105        let depth = self.rng.random_range(0..=1);
106        self.gen_data_type_inner(depth)
107    }
108
109    fn gen_data_type_inner(&mut self, depth: usize) -> DataType {
110        match self.rng.random_bool(0.8) {
111            true if !self.bound_columns.is_empty() => self
112                .bound_columns
113                .choose(&mut self.rng)
114                .unwrap()
115                .data_type
116                .clone(),
117            _ => {
118                use {DataType as S, DataTypeName as T};
119                let mut candidate_ret_types = vec![
120                    T::Boolean,
121                    T::Int16,
122                    T::Int32,
123                    T::Int64,
124                    T::Decimal,
125                    T::Float32,
126                    T::Float64,
127                    T::Varchar,
128                    T::Date,
129                    T::Timestamp,
130                    // ENABLE: https://github.com/risingwavelabs/risingwave/issues/5826
131                    // T::Timestamptz,
132                    T::Time,
133                    T::Interval,
134                ];
135                if depth > 0 {
136                    candidate_ret_types.push(T::Struct);
137                    candidate_ret_types.push(T::List);
138                }
139                let typ_name = candidate_ret_types.choose(&mut self.rng).unwrap();
140                match typ_name {
141                    T::Boolean => S::Boolean,
142                    T::Int16 => S::Int16,
143                    T::Int32 => S::Int32,
144                    T::Int64 => S::Int64,
145                    T::Decimal => S::Decimal,
146                    T::Float32 => S::Float32,
147                    T::Float64 => S::Float64,
148                    T::Varchar => S::Varchar,
149                    T::Date => S::Date,
150                    T::Timestamp => S::Timestamp,
151                    T::Timestamptz => S::Timestamptz,
152                    T::Time => S::Time,
153                    T::Interval => S::Interval,
154                    T::Struct => self.gen_struct_data_type(depth - 1),
155                    T::List => self.gen_list_data_type(depth - 1),
156                    _ => unreachable!(),
157                }
158            }
159        }
160    }
161
162    fn gen_list_data_type(&mut self, depth: usize) -> DataType {
163        DataType::List(Box::new(self.gen_data_type_inner(depth)))
164    }
165
166    fn gen_struct_data_type(&mut self, depth: usize) -> DataType {
167        let num_fields = self.rng.random_range(1..4);
168        DataType::Struct(StructType::new(
169            STRUCT_FIELD_NAMES[0..num_fields]
170                .iter()
171                .map(|s| (s.to_string(), self.gen_data_type_inner(depth))),
172        ))
173    }
174
175    /// Generates an arbitrary expression, but biased towards datatypes present in bound columns.
176    pub(crate) fn gen_arbitrary_expr(&mut self, context: SqlGeneratorContext) -> (DataType, Expr) {
177        let ret_type = self.gen_data_type();
178        let expr = self.gen_expr(&ret_type, context);
179        (ret_type, expr)
180    }
181
182    fn gen_col(&mut self, typ: &DataType, context: SqlGeneratorContext) -> Expr {
183        let columns = if context.is_inside_agg() {
184            if self.bound_relations.is_empty() {
185                return self.gen_simple_scalar(typ);
186            }
187            self.bound_relations
188                .choose(self.rng)
189                .unwrap()
190                .get_qualified_columns()
191        } else {
192            if self.bound_columns.is_empty() {
193                return self.gen_simple_scalar(typ);
194            }
195            self.bound_columns.clone()
196        };
197
198        let matched_cols = columns
199            .iter()
200            .filter(|col| col.data_type == *typ)
201            .collect::<Vec<_>>();
202        if matched_cols.is_empty() {
203            self.gen_simple_scalar(typ)
204        } else {
205            let col_def = matched_cols.choose(&mut self.rng).unwrap();
206            Expr::Identifier(Ident::new_unchecked(&col_def.name))
207        }
208    }
209
210    /// Generates `n` expressions of type `ret`.
211    pub(crate) fn gen_n_exprs_with_type(
212        &mut self,
213        n: usize,
214        ret: &DataType,
215        context: SqlGeneratorContext,
216    ) -> Vec<Expr> {
217        (0..n).map(|_| self.gen_expr(ret, context)).collect()
218    }
219
220    fn gen_exists(&mut self, ret: &DataType, context: SqlGeneratorContext) -> Expr {
221        if *ret != DataType::Boolean || context.can_gen_agg() {
222            return self.gen_simple_scalar(ret);
223        };
224        // Generating correlated subquery tends to create queries which cannot be unnested.
225        // we still want to test it, but reduce the chance it occurs.
226        let (subquery, _) = match self.rng.random_bool(0.05) {
227            true => self.gen_correlated_query(),
228            false => self.gen_local_query(),
229        };
230        Expr::Exists(Box::new(subquery))
231    }
232
233    /// Generate ORDER BY expressions by choosing from available bound columns.
234    pub(crate) fn gen_order_by(&mut self) -> Vec<OrderByExpr> {
235        if self.bound_columns.is_empty() {
236            return vec![];
237        }
238        let mut order_by = vec![];
239        while self.flip_coin() {
240            let column = self.bound_columns.choose(&mut self.rng).unwrap();
241            order_by.push(OrderByExpr {
242                expr: Expr::Identifier(Ident::new_unchecked(&column.name)),
243                asc: if self.rng.random_bool(0.3) {
244                    None
245                } else {
246                    Some(self.rng.random_bool(0.5))
247                },
248                nulls_first: if self.rng.random_bool(0.3) {
249                    None
250                } else {
251                    Some(self.rng.random_bool(0.5))
252                },
253            })
254        }
255        order_by
256    }
257
258    /// Generate ORDER BY expressions by choosing from given expressions.
259    pub(crate) fn gen_order_by_within(&mut self, exprs: &[Expr]) -> Vec<OrderByExpr> {
260        let exprs = exprs
261            .iter()
262            .filter(|e| matches!(e, Expr::Identifier(_) | Expr::Value(_)))
263            .cloned()
264            .collect::<Vec<_>>();
265        if exprs.is_empty() {
266            return vec![];
267        }
268        let mut order_by = vec![];
269        while self.flip_coin() {
270            let expr = exprs.choose(&mut self.rng).unwrap();
271            order_by.push(OrderByExpr {
272                expr: expr.clone(),
273                asc: if self.rng.random_bool(0.3) {
274                    None
275                } else {
276                    Some(self.rng.random_bool(0.5))
277                },
278                nulls_first: if self.rng.random_bool(0.3) {
279                    None
280                } else {
281                    Some(self.rng.random_bool(0.5))
282                },
283            })
284        }
285        order_by
286    }
287}
288
289pub(crate) fn typed_null(ty: &DataType) -> Expr {
290    Expr::Cast {
291        expr: Box::new(sql_null()),
292        data_type: data_type_to_ast_data_type(ty),
293    }
294}
295
296/// Generates a `NULL` value.
297pub(crate) fn sql_null() -> Expr {
298    Expr::Value(Value::Null)
299}
300
301// TODO(kwannoel):
302// Add variadic function signatures. Can add these functions
303// to a FUNC_TABLE too.
304pub fn print_function_table() -> String {
305    let func_str = FUNCTION_REGISTRY
306        .iter_scalars()
307        .map(|sign| {
308            format!(
309                "{}({}) -> {}",
310                sign.name,
311                sign.inputs_type.iter().format(", "),
312                sign.ret_type,
313            )
314        })
315        .join("\n");
316
317    let agg_func_str = FUNCTION_REGISTRY
318        .iter_aggregates()
319        .map(|sign| {
320            format!(
321                "{}({}) -> {}",
322                sign.name,
323                sign.inputs_type.iter().format(", "),
324                sign.ret_type,
325            )
326        })
327        .join("\n");
328
329    let cast_str = cast_sigs()
330        .map(|sig| {
331            format!(
332                "{:?} CAST {:?} -> {:?}",
333                sig.context, sig.from_type, sig.to_type,
334            )
335        })
336        .sorted()
337        .join("\n");
338
339    format!(
340        "
341==== FUNCTION SIGNATURES
342{}
343
344==== AGGREGATE FUNCTION SIGNATURES
345{}
346
347==== CAST SIGNATURES
348{}
349",
350        func_str, agg_func_str, cast_str
351    )
352}