risingwave_meta/barrier/checkpoint/independent_job/batch_refresh_job/
mod.rs

1// Copyright 2026 RisingWave Labs
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Batch refresh job checkpoint control for periodically-refreshed materialized views.
16//!
17//! It lives permanently in `DatabaseCheckpointControl.independent_checkpoint_job_controls`
18//! as an `IndependentCheckpointJobControl::BatchRefresh` variant for its entire lifetime.
19//!
20//! Lifecycle:
21//!   DDL → `ConsumingSnapshot` → `FinishingSnapshot` → `Idle`
22//!                                                        ↕  (periodic trigger)
23//!                                        `Idle` ← `ConsumingLogStore`
24
25use std::collections::{HashMap, HashSet};
26use std::mem::{replace, take};
27use std::sync::atomic::AtomicU32;
28
29use anyhow::anyhow;
30use itertools::Itertools;
31use risingwave_common::catalog::{DatabaseId, TableId};
32use risingwave_common::id::JobId;
33use risingwave_common::metrics::{LabelGuardedHistogram, LabelGuardedIntGauge};
34use risingwave_common::util::epoch::{Epoch, EpochPair};
35use risingwave_meta_model::{DispatcherType, WorkerId, streaming_job};
36use risingwave_pb::common::WorkerNode;
37use risingwave_pb::ddl_service::PbBackfillType;
38use risingwave_pb::hummock::HummockVersionStats;
39use risingwave_pb::id::{ActorId, FragmentId, PartialGraphId};
40use risingwave_pb::stream_plan::barrier::PbBarrierKind;
41use risingwave_pb::stream_plan::barrier_mutation::Mutation;
42use risingwave_pb::stream_plan::{AddMutation, StartFragmentBackfillMutation, StopMutation};
43use risingwave_pb::stream_service::BarrierCompleteResponse;
44use tracing::{debug, info};
45
46use crate::MetaResult;
47use crate::barrier::backfill_order_control::get_nodes_with_backfill_dependencies;
48use crate::barrier::command::PostCollectCommand;
49use crate::barrier::context::CreateSnapshotBackfillJobCommandInfo;
50use crate::barrier::edge_builder::{EdgeBuilderFragmentInfo, FragmentEdgeBuilder};
51use crate::barrier::info::BarrierInfo;
52use crate::barrier::notifier::Notifier;
53use crate::barrier::partial_graph::{
54    CollectedBarrier, PartialGraphBarrierInfo, PartialGraphManager, PartialGraphStat,
55};
56use crate::barrier::progress::{CreateMviewProgressTracker, TrackingJob, collect_done_fragments};
57use crate::barrier::rpc::to_partial_graph_id;
58use crate::barrier::{
59    BackfillOrderState, BackfillProgress, BarrierKind, FragmentBackfillProgress, TracedEpoch,
60};
61use crate::controller::fragment::InflightFragmentInfo;
62use crate::controller::scale::{
63    ComponentFragmentAligner, EnsembleActorTemplate, LoadedFragment, NoShuffleEnsemble,
64    build_no_shuffle_fragment_graph_edges, find_no_shuffle_graphs,
65};
66use crate::model::{
67    FragmentDownstreamRelation, StreamActor, StreamJobActorsToCreate, StreamingJobModelContextExt,
68};
69use crate::rpc::metrics::GLOBAL_META_METRICS;
70use crate::stream::ExtendedFragmentBackfillOrder;
71
72// ── Public types ──────────────────────────────────────────────────────────────
73
/// Logical fragment metadata for a batch refresh job.
///
/// Contains only catalog-level information: fragment structure, stream plan nodes,
/// distribution, and downstream relations. No actor IDs, no worker placement.
///
/// `new()` forwards both fields to `render_actors_and_build_job_info()`, which performs
/// actor rendering (ID allocation, worker placement, vnode assignment) internally.
/// No-shuffle ensembles are derived from `downstreams` during that rendering.
#[derive(Debug)]
pub(crate) struct BatchRefreshLogicalFragments {
    /// Logical fragments of this job. Keyed by `fragment_id`.
    pub fragments: HashMap<FragmentId, LoadedFragment>,
    /// Internal downstream relations (intra-job only; no upstream edges).
    pub downstreams: FragmentDownstreamRelation,
}
89
/// Result of the unified actor rendering for a batch refresh job.
///
/// Produced by `render_actors_and_build_job_info()` and consumed by both
/// `new()` (create) and `recover()`.
#[derive(Debug)]
pub(crate) struct BatchRefreshRenderResult {
    /// In-flight info of every logical fragment, including its rendered actors.
    /// Keyed by fragment id.
    pub fragment_infos: HashMap<FragmentId, InflightFragmentInfo>,
    /// Actor ids grouped by worker, as computed by
    /// `InflightFragmentInfo::actor_ids_to_collect` over `fragment_infos`.
    pub node_actors: HashMap<WorkerId, HashSet<ActorId>>,
    /// State table ids gathered by `InflightFragmentInfo::existing_table_ids`
    /// over `fragment_infos`.
    pub state_table_ids: HashSet<TableId>,
    /// Actors to create, as produced by `collect_actors_to_create` on the
    /// intra-job edges built during rendering.
    pub actors_to_create: StreamJobActorsToCreate,
}
101
102// ── Batch refresh job metadata ───────────────────────────────────────────────
103
/// Lightweight metadata for re-rendering actors on each periodic refresh run.
///
/// Loaded asynchronously on every trigger via `load_batch_refresh_trigger_context()`.
/// Contains the pieces consumed by `from_context()` and `render_actors_and_build_job_info()`,
/// as well as the resolved upstream log epochs and target epoch for this trigger.
#[derive(Debug)]
pub(crate) struct BatchRefreshJobTriggerContext {
    /// Logical fragments of the job, keyed by fragment id
    /// (same shape as `BatchRefreshLogicalFragments::fragments`).
    pub fragments: HashMap<FragmentId, LoadedFragment>,
    /// Downstream relations between the job's fragments
    /// (same shape as `BatchRefreshLogicalFragments::downstreams`).
    pub downstreams: FragmentDownstreamRelation,
    /// Streaming job catalog model; the renderer takes one to derive the stream
    /// context and to render the actor template.
    pub streaming_job_model: streaming_job::Model,
    /// Job definition string; the renderer copies its `definition` argument into
    /// every `StreamActor::mview_definition`.
    pub definition: String,
    /// Resource group of the database; the renderer takes one for actor placement.
    pub database_resource_group: String,
    /// Changelog entries per upstream table, used to derive log barriers.
    // NOTE(review): the meaning of the `(Vec<u64>, u64)` tuple is not visible in this
    // file — presumably (non-checkpoint epochs, checkpoint epoch); confirm against
    // the loader.
    pub upstream_table_log_epochs: HashMap<TableId, Vec<(Vec<u64>, u64)>>,
    /// The upstream committed epoch to catch up to.
    pub target_upstream_epoch: u64,
}
121
122// ── Status ────────────────────────────────────────────────────────────────────
123
124/// The partial graph is being reset (always for drop).
125/// Once the reset is confirmed, the job is removed from the map.
126
127#[derive(Debug)]
128enum BatchRefreshJobStatus {
129    /// The job is consuming upstream snapshot.
130    ///
131    /// Once snapshot consumption finishes, the final checkpoint + stop barriers are injected
132    /// and the status transitions to `FinishingSnapshot`.
133    ConsumingSnapshot {
134        prev_epoch_fake_physical_time: u64,
135        version_stats: HummockVersionStats,
136        create_mview_tracker: CreateMviewProgressTracker,
137        snapshot_epoch: u64,
138        fragment_infos: HashMap<FragmentId, InflightFragmentInfo>,
139        pending_non_checkpoint_barriers: Vec<u64>,
140        node_actors: HashMap<WorkerId, HashSet<ActorId>>,
141        state_table_ids: HashSet<TableId>,
142    },
143    /// The job has finished consuming the snapshot.
144    ///
145    /// The final checkpoint barrier (at `snapshot_epoch`) and the stop barrier have been
146    /// injected. Once the stop epoch is committed the job transitions to `Idle`.
147    /// The committed epoch is expected to be the snapshot epoch when the snapshot
148    /// consumption finishes.
149    FinishingSnapshot {
150        tracking_job: Option<TrackingJob>,
151        fragment_infos: HashMap<FragmentId, InflightFragmentInfo>,
152    },
153    /// The job is idle, waiting for the next trigger. No partial graph is held.
154    Idle { last_committed_epoch: u64 },
155    /// The job has created a partial graph for periodic refresh and is waiting for
156    /// the initial barrier to bootstrap the newly-created actors.
157    InitializingBatchRefresh {
158        fragment_infos: HashMap<FragmentId, InflightFragmentInfo>,
159        node_actors: HashMap<WorkerId, HashSet<ActorId>>,
160        state_table_ids: HashSet<TableId>,
161        /// Log barriers to inject after the partial graph is initialized. The
162        /// last one is the checkpoint stop barrier with `curr_epoch = u64::MAX`.
163        pending_log_barriers: Vec<BarrierInfo>,
164        logstore_start_epoch: u64,
165        target_upstream_epoch: u64,
166    },
167    /// The job is consuming upstream log store changes (periodic refresh).
168    ///
169    /// All replay barriers have been pre-injected (last with `StopMutation` at
170    /// `curr_epoch = u64::MAX`). When `target_upstream_epoch` commits,
171    /// the partial graph is removed and the job transitions to `Idle`.
172    ConsumingLogStore {
173        fragment_infos: HashMap<FragmentId, InflightFragmentInfo>,
174        /// The epoch from which log consumption started (for `pinned_upstream_log_epoch`).
175        logstore_start_epoch: u64,
176        /// `prev_epoch` of the stop barrier; becomes `last_committed_epoch` when transitioning to Idle.
177        target_upstream_epoch: u64,
178    },
179    /// The partial graph is being reset (for drop).
180    Resetting { notifiers: Vec<Notifier> },
181}
182
183// ── Complete type ─────────────────────────────────────────────────────────────
184
185// ── Main checkpoint control ───────────────────────────────────────────────────
186
/// Self-contained checkpoint control for a batch refresh MV.
///
/// Unlike `CreatingStreamingJobControl`, this struct handles the full lifecycle
/// (snapshot → idle → re-run → idle → ...). Both types are stored together in
/// `DatabaseCheckpointControl.independent_checkpoint_job_controls` as
/// `IndependentCheckpointJobControl` variants.
#[derive(Debug)]
pub(crate) struct BatchRefreshJobCheckpointControl {
    /// Id of the streaming job this control belongs to.
    job_id: JobId,
    /// Partial graph id of the job, derived in `new()` / `recover()` via
    /// `to_partial_graph_id(database_id, Some(job_id))`.
    partial_graph_id: PartialGraphId,
    /// Upstream tables of the snapshot backfill, as handed to `new()` / `recover()`.
    snapshot_backfill_upstream_tables: HashSet<TableId>,
    /// Epoch of the upstream snapshot. `recover()` compares the committed epoch
    /// against it to choose between `Idle` and `ConsumingSnapshot`.
    snapshot_epoch: u64,
    /// Batch refresh interval in seconds. Used to determine when to trigger a refresh run.
    batch_refresh_seconds: u64,

    /// Current lifecycle status together with its per-phase state.
    status: BatchRefreshJobStatus,
}
204
205// ── Unified actor rendering ───────────────────────────────────────────────────
206
207impl BatchRefreshJobCheckpointControl {
208    /// Render actors for a batch refresh job from logical metadata only.
209    ///
210    /// Performs the full pipeline:
211    /// 1. Derive no-shuffle ensembles from `downstreams`
212    /// 2. Render actor assignments (ID allocation, worker placement, vnode bitmap)
213    /// 3. Build `StreamActor` structs
214    /// 4. Build internal-only edges (no upstream dispatcher edges)
215    /// 5. Produce `fragment_infos`, `node_actors`, `state_table_ids`, `actors_to_create`
216    ///
217    /// Shared by both the DDL create path and the recovery path.
218    pub(crate) fn render_actors_and_build_job_info(
219        fragments: &HashMap<FragmentId, LoadedFragment>,
220        downstreams: &FragmentDownstreamRelation,
221        definition: &str,
222        // Actor rendering context:
223        actor_id_generator: &AtomicU32,
224        worker_nodes: &HashMap<WorkerId, WorkerNode>,
225        database_resource_group: &str,
226        streaming_job_model: &streaming_job::Model,
227        // Edge building context:
228        partial_graph_id: PartialGraphId,
229    ) -> MetaResult<BatchRefreshRenderResult> {
230        // Step 1: Derive no-shuffle ensembles from downstreams.
231        let ensembles = Self::resolve_ensembles(fragments, downstreams)?;
232
233        // Step 2: Render actor assignments for each ensemble.
234        let mut actor_assignments: HashMap<
235            FragmentId,
236            HashMap<ActorId, (WorkerId, Option<risingwave_common::bitmap::Bitmap>)>,
237        > = HashMap::new();
238
239        for ensemble in &ensembles {
240            // All fragments are new (batch refresh has no existing upstream fragments).
241            let first_component = ensemble
242                .component_fragments()
243                .next()
244                .expect("ensemble must have at least one component");
245            let fragment = &fragments[&first_component];
246            let distribution_type = fragment.distribution_type;
247            let vnode_count = fragment.vnode_count;
248
249            // Assert all component fragments share the same vnode count.
250            for fid in ensemble.component_fragments() {
251                let f = &fragments[&fid];
252                assert_eq!(
253                    vnode_count, f.vnode_count,
254                    "fragments {} and {} in same ensemble have different vnode counts",
255                    first_component, fid,
256                );
257            }
258
259            let entry_fragment_parallelism = Itertools::exactly_one(
260                ensemble
261                    .entry_fragments()
262                    .map(|fid| fragments[&fid].parallelism.clone())
263                    .dedup(),
264            )
265            .map_err(|_| {
266                anyhow!(
267                    "entry fragments have inconsistent parallelism settings in batch refresh job"
268                )
269            })?;
270
271            let actor_template = EnsembleActorTemplate::render_new(
272                streaming_job_model,
273                worker_nodes,
274                entry_fragment_parallelism,
275                database_resource_group.to_owned(),
276                distribution_type,
277                vnode_count,
278            )?;
279
280            for fid in ensemble.component_fragments() {
281                let f = &fragments[&fid];
282                let aligner =
283                    ComponentFragmentAligner::new_persistent(&actor_template, actor_id_generator);
284                let assignments = aligner.align_component_actor(f.distribution_type);
285                actor_assignments.insert(fid, assignments);
286            }
287        }
288
289        // Step 3: Expand assignments into StreamActor + actor_location + InflightFragmentInfo.
290        let mut stream_actors: HashMap<FragmentId, Vec<StreamActor>> = HashMap::new();
291        let mut actor_location: HashMap<ActorId, WorkerId> = HashMap::new();
292
293        for (fragment_id, assignments) in &actor_assignments {
294            let mut actors = Vec::with_capacity(assignments.len());
295            for (&actor_id, (worker_id, vnode_bitmap)) in assignments {
296                actor_location.insert(actor_id, *worker_id);
297                let stream_context = streaming_job_model.stream_context();
298                actors.push(StreamActor {
299                    actor_id,
300                    fragment_id: *fragment_id,
301                    vnode_bitmap: vnode_bitmap.clone(),
302                    mview_definition: definition.to_owned(),
303                    expr_context: Some(stream_context.to_expr_context()),
304                    config_override: stream_context.config_override.clone(),
305                });
306            }
307            stream_actors.insert(*fragment_id, actors);
308        }
309
310        // Build InflightFragmentInfo from logical fragments + rendered actors.
311        let fragment_infos: HashMap<FragmentId, InflightFragmentInfo> = fragments
312            .iter()
313            .map(|(fragment_id, loaded)| {
314                let actors = stream_actors
315                    .get(fragment_id)
316                    .into_iter()
317                    .flatten()
318                    .map(|actor| {
319                        (
320                            actor.actor_id,
321                            crate::controller::fragment::InflightActorInfo {
322                                worker_id: actor_location[&actor.actor_id],
323                                vnode_bitmap: actor.vnode_bitmap.clone(),
324                                splits: vec![], // batch refresh has no source splits
325                            },
326                        )
327                    })
328                    .collect();
329                (
330                    *fragment_id,
331                    InflightFragmentInfo {
332                        fragment_id: *fragment_id,
333                        distribution_type: loaded.distribution_type,
334                        fragment_type_mask: loaded.fragment_type_mask,
335                        vnode_count: loaded.vnode_count,
336                        nodes: loaded.nodes.clone(),
337                        actors,
338                        state_table_ids: loaded.state_table_ids.clone(),
339                    },
340                )
341            })
342            .collect();
343
344        // Step 4: Build edges (internal-only, no upstream).
345        let mut builder = FragmentEdgeBuilder::new(fragment_infos.values().map(|f| {
346            (
347                f.fragment_id,
348                EdgeBuilderFragmentInfo::from_inflight_with_worker_nodes(
349                    f,
350                    partial_graph_id,
351                    worker_nodes,
352                ),
353            )
354        }));
355        builder.add_relations(downstreams);
356        let mut edges = builder.build();
357
358        let actors_to_create = edges.collect_actors_to_create(fragment_infos.values().map(|f| {
359            (
360                f.fragment_id,
361                &f.nodes,
362                f.actors.iter().map(|(actor_id, actor)| {
363                    let sa = stream_actors[&f.fragment_id]
364                        .iter()
365                        .find(|a| a.actor_id == *actor_id)
366                        .expect("should exist");
367                    (sa, actor.worker_id)
368                }),
369                vec![], // no subscribers for batch refresh jobs
370            )
371        }));
372
373        // Step 5: Build node_actors, state_table_ids.
374        let node_actors = InflightFragmentInfo::actor_ids_to_collect(fragment_infos.values());
375        let state_table_ids =
376            InflightFragmentInfo::existing_table_ids(fragment_infos.values()).collect();
377
378        Ok(BatchRefreshRenderResult {
379            fragment_infos,
380            node_actors,
381            state_table_ids,
382            actors_to_create,
383        })
384    }
385
386    /// Build the initial `Add` mutation for the partial graph's first barrier.
387    ///
388    /// The rendered actors come from a prior `render_actors_and_build_job_info()` call;
389    /// `backfill_nodes_to_pause` is derived from the job's backfill ordering.
390    pub(crate) fn build_initial_partial_graph_mutation(
391        render_result: &BatchRefreshRenderResult,
392        backfill_ordering: &ExtendedFragmentBackfillOrder,
393    ) -> Mutation {
394        let added_actors: Vec<ActorId> = render_result
395            .fragment_infos
396            .values()
397            .flat_map(|f| f.actors.keys().copied())
398            .collect();
399        let backfill_nodes_to_pause = get_nodes_with_backfill_dependencies(backfill_ordering)
400            .into_iter()
401            .collect();
402        Mutation::Add(AddMutation {
403            actor_dispatchers: Default::default(),
404            added_actors,
405            actor_splits: Default::default(),
406            pause: false,
407            subscriptions_to_add: Default::default(),
408            backfill_nodes_to_pause,
409            actor_cdc_table_snapshot_splits: None,
410            new_upstream_sinks: Default::default(),
411            dropped_actors: Default::default(),
412            sink_log_store_flush: Default::default(),
413        })
414    }
415
416    /// Derive no-shuffle ensembles from fragment downstreams.
417    fn resolve_ensembles(
418        fragments: &HashMap<FragmentId, LoadedFragment>,
419        downstreams: &FragmentDownstreamRelation,
420    ) -> MetaResult<Vec<NoShuffleEnsemble>> {
421        let mut new_no_shuffle: HashMap<_, HashSet<_>> = HashMap::new();
422        for (upstream_fid, relations) in downstreams {
423            for rel in relations {
424                if rel.dispatcher_type == DispatcherType::NoShuffle {
425                    new_no_shuffle
426                        .entry(*upstream_fid)
427                        .or_default()
428                        .insert(rel.downstream_fragment_id);
429                }
430            }
431        }
432
433        let mut ensembles = if new_no_shuffle.is_empty() {
434            Vec::new()
435        } else {
436            let no_shuffle_edges: Vec<(FragmentId, FragmentId)> = new_no_shuffle
437                .iter()
438                .flat_map(|(u, ds)| ds.iter().map(move |d| (*u, *d)))
439                .collect();
440            let all_fragment_ids: Vec<FragmentId> = no_shuffle_edges
441                .iter()
442                .flat_map(|(u, d)| [*u, *d])
443                .collect::<HashSet<_>>()
444                .into_iter()
445                .collect();
446            let (fwd, bwd) = build_no_shuffle_fragment_graph_edges(no_shuffle_edges);
447            find_no_shuffle_graphs(&all_fragment_ids, &fwd, &bwd)?
448        };
449
450        // Add standalone fragments as single-fragment ensembles.
451        let covered: HashSet<FragmentId> = ensembles
452            .iter()
453            .flat_map(|e| e.component_fragments())
454            .collect();
455        for fragment_id in fragments.keys() {
456            if !covered.contains(fragment_id) {
457                ensembles.push(NoShuffleEnsemble::singleton(*fragment_id));
458            }
459        }
460
461        Ok(ensembles)
462    }
463}
464
465// ── Construction ──────────────────────────────────────────────────────────────
466
467impl BatchRefreshJobCheckpointControl {
468    /// Create from DDL command. Starts in `ConsumingSnapshot`.
469    ///
470    /// Internally calls `render_actors_and_build_job_info()` and injects the
471    /// partial-graph initial barrier.
472    #[expect(clippy::too_many_arguments)]
473    pub(crate) fn new(
474        database_id: DatabaseId,
475        job_id: JobId,
476        create_info: CreateSnapshotBackfillJobCommandInfo,
477        notifiers: Vec<Notifier>,
478        snapshot_backfill_upstream_tables: HashSet<TableId>,
479        snapshot_epoch: u64,
480        version_stat: &HummockVersionStats,
481        partial_graph_manager: &mut PartialGraphManager,
482        logical: &BatchRefreshLogicalFragments,
483        worker_nodes: &HashMap<WorkerId, WorkerNode>,
484        batch_refresh_seconds: u64,
485    ) -> MetaResult<Self> {
486        debug!(
487            %job_id,
488            "new batch refresh job"
489        );
490
491        let partial_graph_id = to_partial_graph_id(database_id, Some(job_id));
492        let backfill_ordering = &create_info.info.fragment_backfill_ordering;
493        let actor_id_generator = partial_graph_manager
494            .control_stream_manager()
495            .env
496            .actor_id_generator();
497
498        let render_result = Self::render_actors_and_build_job_info(
499            &logical.fragments,
500            &logical.downstreams,
501            &create_info.info.definition,
502            actor_id_generator,
503            worker_nodes,
504            &create_info.info.database_resource_group,
505            &create_info.info.streaming_job_model,
506            partial_graph_id,
507        )?;
508        let initial_partial_graph_mutation =
509            Self::build_initial_partial_graph_mutation(&render_result, backfill_ordering);
510
511        let backfill_order_state = BackfillOrderState::new(
512            backfill_ordering,
513            &render_result.fragment_infos,
514            create_info
515                .info
516                .locality_fragment_state_table_mapping
517                .clone(),
518        );
519        let create_mview_tracker = CreateMviewProgressTracker::recover(
520            job_id,
521            &render_result.fragment_infos,
522            backfill_order_state,
523            version_stat,
524        );
525
526        let mut prev_epoch_fake_physical_time = 0;
527        let mut pending_non_checkpoint_barriers = vec![];
528
529        let initial_barrier_info = super::new_fake_barrier(
530            &mut prev_epoch_fake_physical_time,
531            &mut pending_non_checkpoint_barriers,
532            PbBarrierKind::Checkpoint,
533        );
534
535        let mut graph_adder = partial_graph_manager.add_partial_graph(
536            partial_graph_id,
537            BatchRefreshBarrierStats::new(job_id, snapshot_epoch),
538        );
539
540        if let Err(e) = Self::inject_barrier(
541            partial_graph_id,
542            graph_adder.manager(),
543            &render_result.node_actors,
544            &render_result.state_table_ids,
545            initial_barrier_info,
546            Some(render_result.actors_to_create),
547            Some(initial_partial_graph_mutation),
548            notifiers,
549            Some(create_info),
550            false,
551        ) {
552            graph_adder.failed();
553            return Err(e);
554        }
555
556        graph_adder.added();
557        assert!(pending_non_checkpoint_barriers.is_empty());
558        let this = Self {
559            partial_graph_id,
560            job_id,
561            snapshot_backfill_upstream_tables,
562            snapshot_epoch,
563            batch_refresh_seconds,
564
565            status: BatchRefreshJobStatus::ConsumingSnapshot {
566                prev_epoch_fake_physical_time,
567                version_stats: version_stat.clone(),
568                create_mview_tracker,
569                snapshot_epoch,
570                fragment_infos: render_result.fragment_infos,
571                pending_non_checkpoint_barriers,
572                node_actors: render_result.node_actors,
573                state_table_ids: render_result.state_table_ids,
574            },
575        };
576        Ok(this)
577    }
578
579    /// Recover from a persistent state during recovery.
580    ///
581    /// - If `committed_epoch >= snapshot_epoch` → Idle (snapshot completed before crash).
582    /// - If `committed_epoch < snapshot_epoch` → `ConsumingSnapshot` using pre-rendered actors.
583    #[expect(clippy::too_many_arguments)]
584    pub(crate) fn recover(
585        database_id: DatabaseId,
586        job_id: JobId,
587        snapshot_backfill_upstream_tables: HashSet<TableId>,
588        snapshot_epoch: u64,
589        committed_epoch: u64,
590        backfill_order: ExtendedFragmentBackfillOrder,
591        version_stat: &HummockVersionStats,
592        initial_mutation: Mutation,
593        render_result: BatchRefreshRenderResult,
594        partial_graph_recoverer: &mut crate::barrier::partial_graph::PartialGraphRecoverer<'_>,
595        batch_refresh_seconds: u64,
596    ) -> MetaResult<Self> {
597        let partial_graph_id = to_partial_graph_id(database_id, Some(job_id));
598
599        if committed_epoch >= snapshot_epoch {
600            // Snapshot completed; recover to Idle.
601            info!(
602                %job_id,
603                committed_epoch,
604                snapshot_epoch,
605                "recovered idle batch refresh job (no partial graph)"
606            );
607            return Ok(Self {
608                job_id,
609                partial_graph_id,
610                snapshot_backfill_upstream_tables,
611                snapshot_epoch,
612                batch_refresh_seconds,
613
614                status: BatchRefreshJobStatus::Idle {
615                    last_committed_epoch: committed_epoch,
616                },
617            });
618        }
619
620        // Snapshot still in-progress; recover to ConsumingSnapshot.
621        info!(
622            %job_id,
623            committed_epoch,
624            snapshot_epoch,
625            "recovered batch refresh job to consuming snapshot"
626        );
627
628        let mut prev_epoch_fake_physical_time = Epoch(committed_epoch).physical_time();
629        let mut pending_non_checkpoint_barriers = vec![];
630
631        let locality_fragment_state_table_mapping =
632            crate::barrier::rpc::build_locality_fragment_state_table_mapping(
633                &render_result.fragment_infos,
634            );
635        let backfill_order_state = BackfillOrderState::recover_from_fragment_infos(
636            &backfill_order,
637            &render_result.fragment_infos,
638            locality_fragment_state_table_mapping,
639        );
640
641        let create_mview_tracker = CreateMviewProgressTracker::recover(
642            job_id,
643            &render_result.fragment_infos,
644            backfill_order_state,
645            version_stat,
646        );
647
648        let first_barrier_info = super::new_fake_barrier(
649            &mut prev_epoch_fake_physical_time,
650            &mut pending_non_checkpoint_barriers,
651            PbBarrierKind::Initial,
652        );
653
654        partial_graph_recoverer.recover_graph(
655            partial_graph_id,
656            initial_mutation,
657            &first_barrier_info,
658            &render_result.node_actors,
659            render_result.state_table_ids.iter().copied(),
660            render_result.actors_to_create,
661            BatchRefreshBarrierStats::new(job_id, snapshot_epoch),
662        )?;
663
664        Ok(Self {
665            job_id,
666            partial_graph_id,
667            snapshot_backfill_upstream_tables,
668            snapshot_epoch,
669            batch_refresh_seconds,
670            status: BatchRefreshJobStatus::ConsumingSnapshot {
671                prev_epoch_fake_physical_time,
672                version_stats: version_stat.clone(),
673                create_mview_tracker,
674                fragment_infos: render_result.fragment_infos,
675                snapshot_epoch,
676                pending_non_checkpoint_barriers,
677                node_actors: render_result.node_actors,
678                state_table_ids: render_result.state_table_ids,
679            },
680        })
681    }
682}
683
684// ── Barrier injection ─────────────────────────────────────────────────────────
685
686impl BatchRefreshJobCheckpointControl {
687    fn inject_barrier(
688        partial_graph_id: PartialGraphId,
689        partial_graph_manager: &mut PartialGraphManager,
690        node_actors: &HashMap<WorkerId, HashSet<ActorId>>,
691        state_table_ids: &HashSet<TableId>,
692        barrier_info: BarrierInfo,
693        new_actors: Option<StreamJobActorsToCreate>,
694        mutation: Option<Mutation>,
695        notifiers: Vec<Notifier>,
696        first_create_info: Option<CreateSnapshotBackfillJobCommandInfo>,
697        is_stop: bool,
698    ) -> MetaResult<()> {
699        if is_stop {
700            assert!(
701                matches!(&mutation, Some(Mutation::Stop(_))),
702                "stop barrier must carry a Stop mutation"
703            );
704        }
705        partial_graph_manager.inject_barrier(
706            partial_graph_id,
707            mutation,
708            node_actors,
709            state_table_ids.iter().copied(),
710            if is_stop {
711                // Stop barrier: data already synced by the prior checkpoint.
712                itertools::Either::Left(std::iter::empty())
713            } else {
714                itertools::Either::Right(node_actors.keys().copied())
715            },
716            new_actors,
717            PartialGraphBarrierInfo::new(
718                first_create_info.map_or_else(
719                    PostCollectCommand::barrier,
720                    CreateSnapshotBackfillJobCommandInfo::into_post_collect,
721                ),
722                barrier_info,
723                notifiers,
724                state_table_ids.clone(),
725            ),
726        )?;
727        Ok(())
728    }
729}
730
731// ── Barrier forwarding and collection ─────────────────────────────────────────
732
733impl BatchRefreshJobCheckpointControl {
734    pub(crate) fn on_new_upstream_barrier(
735        &mut self,
736        partial_graph_manager: &mut PartialGraphManager,
737        barrier_info: &BarrierInfo,
738        mutation: Option<(Mutation, Vec<Notifier>)>,
739    ) -> MetaResult<()> {
740        if !matches!(self.status, BatchRefreshJobStatus::ConsumingSnapshot { .. }) {
741            // ConsumingLogStore has all barriers pre-injected; no forwarding needed.
742            // Idle and Resetting have no partial graph.
743            return Ok(());
744        }
745        let (mut mutation, mut notifiers) = match mutation {
746            Some((mutation, notifiers)) => (Some(mutation), notifiers),
747            None => (None, vec![]),
748        };
749
750        // Check if snapshot consumption is finished and we need to inject stop barriers.
751        let is_finished = matches!(
752            &self.status,
753            BatchRefreshJobStatus::ConsumingSnapshot { create_mview_tracker, .. }
754            if create_mview_tracker.is_finished()
755        );
756
757        if is_finished {
758            // Discard the upstream mutation — not needed for stop barriers.
759            mutation.take();
760
761            // Take the status out to destructure and transition to `FinishingSnapshot`.
762            // Use a placeholder; will be overwritten below.
763            let old_status = replace(
764                &mut self.status,
765                BatchRefreshJobStatus::Idle {
766                    last_committed_epoch: 0,
767                },
768            );
769            let BatchRefreshJobStatus::ConsumingSnapshot {
770                prev_epoch_fake_physical_time,
771                mut pending_non_checkpoint_barriers,
772                snapshot_epoch,
773                fragment_infos,
774                create_mview_tracker,
775                node_actors,
776                state_table_ids,
777                ..
778            } = old_status
779            else {
780                unreachable!()
781            };
782
783            let tracking_job = create_mview_tracker.into_tracking_job();
784
785            // Inject final checkpoint at snapshot epoch.
786            pending_non_checkpoint_barriers.push(snapshot_epoch);
787            let prev_epoch = Epoch::from_physical_time(prev_epoch_fake_physical_time);
788            let final_checkpoint = BarrierInfo {
789                curr_epoch: TracedEpoch::new(Epoch(snapshot_epoch)),
790                prev_epoch: TracedEpoch::new(prev_epoch),
791                kind: BarrierKind::Checkpoint(take(&mut pending_non_checkpoint_barriers)),
792            };
793
794            // Inject stop barrier with u64::MAX as curr_epoch and empty nodes_to_sync_table.
795            let stop_barrier = BarrierInfo {
796                prev_epoch: TracedEpoch::new(Epoch(snapshot_epoch)),
797                curr_epoch: TracedEpoch::new(Epoch(u64::MAX)),
798                kind: BarrierKind::Checkpoint(vec![snapshot_epoch]),
799            };
800
801            let stop_actors: Vec<ActorId> = fragment_infos
802                .values()
803                .flat_map(|f| f.actors.keys().copied())
804                .collect();
805
806            Self::inject_barrier(
807                self.partial_graph_id,
808                partial_graph_manager,
809                &node_actors,
810                &state_table_ids,
811                final_checkpoint,
812                None,
813                None,
814                take(&mut notifiers),
815                None,
816                false,
817            )?;
818            Self::inject_barrier(
819                self.partial_graph_id,
820                partial_graph_manager,
821                &node_actors,
822                &state_table_ids,
823                stop_barrier,
824                None,
825                Some(Mutation::Stop(StopMutation {
826                    actors: stop_actors,
827                    dropped_sink_fragments: vec![],
828                })),
829                vec![],
830                None,
831                true,
832            )?;
833
834            self.status = BatchRefreshJobStatus::FinishingSnapshot {
835                tracking_job: Some(tracking_job),
836                fragment_infos,
837            };
838        } else {
839            // Normal barrier — still consuming snapshot.
840            let BatchRefreshJobStatus::ConsumingSnapshot {
841                prev_epoch_fake_physical_time,
842                pending_non_checkpoint_barriers,
843                create_mview_tracker,
844                node_actors,
845                state_table_ids,
846                ..
847            } = &mut self.status
848            else {
849                unreachable!("is_finished was false, status must be ConsumingSnapshot")
850            };
851
852            // Forward a fake barrier to the partial graph.
853            let mutation = mutation.take().or_else(|| {
854                let pending_backfill_nodes = create_mview_tracker
855                    .take_pending_backfill_nodes()
856                    .collect_vec();
857                if pending_backfill_nodes.is_empty() {
858                    None
859                } else {
860                    Some(Mutation::StartFragmentBackfill(
861                        StartFragmentBackfillMutation {
862                            fragment_ids: pending_backfill_nodes,
863                        },
864                    ))
865                }
866            });
867            let barrier_to_inject = super::new_fake_barrier(
868                prev_epoch_fake_physical_time,
869                pending_non_checkpoint_barriers,
870                match barrier_info.kind {
871                    BarrierKind::Barrier => PbBarrierKind::Barrier,
872                    BarrierKind::Checkpoint(_) => PbBarrierKind::Checkpoint,
873                    BarrierKind::Initial => {
874                        unreachable!("upstream new epoch should not be initial")
875                    }
876                },
877            );
878            Self::inject_barrier(
879                self.partial_graph_id,
880                partial_graph_manager,
881                node_actors,
882                state_table_ids,
883                barrier_to_inject,
884                None,
885                mutation,
886                take(&mut notifiers),
887                None,
888                false,
889            )?;
890        }
891        assert!(mutation.is_none(), "must have consumed mutation");
892        assert!(notifiers.is_empty(), "must consumed notifiers");
893        Ok(())
894    }
895
896    pub(crate) fn collect(&mut self, collected_barrier: CollectedBarrier<'_>) -> bool {
897        match &mut self.status {
898            BatchRefreshJobStatus::ConsumingSnapshot {
899                create_mview_tracker,
900                version_stats,
901                ..
902            } => {
903                for progress in collected_barrier
904                    .resps
905                    .values()
906                    .flat_map(|resp| &resp.create_mview_progress)
907                {
908                    create_mview_tracker.apply_progress(progress, version_stats);
909                }
910                create_mview_tracker.is_finished()
911            }
912            BatchRefreshJobStatus::InitializingBatchRefresh { .. }
913            | BatchRefreshJobStatus::ConsumingLogStore { .. } => {
914                // All barriers are pre-injected; no progress tracking needed.
915                false
916            }
917            _ => false,
918        }
919    }
920}
921
922// ── Completing ────────────────────────────────────────────────────────────────
923
924impl BatchRefreshJobCheckpointControl {
925    #[expect(clippy::type_complexity)]
926    pub(crate) fn start_completing(
927        &mut self,
928        partial_graph_manager: &mut PartialGraphManager,
929    ) -> Option<(
930        u64,
931        HashMap<WorkerId, BarrierCompleteResponse>,
932        PartialGraphBarrierInfo,
933        Option<TrackingJob>,
934    )> {
935        match &self.status {
936            BatchRefreshJobStatus::ConsumingSnapshot { .. }
937            | BatchRefreshJobStatus::FinishingSnapshot { .. }
938            | BatchRefreshJobStatus::ConsumingLogStore { .. } => {}
939            BatchRefreshJobStatus::Idle { .. }
940            | BatchRefreshJobStatus::InitializingBatchRefresh { .. }
941            | BatchRefreshJobStatus::Resetting { .. } => {
942                return None;
943            }
944        };
945
946        partial_graph_manager
947            .start_completing(
948                self.partial_graph_id,
949                std::ops::Bound::Unbounded,
950                |_non_checkpoint_epoch, _resps, _| {
951                    // Progress already applied in `collect()`.
952                },
953            )
954            .map(|(epoch, resps, info)| {
955                // Take tracking job only when the snapshot stop barrier completes
956                // (i.e., we are in FinishingSnapshot and the epoch matches snapshot_epoch).
957                // Note: ConsumingLogStore's stop barrier also has prev_epoch == target_upstream_epoch,
958                // which may coincidentally equal snapshot_epoch if no new upstream commits occurred.
959                // We must check the status, not just the epoch, to avoid a false positive.
960                let tracking_job = match &mut self.status {
961                    BatchRefreshJobStatus::FinishingSnapshot { tracking_job, .. }
962                        if epoch == self.snapshot_epoch =>
963                    {
964                        Some(
965                            tracking_job
966                                .take()
967                                .expect("tracking job should not have been taken yet"),
968                        )
969                    }
970                    _ => None,
971                };
972                (epoch, resps, info, tracking_job)
973            })
974    }
975
976    pub(super) fn ack_completed(
977        &mut self,
978        partial_graph_manager: &mut PartialGraphManager,
979        completed_epoch: u64,
980    ) {
981        match &self.status {
982            BatchRefreshJobStatus::ConsumingSnapshot { .. } => {
983                partial_graph_manager.ack_completed(self.partial_graph_id, completed_epoch);
984            }
985            BatchRefreshJobStatus::FinishingSnapshot { tracking_job, .. }
986                if completed_epoch == self.snapshot_epoch =>
987            {
988                partial_graph_manager.ack_completed(self.partial_graph_id, completed_epoch);
989                assert!(
990                    tracking_job.is_none(),
991                    "tracking job should have been taken at start_completing"
992                );
993                info!(
994                    job_id = %self.job_id,
995                    completed_epoch,
996                    "batch refresh job: snapshot done, transitioned to idle, removing partial graph"
997                );
998                partial_graph_manager.remove_partial_graphs(vec![self.partial_graph_id]);
999                self.status = BatchRefreshJobStatus::Idle {
1000                    last_committed_epoch: completed_epoch,
1001                };
1002            }
1003            BatchRefreshJobStatus::FinishingSnapshot { .. } => {
1004                partial_graph_manager.ack_completed(self.partial_graph_id, completed_epoch);
1005            }
1006            BatchRefreshJobStatus::ConsumingLogStore {
1007                target_upstream_epoch,
1008                ..
1009            } if completed_epoch == *target_upstream_epoch => {
1010                let target = *target_upstream_epoch;
1011                partial_graph_manager.ack_completed(self.partial_graph_id, completed_epoch);
1012                info!(
1013                    job_id = %self.job_id,
1014                    completed_epoch,
1015                    target_upstream_epoch = target,
1016                    "batch refresh job: logstore done, transitioned to idle, removing partial graph"
1017                );
1018                partial_graph_manager.remove_partial_graphs(vec![self.partial_graph_id]);
1019                self.status = BatchRefreshJobStatus::Idle {
1020                    last_committed_epoch: target,
1021                };
1022            }
1023            BatchRefreshJobStatus::ConsumingLogStore { .. } => {
1024                partial_graph_manager.ack_completed(self.partial_graph_id, completed_epoch);
1025            }
1026            BatchRefreshJobStatus::Resetting { .. } => {
1027                // The job was dropped while the completing task was running in the background.
1028                // The partial graph has already been reset, so skip the ack.
1029            }
1030            BatchRefreshJobStatus::Idle { .. }
1031            | BatchRefreshJobStatus::InitializingBatchRefresh { .. } => {
1032                unreachable!("batch refresh job should not be completing in this state")
1033            }
1034        }
1035    }
1036
1037    /// Called when the partial graph reset is confirmed (drop only).
1038    pub(super) fn on_partial_graph_reset(mut self) {
1039        match &mut self.status {
1040            BatchRefreshJobStatus::Resetting { notifiers } => {
1041                for notifier in notifiers.drain(..) {
1042                    notifier.notify_collected();
1043                }
1044            }
1045            _ => {
1046                panic!(
1047                    "batch refresh job {}: on_partial_graph_reset in unexpected state {:?}",
1048                    self.job_id, self.status
1049                );
1050            }
1051        }
1052    }
1053}
1054
1055// ── Query methods ─────────────────────────────────────────────────────────────
1056
1057impl BatchRefreshJobCheckpointControl {
1058    pub(crate) fn gen_backfill_progress(&self) -> Option<BackfillProgress> {
1059        match &self.status {
1060            BatchRefreshJobStatus::ConsumingSnapshot {
1061                create_mview_tracker,
1062                ..
1063            } => {
1064                let progress = if create_mview_tracker.is_finished() {
1065                    "Snapshot finished".to_owned()
1066                } else {
1067                    let progress = create_mview_tracker.gen_backfill_progress();
1068                    format!("BatchRefresh Snapshot [{}]", progress)
1069                };
1070                Some(BackfillProgress {
1071                    progress,
1072                    backfill_type: PbBackfillType::SnapshotBackfill,
1073                })
1074            }
1075            BatchRefreshJobStatus::FinishingSnapshot { .. } => Some(BackfillProgress {
1076                progress: "BatchRefresh Stopping".to_owned(),
1077                backfill_type: PbBackfillType::SnapshotBackfill,
1078            }),
1079            BatchRefreshJobStatus::InitializingBatchRefresh { .. }
1080            | BatchRefreshJobStatus::ConsumingLogStore { .. } => Some(BackfillProgress {
1081                progress: "BatchRefresh LogStore".to_owned(),
1082                backfill_type: PbBackfillType::SnapshotBackfill,
1083            }),
1084            BatchRefreshJobStatus::Idle { .. } | BatchRefreshJobStatus::Resetting { .. } => None,
1085        }
1086    }
1087
1088    pub(super) fn gen_fragment_backfill_progress(&self) -> Vec<FragmentBackfillProgress> {
1089        match &self.status {
1090            BatchRefreshJobStatus::ConsumingSnapshot {
1091                create_mview_tracker,
1092                fragment_infos,
1093                ..
1094            } => create_mview_tracker.collect_fragment_progress(fragment_infos, true),
1095            BatchRefreshJobStatus::FinishingSnapshot { fragment_infos, .. } => {
1096                collect_done_fragments(self.job_id, fragment_infos)
1097            }
1098            _ => vec![],
1099        }
1100    }
1101
1102    /// Returns the pinned upstream log epoch and upstream table IDs.
1103    pub(super) fn pinned_upstream_log_epoch(&self) -> (u64, HashSet<TableId>) {
1104        match &self.status {
1105            BatchRefreshJobStatus::ConsumingSnapshot { .. }
1106            | BatchRefreshJobStatus::FinishingSnapshot { .. } => (
1107                self.snapshot_epoch,
1108                self.snapshot_backfill_upstream_tables.clone(),
1109            ),
1110            BatchRefreshJobStatus::ConsumingLogStore {
1111                logstore_start_epoch,
1112                ..
1113            }
1114            | BatchRefreshJobStatus::InitializingBatchRefresh {
1115                logstore_start_epoch,
1116                ..
1117            } => (
1118                *logstore_start_epoch,
1119                self.snapshot_backfill_upstream_tables.clone(),
1120            ),
1121            BatchRefreshJobStatus::Idle {
1122                last_committed_epoch,
1123            } => (
1124                *last_committed_epoch,
1125                self.snapshot_backfill_upstream_tables.clone(),
1126            ),
1127            BatchRefreshJobStatus::Resetting { .. } => (0, HashSet::new()),
1128        }
1129    }
1130
1131    pub(crate) fn fragment_infos(&self) -> Option<&HashMap<FragmentId, InflightFragmentInfo>> {
1132        match &self.status {
1133            BatchRefreshJobStatus::ConsumingSnapshot { fragment_infos, .. } => Some(fragment_infos),
1134            BatchRefreshJobStatus::InitializingBatchRefresh { fragment_infos, .. } => {
1135                Some(fragment_infos)
1136            }
1137            BatchRefreshJobStatus::ConsumingLogStore { fragment_infos, .. } => Some(fragment_infos),
1138            BatchRefreshJobStatus::FinishingSnapshot { .. }
1139            | BatchRefreshJobStatus::Idle { .. }
1140            | BatchRefreshJobStatus::Resetting { .. } => None,
1141        }
1142    }
1143
1144    pub(crate) fn is_snapshot_backfilling(&self) -> bool {
1145        matches!(
1146            self.status,
1147            BatchRefreshJobStatus::ConsumingSnapshot { .. }
1148                | BatchRefreshJobStatus::FinishingSnapshot { .. }
1149                | BatchRefreshJobStatus::InitializingBatchRefresh { .. }
1150                | BatchRefreshJobStatus::ConsumingLogStore { .. }
1151        )
1152    }
1153
1154    /// Whether this idle job should start a refresh run.
1155    ///
1156    /// Returns `true` if the job is idle and the upstream committed epoch is
1157    /// far enough ahead of the job's last committed epoch (by `batch_refresh_seconds`).
1158    pub(crate) fn should_start_refresh(&self, upstream_committed_epoch: u64) -> bool {
1159        if let BatchRefreshJobStatus::Idle {
1160            last_committed_epoch,
1161        } = &self.status
1162        {
1163            let job_physical_ms = Epoch(*last_committed_epoch).physical_time();
1164            let upstream_physical_ms = Epoch(upstream_committed_epoch).physical_time();
1165            let threshold_ms = self.batch_refresh_seconds * 1000;
1166            upstream_physical_ms.saturating_sub(job_physical_ms) >= threshold_ms
1167        } else {
1168            false
1169        }
1170    }
1171
1172    /// Returns the last committed epoch if the job is idle.
1173    pub(crate) fn last_committed_epoch(&self) -> Option<u64> {
1174        if let BatchRefreshJobStatus::Idle {
1175            last_committed_epoch,
1176        } = &self.status
1177        {
1178            Some(*last_committed_epoch)
1179        } else {
1180            None
1181        }
1182    }
1183}
1184
1185// ── Logstore refresh run ──────────────────────────────────────────────────────
1186
1187impl BatchRefreshJobCheckpointControl {
1188    /// Start a logstore consumption run.
1189    ///
1190    /// Preconditions: the job must be `Idle`.
1191    ///
1192    /// 1. Resolves log epochs from the hummock changelog
1193    /// 2. Re-renders actors using the cached context
1194    /// 3. Injects all barriers at once (first with `AddMutation`, last with `StopMutation`)
1195    /// 4. Transitions to `ConsumingLogStore`
1196    ///
1197    /// Returns `true` if a refresh run was started, `false` if there are no
1198    /// log epochs to consume (early return, stays idle).
1199    pub(crate) fn start_refresh_run(
1200        &mut self,
1201        context: &BatchRefreshJobTriggerContext,
1202        worker_nodes: &HashMap<WorkerId, WorkerNode>,
1203        actor_id_counter: &AtomicU32,
1204        partial_graph_manager: &mut PartialGraphManager,
1205    ) -> MetaResult<bool> {
1206        let last_committed_epoch = match &self.status {
1207            BatchRefreshJobStatus::Idle {
1208                last_committed_epoch,
1209            } => *last_committed_epoch,
1210            _ => panic!(
1211                "batch refresh job {}: start_refresh_run called in non-Idle state {:?}",
1212                self.job_id, self.status
1213            ),
1214        };
1215
1216        // Resolve log epochs into barrier infos.
1217        let target_upstream_epoch = context.target_upstream_epoch;
1218        let Some((first_epoch, pending_log_barriers)) = Self::resolve_log_epoch_barriers(
1219            &self.snapshot_backfill_upstream_tables,
1220            &context.upstream_table_log_epochs,
1221            last_committed_epoch,
1222        )?
1223        else {
1224            info!(
1225                job_id = %self.job_id,
1226                last_committed_epoch,
1227                target_upstream_epoch,
1228                "batch refresh job: no log epochs to consume, staying idle"
1229            );
1230            return Ok(false);
1231        };
1232
1233        let log_target_epoch = pending_log_barriers.last().expect("non-empty").prev_epoch();
1234        if target_upstream_epoch != log_target_epoch {
1235            info!(
1236                job_id = %self.job_id,
1237                last_committed_epoch,
1238                target_upstream_epoch,
1239                log_target_epoch,
1240                "batch refresh job: upstream target has no resolved changelog yet, staying idle"
1241            );
1242            return Ok(false);
1243        }
1244
1245        // Build logical fragments from cached context.
1246        let logical = BatchRefreshLogicalFragments::from_context(context);
1247
1248        // Re-render actors.
1249        let render_result = Self::render_actors_and_build_job_info(
1250            &logical.fragments,
1251            &logical.downstreams,
1252            &context.definition,
1253            actor_id_counter,
1254            worker_nodes,
1255            &context.database_resource_group,
1256            &context.streaming_job_model,
1257            self.partial_graph_id,
1258        )?;
1259
1260        // Build actors_to_create and initial mutation.
1261        let added_actors: Vec<ActorId> = render_result
1262            .fragment_infos
1263            .values()
1264            .flat_map(|fragment| fragment.actors.keys().copied())
1265            .collect();
1266
1267        let initial_mutation = Mutation::Add(AddMutation {
1268            actor_dispatchers: Default::default(),
1269            added_actors,
1270            actor_splits: Default::default(),
1271            pause: false,
1272            subscriptions_to_add: Default::default(),
1273            backfill_nodes_to_pause: Default::default(),
1274            actor_cdc_table_snapshot_splits: None,
1275            new_upstream_sinks: Default::default(),
1276            dropped_actors: Default::default(),
1277            sink_log_store_flush: Default::default(),
1278        });
1279
1280        let node_actors = &render_result.node_actors;
1281        let state_table_ids = &render_result.state_table_ids;
1282        let initial_barrier = BarrierInfo {
1283            prev_epoch: TracedEpoch::new(Epoch(last_committed_epoch)),
1284            curr_epoch: TracedEpoch::new(Epoch(first_epoch)),
1285            kind: BarrierKind::Initial,
1286        };
1287        let mut partial_graph_recoverer = partial_graph_manager.start_recover();
1288        let recover_result = partial_graph_recoverer.recover_graph(
1289            self.partial_graph_id,
1290            initial_mutation,
1291            &initial_barrier,
1292            node_actors,
1293            state_table_ids.iter().copied(),
1294            render_result.actors_to_create,
1295            BatchRefreshBarrierStats::new(self.job_id, self.snapshot_epoch),
1296        );
1297        match recover_result {
1298            Ok(()) => {
1299                let initializing_partial_graphs = partial_graph_recoverer.all_initializing();
1300                debug_assert_eq!(initializing_partial_graphs.len(), 1);
1301                debug_assert!(initializing_partial_graphs.contains(&self.partial_graph_id));
1302            }
1303            Err(e) => {
1304                partial_graph_recoverer.failed();
1305                return Err(e);
1306            }
1307        }
1308
1309        let logstore_start_epoch = last_committed_epoch;
1310
1311        info!(
1312            job_id = %self.job_id,
1313            last_committed_epoch,
1314            target_upstream_epoch,
1315            num_log_barriers = pending_log_barriers.len(),
1316            "batch refresh job: initialized logstore consumption partial graph"
1317        );
1318
1319        self.status = BatchRefreshJobStatus::InitializingBatchRefresh {
1320            fragment_infos: render_result.fragment_infos,
1321            node_actors: render_result.node_actors,
1322            state_table_ids: render_result.state_table_ids,
1323            pending_log_barriers,
1324            logstore_start_epoch,
1325            target_upstream_epoch,
1326        };
1327
1328        Ok(true)
1329    }
1330
1331    pub(crate) fn on_log_store_initialized(
1332        &mut self,
1333        partial_graph_manager: &mut PartialGraphManager,
1334    ) -> MetaResult<()> {
1335        let old_status = replace(
1336            &mut self.status,
1337            BatchRefreshJobStatus::Idle {
1338                last_committed_epoch: 0,
1339            },
1340        );
1341        let BatchRefreshJobStatus::InitializingBatchRefresh {
1342            fragment_infos,
1343            node_actors,
1344            state_table_ids,
1345            pending_log_barriers,
1346            logstore_start_epoch,
1347            target_upstream_epoch,
1348        } = old_status
1349        else {
1350            panic!(
1351                "batch refresh job {}: logstore initialized in unexpected status {:?}",
1352                self.job_id, old_status
1353            );
1354        };
1355
1356        let final_barrier_idx = pending_log_barriers.len() - 1;
1357        let mut stop_mutation = Some(Mutation::Stop(StopMutation {
1358            actors: fragment_infos
1359                .values()
1360                .flat_map(|fragment| fragment.actors.keys().copied())
1361                .collect(),
1362            dropped_sink_fragments: vec![],
1363        }));
1364        for (idx, barrier) in pending_log_barriers.into_iter().enumerate() {
1365            let is_stop_barrier = idx == final_barrier_idx;
1366            let mutation = is_stop_barrier.then(|| stop_mutation.take().expect("unused"));
1367            Self::inject_barrier(
1368                self.partial_graph_id,
1369                partial_graph_manager,
1370                &node_actors,
1371                &state_table_ids,
1372                barrier,
1373                None,
1374                mutation,
1375                vec![],
1376                None,
1377                is_stop_barrier,
1378            )?;
1379        }
1380
1381        self.status = BatchRefreshJobStatus::ConsumingLogStore {
1382            fragment_infos,
1383            logstore_start_epoch,
1384            target_upstream_epoch,
1385        };
1386        Ok(())
1387    }
1388
1389    /// Resolve upstream log epochs from the hummock changelog into barrier infos.
1390    ///
1391    /// Returns `(first_epoch, log_barriers)`. `first_epoch` is consumed by the
1392    /// initial barrier. `log_barriers` contains all barriers to inject after
1393    /// initialization, ending with the final checkpoint stop barrier.
1394    fn resolve_log_epoch_barriers(
1395        snapshot_backfill_upstream_tables: &HashSet<TableId>,
1396        upstream_table_log_epochs: &HashMap<TableId, Vec<(Vec<u64>, u64)>>,
1397        exclusive_start_log_epoch: u64,
1398    ) -> MetaResult<Option<(u64, Vec<BarrierInfo>)>> {
1399        let table_id = snapshot_backfill_upstream_tables
1400            .iter()
1401            .next()
1402            .expect("snapshot backfill job should have upstream");
1403        let Some(epochs) = upstream_table_log_epochs.get(table_id) else {
1404            return Ok(None);
1405        };
1406
1407        // Find the starting point: skip entries up to and including exclusive_start_log_epoch.
1408        let mut epochs_iter = epochs.iter().peekable();
1409        loop {
1410            match epochs_iter.peek() {
1411                Some((_, checkpoint_epoch)) if *checkpoint_epoch <= exclusive_start_log_epoch => {
1412                    epochs_iter.next();
1413                }
1414                _ => break,
1415            }
1416        }
1417
1418        let mut epoch_infos = vec![];
1419        for (non_checkpoint_epochs, checkpoint_epoch) in epochs_iter {
1420            epoch_infos.extend(
1421                non_checkpoint_epochs
1422                    .iter()
1423                    .copied()
1424                    .map(|epoch| (epoch, false)),
1425            );
1426            epoch_infos.push((*checkpoint_epoch, true));
1427        }
1428        if epoch_infos.is_empty() {
1429            return Ok(None);
1430        }
1431
1432        let first_epoch = epoch_infos[0].0;
1433        let mut pending_non_checkpoint_epochs = vec![];
1434        let mut replay_barriers = vec![];
1435        for window in epoch_infos.windows(2) {
1436            let (prev_epoch, is_checkpoint) = window[0];
1437            let curr_epoch = window[1].0;
1438            assert!(prev_epoch > exclusive_start_log_epoch);
1439            assert!(curr_epoch > prev_epoch);
1440            pending_non_checkpoint_epochs.push(prev_epoch);
1441            let kind = if is_checkpoint {
1442                BarrierKind::Checkpoint(take(&mut pending_non_checkpoint_epochs))
1443            } else {
1444                BarrierKind::Barrier
1445            };
1446            replay_barriers.push(BarrierInfo {
1447                prev_epoch: TracedEpoch::new(Epoch(prev_epoch)),
1448                curr_epoch: TracedEpoch::new(Epoch(curr_epoch)),
1449                kind,
1450            });
1451        }
1452
1453        let (last_epoch, _) = *epoch_infos.last().expect("non-empty");
1454        assert!(last_epoch > exclusive_start_log_epoch);
1455        pending_non_checkpoint_epochs.push(last_epoch);
1456        replay_barriers.push(BarrierInfo {
1457            prev_epoch: TracedEpoch::new(Epoch(last_epoch)),
1458            curr_epoch: TracedEpoch::new(Epoch(u64::MAX)),
1459            kind: BarrierKind::Checkpoint(pending_non_checkpoint_epochs),
1460        });
1461
1462        Ok(Some((first_epoch, replay_barriers)))
1463    }
1464}
1465
1466impl BatchRefreshLogicalFragments {
1467    /// Build logical fragments from a trigger context.
1468    pub(crate) fn from_context(ctx: &BatchRefreshJobTriggerContext) -> Self {
1469        Self {
1470            fragments: ctx.fragments.clone(),
1471            downstreams: ctx.downstreams.clone(),
1472        }
1473    }
1474}
1475
1476// ── Drop handling ─────────────────────────────────────────────────────────────
1477
1478impl BatchRefreshJobCheckpointControl {
1479    /// Drop this batch refresh job.
1480    pub(super) fn drop(
1481        &mut self,
1482        notifiers: &mut Vec<Notifier>,
1483        partial_graph_manager: &mut PartialGraphManager,
1484    ) -> bool {
1485        match &mut self.status {
1486            BatchRefreshJobStatus::Resetting {
1487                notifiers: existing_notifiers,
1488                ..
1489            } => {
1490                for notifier in &mut *notifiers {
1491                    notifier.notify_started();
1492                }
1493                existing_notifiers.append(notifiers);
1494                true
1495            }
1496            BatchRefreshJobStatus::ConsumingSnapshot { .. }
1497            | BatchRefreshJobStatus::FinishingSnapshot { .. }
1498            | BatchRefreshJobStatus::InitializingBatchRefresh { .. }
1499            | BatchRefreshJobStatus::ConsumingLogStore { .. } => {
1500                for notifier in &mut *notifiers {
1501                    notifier.notify_started();
1502                }
1503                partial_graph_manager.reset_partial_graphs([self.partial_graph_id]);
1504                self.status = BatchRefreshJobStatus::Resetting {
1505                    notifiers: take(notifiers),
1506                };
1507                true
1508            }
1509            BatchRefreshJobStatus::Idle { .. } => {
1510                // Idle has no running partial graph, but we still go through
1511                // the reset flow so the cleanup path is uniform.
1512                for notifier in &mut *notifiers {
1513                    notifier.notify_started();
1514                }
1515                partial_graph_manager.reset_partial_graphs([self.partial_graph_id]);
1516                self.status = BatchRefreshJobStatus::Resetting {
1517                    notifiers: take(notifiers),
1518                };
1519                true
1520            }
1521        }
1522    }
1523
1524    /// Reset during database recovery.
1525    ///
1526    /// Returns `true` if the partial graph was already resetting (from a prior drop),
1527    /// meaning we should not issue a new reset request.
1528    pub(crate) fn reset(self) -> bool {
1529        match self.status {
1530            BatchRefreshJobStatus::ConsumingSnapshot { .. }
1531            | BatchRefreshJobStatus::FinishingSnapshot { .. }
1532            | BatchRefreshJobStatus::InitializingBatchRefresh { .. }
1533            | BatchRefreshJobStatus::ConsumingLogStore { .. }
1534            | BatchRefreshJobStatus::Idle { .. } => false,
1535            BatchRefreshJobStatus::Resetting { notifiers, .. } => {
1536                for notifier in notifiers {
1537                    notifier.notify_collected();
1538                }
1539                true
1540            }
1541        }
1542    }
1543}
1544
1545// ── Barrier stats ─────────────────────────────────────────────────────────────
1546
/// Metric handles used to report barrier statistics for one batch refresh job.
///
/// Both handles are obtained from `GLOBAL_META_METRICS` in
/// `BatchRefreshBarrierStats::new`, labelled with the job id.
struct BatchRefreshBarrierStats {
    /// Histogram that receives each observed barrier latency, in seconds
    /// (fed by `observe_barrier_latency`).
    barrier_latency: LabelGuardedHistogram,
    /// Gauge set to the current number of in-flight barriers
    /// (fed by `observe_barrier_num`).
    inflight_barrier_num: LabelGuardedIntGauge,
}
1551
1552impl BatchRefreshBarrierStats {
1553    fn new(job_id: JobId, _snapshot_epoch: u64) -> Self {
1554        let table_id_str = format!("{}", job_id);
1555        Self {
1556            barrier_latency: GLOBAL_META_METRICS
1557                .snapshot_backfill_barrier_latency
1558                .with_guarded_label_values(&[table_id_str.as_str(), "batch_refresh_snapshot"]),
1559            inflight_barrier_num: GLOBAL_META_METRICS
1560                .snapshot_backfill_inflight_barrier_num
1561                .with_guarded_label_values(&[&table_id_str]),
1562        }
1563    }
1564}
1565
1566impl PartialGraphStat for BatchRefreshBarrierStats {
1567    fn observe_barrier_latency(&self, _epoch: EpochPair, barrier_latency_secs: f64) {
1568        self.barrier_latency.observe(barrier_latency_secs);
1569    }
1570
1571    fn observe_barrier_num(&self, inflight_barrier_num: usize, _collected_barrier_num: usize) {
1572        self.inflight_barrier_num.set(inflight_barrier_num as _);
1573    }
1574}
risingwave_meta/barrier/checkpoint/independent_job/batch_refresh_job/mod.rs

risingwave_meta/barrier/checkpoint/independent_job/batch_refresh_job/
mod.rs