use std::collections::{HashMap, HashSet, VecDeque};
use std::future::poll_fn;
use std::ops::Range;
use std::sync::{Arc, LazyLock};
use std::task::Poll;
use std::time::{Duration, Instant};

use foyer::{HybridCacheEntry, RangeBoundsExt};
use futures::future::{join_all, try_join_all};
use futures::{Future, FutureExt};
use itertools::Itertools;
use prometheus::core::{AtomicU64, GenericCounter, GenericCounterVec};
use prometheus::{
    Histogram, HistogramVec, IntGauge, Registry, register_histogram_vec_with_registry,
    register_int_counter_vec_with_registry, register_int_gauge_with_registry,
};
use risingwave_common::license::Feature;
use risingwave_common::monitor::GLOBAL_METRICS_REGISTRY;
use risingwave_hummock_sdk::compaction_group::hummock_version_ext::SstDeltaInfo;
use risingwave_hummock_sdk::{HummockSstableObjectId, KeyComparator};
use thiserror_ext::AsReport;
use tokio::sync::Semaphore;
use tokio::task::JoinHandle;

use crate::hummock::local_version::pinned_version::PinnedVersion;
use crate::hummock::{
    Block, HummockError, HummockResult, RecentFilterTrait, Sstable, SstableBlockIndex,
    SstableStoreRef, TableHolder,
};
use crate::monitor::StoreLocalStatistic;
use crate::opts::StorageOpts;

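/// Process-wide cache refill metrics, registered lazily against the global
/// Prometheus registry on first access.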
pub static GLOBAL_CACHE_REFILL_METRICS: LazyLock<CacheRefillMetrics> =
    LazyLock::new(|| CacheRefillMetrics::new(&GLOBAL_METRICS_REGISTRY));

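/// Metrics for the cache refill path. The vector families are labeled with
/// `["type", "op"]` (e.g. `["data", "success"]`); the flat fields below are
/// pre-resolved children of those families so hot paths can update them
/// without a label lookup.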
pub struct CacheRefillMetrics {
    pub refill_duration: HistogramVec,
    pub refill_total: GenericCounterVec<AtomicU64>,
    pub refill_bytes: GenericCounterVec<AtomicU64>,

    pub data_refill_success_duration: Histogram,
    pub meta_refill_success_duration: Histogram,

    pub data_refill_filtered_total: GenericCounter<AtomicU64>,
    pub data_refill_attempts_total: GenericCounter<AtomicU64>,
    pub data_refill_started_total: GenericCounter<AtomicU64>,
    pub meta_refill_attempts_total: GenericCounter<AtomicU64>,

    pub data_refill_parent_meta_lookup_hit_total: GenericCounter<AtomicU64>,
    pub data_refill_parent_meta_lookup_miss_total: GenericCounter<AtomicU64>,
    pub data_refill_unit_inheritance_hit_total: GenericCounter<AtomicU64>,
    pub data_refill_unit_inheritance_miss_total: GenericCounter<AtomicU64>,

    pub data_refill_block_unfiltered_total: GenericCounter<AtomicU64>,
    pub data_refill_block_success_total: GenericCounter<AtomicU64>,

    pub data_refill_ideal_bytes: GenericCounter<AtomicU64>,
    pub data_refill_success_bytes: GenericCounter<AtomicU64>,

    pub refill_queue_total: IntGauge,
}

impl CacheRefillMetrics {
    pub fn new(registry: &Registry) -> Self {
        let refill_duration = register_histogram_vec_with_registry!(
            "refill_duration",
            "refill duration",
            &["type", "op"],
            registry,
        )
        .unwrap();
        let refill_total = register_int_counter_vec_with_registry!(
            "refill_total",
            "refill total",
            &["type", "op"],
            registry,
        )
        .unwrap();
        let refill_bytes = register_int_counter_vec_with_registry!(
            "refill_bytes",
            "refill bytes",
            &["type", "op"],
            registry,
        )
        .unwrap();

        let data_refill_success_duration = refill_duration
            .get_metric_with_label_values(&["data", "success"])
            .unwrap();
        let meta_refill_success_duration = refill_duration
            .get_metric_with_label_values(&["meta", "success"])
            .unwrap();

        let data_refill_filtered_total = refill_total
            .get_metric_with_label_values(&["data", "filtered"])
            .unwrap();
        let data_refill_attempts_total = refill_total
            .get_metric_with_label_values(&["data", "attempts"])
            .unwrap();
        let data_refill_started_total = refill_total
            .get_metric_with_label_values(&["data", "started"])
            .unwrap();
        let meta_refill_attempts_total = refill_total
            .get_metric_with_label_values(&["meta", "attempts"])
            .unwrap();

        let data_refill_parent_meta_lookup_hit_total = refill_total
            .get_metric_with_label_values(&["parent_meta", "hit"])
            .unwrap();
        let data_refill_parent_meta_lookup_miss_total = refill_total
            .get_metric_with_label_values(&["parent_meta", "miss"])
            .unwrap();
        let data_refill_unit_inheritance_hit_total = refill_total
            .get_metric_with_label_values(&["unit_inheritance", "hit"])
            .unwrap();
        let data_refill_unit_inheritance_miss_total = refill_total
            .get_metric_with_label_values(&["unit_inheritance", "miss"])
            .unwrap();

        let data_refill_block_unfiltered_total = refill_total
            .get_metric_with_label_values(&["block", "unfiltered"])
            .unwrap();
        let data_refill_block_success_total = refill_total
            .get_metric_with_label_values(&["block", "success"])
            .unwrap();

        let data_refill_ideal_bytes = refill_bytes
            .get_metric_with_label_values(&["data", "ideal"])
            .unwrap();
        let data_refill_success_bytes = refill_bytes
            .get_metric_with_label_values(&["data", "success"])
            .unwrap();

        let refill_queue_total = register_int_gauge_with_registry!(
            "refill_queue_total",
            "refill queue total",
            registry,
        )
        .unwrap();

        Self {
            refill_duration,
            refill_total,
            refill_bytes,

            data_refill_success_duration,
            meta_refill_success_duration,
            data_refill_filtered_total,
            data_refill_attempts_total,
            data_refill_started_total,
            meta_refill_attempts_total,

            data_refill_parent_meta_lookup_hit_total,
            data_refill_parent_meta_lookup_miss_total,
            data_refill_unit_inheritance_hit_total,
            data_refill_unit_inheritance_miss_total,

            data_refill_block_unfiltered_total,
            data_refill_block_success_total,

            data_refill_ideal_bytes,
            data_refill_success_bytes,

            refill_queue_total,
        }
    }
}

/// Configuration for cache refill, derived from [`StorageOpts`].
#[derive(Debug)]
pub struct CacheRefillConfig {
    /// Overall timeout for one refill run over a batch of SST deltas.
    pub timeout: Duration,

    /// LSM-tree levels eligible for data file cache refill.
    pub data_refill_levels: HashSet<u32>,

    /// Maximum number of concurrent data refill unit tasks.
    pub concurrency: usize,

    /// Size of a refill unit, in blocks.
    pub unit: usize,

    /// Admission ratio threshold: a unit is refilled only if at least this
    /// fraction of its blocks is admitted by the cache.
    pub threshold: f64,

    /// If set, bypass the recent filter and refill unconditionally.
    pub skip_recent_filter: bool,
}

impl CacheRefillConfig {
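    /// Builds the refill config from storage options. Data refill is gated on
    /// the `ElasticDiskCache` license feature: if the feature is unavailable,
    /// `data_refill_levels` is left empty, which effectively disables data
    /// cache refill.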
    pub fn from_storage_opts(options: &StorageOpts) -> Self {
        let data_refill_levels = match Feature::ElasticDiskCache.check_available() {
            Ok(_) => options
                .cache_refill_data_refill_levels
                .iter()
                .copied()
                .collect(),
            Err(e) => {
                tracing::warn!(error = %e.as_report(), "ElasticDiskCache is not available.");
                HashSet::new()
            }
        };

        Self {
            timeout: Duration::from_millis(options.cache_refill_timeout_ms),
            data_refill_levels,
            concurrency: options.cache_refill_concurrency,
            unit: options.cache_refill_unit,
            threshold: options.cache_refill_threshold,
            skip_recent_filter: options.cache_refill_skip_recent_filter,
        }
    }
}

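/// A queued refill task handle, paired with the version-switch event to emit
/// once the task finishes.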
struct Item {
    handle: JoinHandle<()>,
    event: CacheRefillerEvent,
}

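/// Factory for spawning a refill task from the SST deltas, the shared refill
/// context, and the old and new pinned versions. Callers can substitute a
/// custom spawner; [`CacheRefiller::default_spawn_refill_task`] is the default.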
pub(crate) type SpawnRefillTask = Arc<
    dyn Fn(Vec<SstDeltaInfo>, CacheRefillContext, PinnedVersion, PinnedVersion) -> JoinHandle<()>
        + Send
        + Sync
        + 'static,
>;

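/// Queues and drives cache refill tasks, one per version delta, and yields a
/// [`CacheRefillerEvent`] for each finished task so the owner can switch to
/// the new pinned version.
///
/// A minimal usage sketch based on the signatures in this module; `opts`,
/// `store`, `deltas`, and the two versions are hypothetical placeholders
/// supplied by the caller:
///
/// ```ignore
/// let mut refiller = CacheRefiller::new(
///     CacheRefillConfig::from_storage_opts(&opts),
///     store.clone(),
///     CacheRefiller::default_spawn_refill_task(),
/// );
/// refiller.start_cache_refill(deltas, old_version, new_version);
/// for event in refiller.next_events().await {
///     // apply `event.new_pinned_version` ...
/// }
/// ```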
pub(crate) struct CacheRefiller {
    /// In-progress refill tasks, in submission order.
    queue: VecDeque<Item>,

    context: CacheRefillContext,

    spawn_refill_task: SpawnRefillTask,
}

impl CacheRefiller {
    pub(crate) fn new(
        config: CacheRefillConfig,
        sstable_store: SstableStoreRef,
        spawn_refill_task: SpawnRefillTask,
    ) -> Self {
        let config = Arc::new(config);
        let concurrency = Arc::new(Semaphore::new(config.concurrency));
        Self {
            queue: VecDeque::new(),
            context: CacheRefillContext {
                config,
                concurrency,
                sstable_store,
            },
            spawn_refill_task,
        }
    }

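    /// The default spawner: wraps the deltas and context in a
    /// [`CacheRefillTask`] and runs it on a fresh tokio task. The two pinned
    /// versions are ignored; they only travel with the queue item.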
    pub(crate) fn default_spawn_refill_task() -> SpawnRefillTask {
        Arc::new(|deltas, context, _, _| {
            let task = CacheRefillTask { deltas, context };
            tokio::spawn(task.run())
        })
    }

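    /// Spawns a refill task for `deltas` and enqueues it together with the
    /// `(pinned_version, new_pinned_version)` pair to be reported on
    /// completion.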
    pub(crate) fn start_cache_refill(
        &mut self,
        deltas: Vec<SstDeltaInfo>,
        pinned_version: PinnedVersion,
        new_pinned_version: PinnedVersion,
    ) {
        let handle = (self.spawn_refill_task)(
            deltas,
            self.context.clone(),
            pinned_version.clone(),
            new_pinned_version.clone(),
        );
        let event = CacheRefillerEvent {
            pinned_version,
            new_pinned_version,
        };
        let item = Item { handle, event };
        self.queue.push_back(item);
        GLOBAL_CACHE_REFILL_METRICS.refill_queue_total.add(1);
    }

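    /// The `new_pinned_version` of the most recently enqueued refill task, if
    /// any.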
    pub(crate) fn last_new_pinned_version(&self) -> Option<&PinnedVersion> {
        self.queue.back().map(|item| &item.event.new_pinned_version)
    }
}

impl CacheRefiller {
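    /// Resolves once at least one queued refill task has finished, returning
    /// the finished tasks' events in FIFO order. Only the head of the queue
    /// is polled, so events are never reordered, and at most `MAX_BATCH_SIZE`
    /// events are drained per call.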
    pub(crate) fn next_events(&mut self) -> impl Future<Output = Vec<CacheRefillerEvent>> + '_ {
        poll_fn(|cx| {
            const MAX_BATCH_SIZE: usize = 16;
            let mut events = None;
            while let Some(item) = self.queue.front_mut()
                && let Poll::Ready(result) = item.handle.poll_unpin(cx)
            {
                result.unwrap();
                let item = self.queue.pop_front().unwrap();
                GLOBAL_CACHE_REFILL_METRICS.refill_queue_total.sub(1);
                let events = events.get_or_insert_with(|| Vec::with_capacity(MAX_BATCH_SIZE));
                events.push(item.event);
                if events.len() >= MAX_BATCH_SIZE {
                    break;
                }
            }
            if let Some(events) = events {
                Poll::Ready(events)
            } else {
                Poll::Pending
            }
        })
    }
}

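/// Emitted when a refill task finishes; carries the version transition the
/// task was spawned for.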
pub struct CacheRefillerEvent {
    pub pinned_version: PinnedVersion,
    pub new_pinned_version: PinnedVersion,
}

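/// Shared, cheaply clonable state for refill tasks: the refill config, a
/// semaphore bounding refill concurrency, and the sstable store.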
#[derive(Clone)]
pub(crate) struct CacheRefillContext {
    config: Arc<CacheRefillConfig>,
    concurrency: Arc<Semaphore>,
    sstable_store: SstableStoreRef,
}

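/// One refill run over a batch of SST deltas: meta cache first, then data
/// cache.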
struct CacheRefillTask {
    deltas: Vec<SstDeltaInfo>,
    context: CacheRefillContext,
}

impl CacheRefillTask {
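    /// Refills the meta cache for every delta concurrently and, on success,
    /// proceeds to data cache refill. The whole batch is bounded by
    /// `config.timeout`; work still pending at the deadline is dropped.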
    async fn run(self) {
        let tasks = self
            .deltas
            .iter()
            .map(|delta| {
                let context = self.context.clone();
                async move {
                    let holders = match Self::meta_cache_refill(&context, delta).await {
                        Ok(holders) => holders,
                        Err(e) => {
                            tracing::warn!(error = %e.as_report(), "meta cache refill error");
                            return;
                        }
                    };
                    Self::data_cache_refill(&context, delta, holders).await;
                }
            })
            .collect_vec();
        let future = join_all(tasks);

        let _ = tokio::time::timeout(self.context.config.timeout, future).await;
    }

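    /// Fetches the sstable meta of every inserted SST through the meta cache
    /// and returns the holders for the subsequent data refill. Local
    /// statistics are discarded; only the refill metrics are reported.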
    async fn meta_cache_refill(
        context: &CacheRefillContext,
        delta: &SstDeltaInfo,
    ) -> HummockResult<Vec<TableHolder>> {
        let tasks = delta
            .insert_sst_infos
            .iter()
            .map(|info| async {
                let mut stats = StoreLocalStatistic::default();
                GLOBAL_CACHE_REFILL_METRICS.meta_refill_attempts_total.inc();

                let now = Instant::now();
                let res = context.sstable_store.sstable(info, &mut stats).await;
                stats.discard();
                GLOBAL_CACHE_REFILL_METRICS
                    .meta_refill_success_duration
                    .observe(now.elapsed().as_secs_f64());
                res
            })
            .collect_vec();
        let holders = try_join_all(tasks).await?;
        Ok(holders)
    }

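    /// Computes which units of the new SSTs to refill by "inheritance" from
    /// the deleted parent SSTs: a unit is selected when its key range
    /// overlaps a parent block that is present in the recent filter (or
    /// unconditionally when `skip_recent_filter` is set).
    ///
    /// The overlap lookup is two binary searches over the sorted units; a
    /// simplified sketch with plain byte-slice comparisons standing in for
    /// `KeyComparator`:
    ///
    /// ```ignore
    /// // Units entirely to the left of the parent block are skipped ...
    /// let uleft = units.partition_point(|u| u.largest_key() < pleft);
    /// // ... and units starting after its right bound are excluded.
    /// let uright = units.partition_point(|u| u.smallest_key() <= pright);
    /// let overlapping = &units[uleft..uright];
    /// ```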
    fn get_units_to_refill_by_inheritance(
        context: &CacheRefillContext,
        ssts: &[TableHolder],
        parent_ssts: impl IntoIterator<Item = HybridCacheEntry<HummockSstableObjectId, Box<Sstable>>>,
    ) -> HashSet<SstableUnit> {
        let mut res = HashSet::default();

        let recent_filter = context.sstable_store.recent_filter();

        // Split each new SST into fixed-size units of `config.unit` blocks.
        let units = {
            let unit = context.config.unit;
            ssts.iter()
                .flat_map(|sst| {
                    let units = Unit::units(sst, unit);
                    (0..units).map(|uidx| Unit::new(sst, unit, uidx))
                })
                .collect_vec()
        };

        if cfg!(debug_assertions) {
            // The `partition_point` lookups below require the units to be
            // sorted by key range.
            units.iter().tuple_windows().for_each(|(a, b)| {
                debug_assert_ne!(
                    KeyComparator::compare_encoded_full_key(a.largest_key(), b.smallest_key()),
                    std::cmp::Ordering::Greater
                )
            });
        }

        for psst in parent_ssts {
            for pblk in 0..psst.block_count() {
                // Key range covered by the parent block: from its own smallest
                // key to the next block's smallest key (or the SST's largest
                // key for the final block).
                let pleft = &psst.meta.block_metas[pblk].smallest_key;
                let pright = if pblk + 1 == psst.block_count() {
                    &psst.meta.largest_key
                } else {
                    &psst.meta.block_metas[pblk + 1].smallest_key
                };

                // Binary-search the contiguous run of units that overlap the
                // parent block's key range.
                let uleft = units.partition_point(|unit| {
                    KeyComparator::compare_encoded_full_key(unit.largest_key(), pleft)
                        == std::cmp::Ordering::Less
                });
                let uright = units.partition_point(|unit| {
                    KeyComparator::compare_encoded_full_key(unit.smallest_key(), pright)
                        != std::cmp::Ordering::Greater
                });

                // Select an overlapping unit if the parent block was recently
                // accessed, or unconditionally when the recent filter is skipped.
                for u in units.iter().take(uright).skip(uleft) {
                    let unit = SstableUnit {
                        sst_obj_id: u.sst.id,
                        blks: u.blks.clone(),
                    };
                    if res.contains(&unit) {
                        continue;
                    }
                    if context.config.skip_recent_filter
                        || recent_filter.contains(&(psst.id, pblk))
                    {
                        res.insert(unit);
                    }
                }
            }
        }

        let hit = res.len();
        let miss = units.len() - res.len();
        GLOBAL_CACHE_REFILL_METRICS
            .data_refill_unit_inheritance_hit_total
            .inc_by(hit as u64);
        GLOBAL_CACHE_REFILL_METRICS
            .data_refill_unit_inheritance_miss_total
            .inc_by(miss as u64);

        res
    }

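    /// Entry point for data cache refill. Bails out early when the block
    /// cache has no disk tier, when the delta does not both insert and delete
    /// SSTs, when the target level is not configured for refill, or when none
    /// of the deleted SSTs appears in the recent filter. L0 deltas (and the
    /// `skip_recent_filter` mode) are refilled in full; other levels refill
    /// only the units inherited from the deleted parent SSTs.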
    async fn data_cache_refill(
        context: &CacheRefillContext,
        delta: &SstDeltaInfo,
        holders: Vec<TableHolder>,
    ) {
        // Skip if the block cache is in-memory only.
        if !context.sstable_store.block_cache().is_hybrid() {
            return;
        }

        // Skip unless the delta both inserts and deletes SSTs.
        if delta.insert_sst_infos.is_empty() || delta.delete_sst_object_ids.is_empty() {
            return;
        }

        // Skip if the target level is not configured for data refill.
        if !context
            .config
            .data_refill_levels
            .contains(&delta.insert_sst_level)
        {
            return;
        }

        let recent_filter = context.sstable_store.recent_filter();

        // Skip if none of the deleted SSTs was recently accessed. The
        // `usize::MAX` block index marks the SST itself rather than a
        // particular block.
        let targets = delta
            .delete_sst_object_ids
            .iter()
            .map(|id| (*id, usize::MAX))
            .collect_vec();
        if !context.config.skip_recent_filter && !recent_filter.contains_any(targets.iter()) {
            GLOBAL_CACHE_REFILL_METRICS
                .data_refill_filtered_total
                .inc_by(delta.delete_sst_object_ids.len() as _);
            return;
        }

        GLOBAL_CACHE_REFILL_METRICS
            .data_refill_block_unfiltered_total
            .inc_by(
                holders
                    .iter()
                    .map(|sst| sst.block_count() as u64)
                    .sum::<u64>(),
            );

        if delta.insert_sst_level == 0 || context.config.skip_recent_filter {
            Self::data_file_cache_refill_full_impl(context, delta, holders).await;
        } else {
            Self::data_file_cache_impl(context, delta, holders).await;
        }
    }

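    /// Refills every block of every new SST, `config.unit` blocks at a time.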
    async fn data_file_cache_refill_full_impl(
        context: &CacheRefillContext,
        _delta: &SstDeltaInfo,
        holders: Vec<TableHolder>,
    ) {
        let unit = context.config.unit;

        let mut futures = vec![];

        for sst in &holders {
            for blk_start in (0..sst.block_count()).step_by(unit) {
                let blk_end = std::cmp::min(sst.block_count(), blk_start + unit);
                let unit = SstableUnit {
                    sst_obj_id: sst.id,
                    blks: blk_start..blk_end,
                };
                futures.push(
                    async move { Self::data_file_cache_refill_unit(context, sst, unit).await },
                );
            }
        }
        join_all(futures).await;
    }

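    /// Inheritance-based refill: looks up the deleted parent SSTs' metas in
    /// the meta cache, derives the overlapping units of the new SSTs via
    /// [`Self::get_units_to_refill_by_inheritance`], and refills those units.
    /// Unit-level errors are logged rather than propagated.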
    async fn data_file_cache_impl(
        context: &CacheRefillContext,
        delta: &SstDeltaInfo,
        holders: Vec<TableHolder>,
    ) {
        let sstable_store = context.sstable_store.clone();
        let futures = delta.delete_sst_object_ids.iter().map(|sst_obj_id| {
            let store = &sstable_store;
            async move {
                let res = store.sstable_cached(*sst_obj_id).await;
                match res {
                    Ok(Some(_)) => GLOBAL_CACHE_REFILL_METRICS
                        .data_refill_parent_meta_lookup_hit_total
                        .inc(),
                    Ok(None) => GLOBAL_CACHE_REFILL_METRICS
                        .data_refill_parent_meta_lookup_miss_total
                        .inc(),
                    _ => {}
                }
                res
            }
        });
        let parent_ssts = match try_join_all(futures).await {
            Ok(parent_ssts) => parent_ssts.into_iter().flatten(),
            Err(e) => {
                return tracing::error!(error = %e.as_report(), "get old meta from cache error");
            }
        };
        let units = Self::get_units_to_refill_by_inheritance(context, &holders, parent_ssts);

        let ssts: HashMap<HummockSstableObjectId, TableHolder> =
            holders.into_iter().map(|meta| (meta.id, meta)).collect();
        let futures = units.into_iter().map(|unit| {
            let ssts = &ssts;
            async move {
                let sst = ssts.get(&unit.sst_obj_id).unwrap();
                if let Err(e) = Self::data_file_cache_refill_unit(context, sst, unit).await {
                    tracing::error!(error = %e.as_report(), "data file cache unit refill error");
                }
            }
        });
        join_all(futures).await;
    }

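    /// Refills a single unit of consecutive blocks. Every block of the unit
    /// is first offered to the cache's admission filter; only when the
    /// admitted fraction reaches `config.threshold` is the unit's byte range
    /// fetched from the object store (one ranged read per unit) and decoded
    /// into individual blocks for insertion. For example, with an 8-block
    /// unit and `threshold == 0.5`, at least 4 blocks must be admitted for
    /// the read to happen.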
    async fn data_file_cache_refill_unit(
        context: &CacheRefillContext,
        sst: &Sstable,
        unit: SstableUnit,
    ) -> HummockResult<()> {
        let sstable_store = &context.sstable_store;
        let threshold = context.config.threshold;
        let recent_filter = sstable_store.recent_filter();

        // Mark the new SST as recently used; `usize::MAX` stands for the SST
        // itself rather than a particular block.
        recent_filter.insert((sst.id, usize::MAX));

        let blocks = unit.blks.size().unwrap();

        let mut tasks = vec![];
        let mut contexts = Vec::with_capacity(blocks);
        let mut admits = 0;

        // Byte range of the whole unit in the SST data file.
        let (range_first, _) = sst.calculate_block_info(unit.blks.start);
        let (range_last, _) = sst.calculate_block_info(unit.blks.end - 1);
        let range = range_first.start..range_last.end;

        let size = range.size().unwrap();

        GLOBAL_CACHE_REFILL_METRICS
            .data_refill_ideal_bytes
            .inc_by(size as _);

        for blk in unit.blks {
            let (range, uncompressed_capacity) = sst.calculate_block_info(blk);
            let key = SstableBlockIndex {
                sst_id: sst.id,
                block_idx: blk as u64,
            };

            let mut writer = sstable_store.block_cache().storage_writer(key);

            // Admission is judged against the size of the whole unit.
            if writer.filter(size).is_admitted() {
                admits += 1;
            }

            contexts.push((writer, range, uncompressed_capacity))
        }

        if admits as f64 / contexts.len() as f64 >= threshold {
            let task = async move {
                GLOBAL_CACHE_REFILL_METRICS.data_refill_attempts_total.inc();

                let permit = context.concurrency.acquire().await.unwrap();

                GLOBAL_CACHE_REFILL_METRICS.data_refill_started_total.inc();

                let timer = GLOBAL_CACHE_REFILL_METRICS
                    .data_refill_success_duration
                    .start_timer();

                let data = sstable_store
                    .store()
                    .read(&sstable_store.get_sst_data_path(sst.id), range.clone())
                    .await?;
                let mut futures = vec![];
                for (w, r, uc) in contexts {
                    let offset = r.start - range.start;
                    let len = r.end - r.start;
                    let bytes = data.slice(offset..offset + len);
                    let future = async move {
                        let value = Box::new(Block::decode(bytes, uc)?);
                        // Force the insertion: admission was already judged above.
                        if let Some(_entry) = w.force().insert(value) {
                            GLOBAL_CACHE_REFILL_METRICS
                                .data_refill_success_bytes
                                .inc_by(len as u64);
                            GLOBAL_CACHE_REFILL_METRICS
                                .data_refill_block_success_total
                                .inc();
                        }
                        Ok::<_, HummockError>(())
                    };
                    futures.push(future);
                }
                try_join_all(futures)
                    .await
                    .map_err(HummockError::file_cache)?;

                drop(permit);
                drop(timer);

                Ok::<_, HummockError>(())
            };
            tasks.push(task);
        }

        try_join_all(tasks).await?;

        Ok(())
    }
}

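/// Identifies a single block within an SST object.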
#[derive(Debug)]
pub struct SstableBlock {
    pub sst_obj_id: HummockSstableObjectId,
    pub blk_idx: usize,
}

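/// A contiguous range of block indices within an SST object: the granularity
/// at which data refill is admitted and performed.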
#[derive(Debug, Hash, PartialEq, Eq)]
pub struct SstableUnit {
    pub sst_obj_id: HummockSstableObjectId,
    pub blks: Range<usize>,
}

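/// Units are ordered by SST object id, then by block range start, then end.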
impl Ord for SstableUnit {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        match self.sst_obj_id.cmp(&other.sst_obj_id) {
            std::cmp::Ordering::Equal => {}
            ord => return ord,
        }
        match self.blks.start.cmp(&other.blks.start) {
            std::cmp::Ordering::Equal => {}
            ord => return ord,
        }
        self.blks.end.cmp(&other.blks.end)
    }
}

impl PartialOrd for SstableUnit {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

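/// A borrowed view of one refill unit of an SST: up to `unit` consecutive
/// blocks, addressed by unit index.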
#[derive(Debug)]
struct Unit<'a> {
    sst: &'a Sstable,
    blks: Range<usize>,
}

impl<'a> Unit<'a> {
    fn new(sst: &'a Sstable, unit: usize, uidx: usize) -> Self {
        let blks = unit * uidx..std::cmp::min(unit * (uidx + 1), sst.block_count());
        Self { sst, blks }
    }

    fn smallest_key(&self) -> &Vec<u8> {
        &self.sst.meta.block_metas[self.blks.start].smallest_key
    }

    fn largest_key(&self) -> &Vec<u8> {
        // The last unit's upper bound is the SST's largest key; for any other
        // unit, use the smallest key of the first block after it.
        if self.blks.end == self.sst.block_count() {
            &self.sst.meta.largest_key
        } else {
            &self.sst.meta.block_metas[self.blks.end].smallest_key
        }
    }

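    /// Number of units needed to cover all blocks of `sst`: `block_count /
    /// unit`, rounded up. For example, 10 blocks with `unit == 4` yield 3
    /// units covering blocks `0..4`, `4..8`, and `8..10`.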
    fn units(sst: &Sstable, unit: usize) -> usize {
        sst.block_count() / unit
            + if sst.block_count().is_multiple_of(unit) {
                0
            } else {
                1
            }
    }
}