Commit f469fbf
availability-recovery: bump chunk fetch threshold to 1MB for Polkadot and 4MB for Kusama + testnets (#4399)
Doing this change ensures that we minimize the CPU time spent in reed-solomon by only re-encoding into chunks if the PoV size is less than 4MB (which currently covers all PoVs). Based on subsystem benchmark results we concluded that it is safe to bump this number higher.

In the worst-case scenario, the network pressure for a backing group of 5 is around 25% of the network bandwidth in the hardware specs. Assuming 6s block times, a `max_candidate_depth` of 3 and `needed_approvals` of 30, the bandwidth used by a backing group would hover above `30 * 4 * 3 = 360MB` per relay-chain block. For a backing group of 5 that gives 72MB per block per validator, i.e. 12 MB/s.

[Chart: reality check on Kusama PoV sizes — screenshot from 2024-05-07, https://github.com/paritytech/polkadot-sdk/assets/54316454/bfed32d4-8623-48b0-9ec0-8b95dd2a9d8c]

Signed-off-by: Andrei Sandu <[email protected]>
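The back-of-envelope arithmetic above can be checked directly. A minimal sketch — the variable names are illustrative, not identifiers from this commit:

```rust
// Reproduces the bandwidth estimate from the commit description.
fn main() {
    let needed_approvals: u64 = 30; // validators recovering each PoV
    let pov_size_mb: u64 = 4; // the new 4MB threshold
    let max_candidate_depth: u64 = 3; // unincluded candidates in flight
    let backing_group_size: u64 = 5;
    let block_time_s: u64 = 6;

    // Data served by one backing group per relay-chain block.
    let per_block = needed_approvals * pov_size_mb * max_candidate_depth; // 360 MB
    // Shared across the group's validators.
    let per_validator = per_block / backing_group_size; // 72 MB
    // Sustained upload rate per validator at 6s block times.
    let rate = per_validator / block_time_s; // 12 MB/s
    println!("{per_block} MB/block, {per_validator} MB/validator, {rate} MB/s");
}
```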
1 parent 1c7a1a5 commit f469fbf

File tree

4 files changed: +33 −9 lines changed

polkadot/node/network/availability-recovery/src/lib.rs (+15 −7)
```diff
@@ -77,8 +77,10 @@ const LRU_SIZE: u32 = 16;
 
 const COST_INVALID_REQUEST: Rep = Rep::CostMajor("Peer sent unparsable request");
 
-/// PoV size limit in bytes for which prefer fetching from backers.
-const SMALL_POV_LIMIT: usize = 128 * 1024;
+/// PoV size limit in bytes below which we prefer fetching from backers. (Conservative; Polkadot for now.)
+pub(crate) const CONSERVATIVE_FETCH_CHUNKS_THRESHOLD: usize = 1 * 1024 * 1024;
+/// PoV size limit in bytes below which we prefer fetching from backers. (Kusama and all testnets.)
+pub const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024;
 
 #[derive(Clone, PartialEq)]
 /// The strategy we use to recover the PoV.
```
```diff
@@ -448,7 +450,7 @@ async fn handle_recover<Context>(
 	if let Some(backing_validators) = session_info.validator_groups.get(backing_group) {
 		let mut small_pov_size = true;
 
-		if let RecoveryStrategyKind::BackersFirstIfSizeLower(small_pov_limit) =
+		if let RecoveryStrategyKind::BackersFirstIfSizeLower(fetch_chunks_threshold) =
 			recovery_strategy_kind
 		{
 			// Get our own chunk size to get an estimate of the PoV size.
```
```diff
@@ -457,13 +459,13 @@ async fn handle_recover<Context>(
 			if let Ok(Some(chunk_size)) = chunk_size {
 				let pov_size_estimate =
 					chunk_size.saturating_mul(session_info.validators.len()) / 3;
-				small_pov_size = pov_size_estimate < small_pov_limit;
+				small_pov_size = pov_size_estimate < fetch_chunks_threshold;
 
 				gum::trace!(
 					target: LOG_TARGET,
 					?candidate_hash,
 					pov_size_estimate,
-					small_pov_limit,
+					fetch_chunks_threshold,
 					enabled = small_pov_size,
 					"Prefer fetch from backing group",
 				);
```
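The division by three in the estimate reflects the erasure coding: the PoV is encoded into one chunk per validator, and roughly a third of the chunks carry enough data to reconstruct it, so our own chunk size times `n_validators / 3` approximates the full PoV size. A standalone restatement — the function names here are illustrative, not the subsystem's API:

```rust
/// Estimate the PoV size from one chunk, mirroring the logic above:
/// n chunks total, ~n/3 of them suffice to rebuild the PoV.
fn pov_size_estimate(chunk_size: usize, n_validators: usize) -> usize {
    chunk_size.saturating_mul(n_validators) / 3
}

/// Fetch the whole PoV from backers only when the estimate is under the
/// configured threshold; otherwise recover from chunks.
fn prefer_backers(chunk_size: usize, n_validators: usize, threshold: usize) -> bool {
    pov_size_estimate(chunk_size, n_validators) < threshold
}
```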
```diff
@@ -547,11 +549,14 @@ impl AvailabilityRecoverySubsystem {
 	/// which never requests the `AvailabilityStoreSubsystem` subsystem and only checks the POV hash
 	/// instead of reencoding the available data.
 	pub fn for_collator(
+		fetch_chunks_threshold: Option<usize>,
 		req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
 		metrics: Metrics,
 	) -> Self {
 		Self {
-			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(SMALL_POV_LIMIT),
+			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(
+				fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD),
+			),
 			bypass_availability_store: true,
 			post_recovery_check: PostRecoveryCheck::PovHash,
 			req_receiver,
```
```diff
@@ -591,11 +596,14 @@ impl AvailabilityRecoverySubsystem {
 	/// Create a new instance of `AvailabilityRecoverySubsystem` which requests chunks if PoV is
 	/// above a threshold.
 	pub fn with_chunks_if_pov_large(
+		fetch_chunks_threshold: Option<usize>,
 		req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
 		metrics: Metrics,
 	) -> Self {
 		Self {
-			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(SMALL_POV_LIMIT),
+			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(
+				fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD),
+			),
 			bypass_availability_store: false,
 			post_recovery_check: PostRecoveryCheck::Reencode,
 			req_receiver,
```
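Both constructors now take `Option<usize>`: `Some(threshold)` overrides, `None` falls back to the conservative 1MB default. A minimal sketch of that contract, with the two constants mirroring the diff above:

```rust
const CONSERVATIVE_FETCH_CHUNKS_THRESHOLD: usize = 1 * 1024 * 1024; // 1MB, Polkadot
const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024; // 4MB, Kusama + testnets

/// The default-resolution step both constructors perform.
fn effective_threshold(fetch_chunks_threshold: Option<usize>) -> usize {
    fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD)
}

fn main() {
    assert_eq!(effective_threshold(None), CONSERVATIVE_FETCH_CHUNKS_THRESHOLD);
    assert_eq!(effective_threshold(Some(FETCH_CHUNKS_THRESHOLD)), FETCH_CHUNKS_THRESHOLD);
}
```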

polkadot/node/network/availability-recovery/src/tests.rs (+4 −2)
```diff
@@ -906,6 +906,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 	let test_state = TestState::default();
 	let req_protocol_names = ReqProtocolNames::new(&GENESIS_HASH, None);
 	let subsystem = AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+		Some(FETCH_CHUNKS_THRESHOLD),
 		request_receiver(&req_protocol_names),
 		Metrics::new_dummy(),
 	);
```
```diff
@@ -942,7 +943,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 		AllMessages::AvailabilityStore(
 			AvailabilityStoreMessage::QueryChunkSize(_, tx)
 		) => {
-			let _ = tx.send(Some(1000000));
+			let _ = tx.send(Some(crate::FETCH_CHUNKS_THRESHOLD + 1));
 		}
 	);
```

```diff
@@ -987,7 +988,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 		AllMessages::AvailabilityStore(
 			AvailabilityStoreMessage::QueryChunkSize(_, tx)
 		) => {
-			let _ = tx.send(Some(1000000));
+			let _ = tx.send(Some(crate::FETCH_CHUNKS_THRESHOLD + 1));
 		}
 	);
```

```diff
@@ -1015,6 +1016,7 @@ fn fast_path_backing_group_recovers_if_pov_small() {
 	let test_state = TestState::default();
 	let req_protocol_names = ReqProtocolNames::new(&GENESIS_HASH, None);
 	let subsystem = AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+		Some(FETCH_CHUNKS_THRESHOLD),
 		request_receiver(&req_protocol_names),
 		Metrics::new_dummy(),
 	);
```

polkadot/node/service/src/lib.rs (+7 −0)
```diff
@@ -750,6 +750,7 @@ pub fn new_full<
 		prepare_workers_hard_max_num,
 	}: NewFullParams<OverseerGenerator>,
 ) -> Result<NewFull, Error> {
+	use polkadot_availability_recovery::FETCH_CHUNKS_THRESHOLD;
 	use polkadot_node_network_protocol::request_response::IncomingRequest;
 	use sc_network_sync::WarpSyncParams;
```

```diff
@@ -988,6 +989,11 @@ pub fn new_full<
 			stagnant_check_interval: Default::default(),
 			stagnant_check_mode: chain_selection_subsystem::StagnantCheckMode::PruneOnly,
 		};
+
+		// Kusama + testnets get a higher threshold; we are conservative on Polkadot for now.
+		let fetch_chunks_threshold =
+			if config.chain_spec.is_polkadot() { None } else { Some(FETCH_CHUNKS_THRESHOLD) };
+
 		Some(ExtendedOverseerGenArgs {
 			keystore,
 			parachains_db,
```
```diff
@@ -1001,6 +1007,7 @@ pub fn new_full<
 			dispute_req_receiver,
 			dispute_coordinator_config,
 			chain_selection_config,
+			fetch_chunks_threshold,
 		})
 	};
```

polkadot/node/service/src/overseer.rs (+7 −0)
```diff
@@ -133,6 +133,10 @@ pub struct ExtendedOverseerGenArgs {
 	pub dispute_coordinator_config: DisputeCoordinatorConfig,
 	/// Configuration for the chain selection subsystem.
 	pub chain_selection_config: ChainSelectionConfig,
+	/// Optional availability recovery fetch chunks threshold. If the PoV size is lower
+	/// than this value, we always try to recover availability from backers.
+	/// The parameter lives here so that each chain can use a different value.
+	pub fetch_chunks_threshold: Option<usize>,
 }
 
 /// Obtain a prepared validator `Overseer`, that is initialized with all default values.
```
```diff
@@ -166,6 +170,7 @@ pub fn validator_overseer_builder<Spawner, RuntimeClient>(
 		dispute_req_receiver,
 		dispute_coordinator_config,
 		chain_selection_config,
+		fetch_chunks_threshold,
 	}: ExtendedOverseerGenArgs,
 ) -> Result<
 	InitializedOverseerBuilder<
```
```diff
@@ -240,6 +245,7 @@ where
 			Metrics::register(registry)?,
 		))
 		.availability_recovery(AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+			fetch_chunks_threshold,
 			available_data_req_receiver,
 			Metrics::register(registry)?,
 		))
```
```diff
@@ -421,6 +427,7 @@ where
 		))
 		.availability_distribution(DummySubsystem)
 		.availability_recovery(AvailabilityRecoverySubsystem::for_collator(
+			None,
 			available_data_req_receiver,
 			Metrics::register(registry)?,
 		))
```
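Taken together, the wiring reduces to a per-chain choice made once in the service and threaded through the overseer args. An illustrative reduction, where `is_polkadot` stands in for the `config.chain_spec.is_polkadot()` call:

```rust
/// Illustrative: the per-chain gating done in service/src/lib.rs.
fn fetch_chunks_threshold(is_polkadot: bool) -> Option<usize> {
    const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024;
    // Polkadot passes `None` and so stays on the conservative 1MB default.
    if is_polkadot { None } else { Some(FETCH_CHUNKS_THRESHOLD) }
}
```

Collators always pass `None` to `for_collator`, so they keep the conservative 1MB threshold regardless of chain.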
