Commit f469fbf
availability-recovery: bump chunk fetch threshold to 1MB for Polkadot and 4MB for Kusama + testnets (#4399)
Doing this change ensures that we minimize the CPU time spent in reed-solomon by only re-encoding into chunks if the PoV size is less than 4MB (which currently covers all PoVs). Based on subsystem benchmark results we concluded that it is safe to bump this number higher.

In the worst-case scenario, the network pressure for a backing group of 5 is around 25% of the network bandwidth in the hardware specs. Assuming 6s block times, a `max_candidate_depth` of 3 and `needed_approvals` of 30, the bandwidth used by a backing group would hover above `30 * 4 * 3 = 360MB` per relay-chain block. For a backing group of 5 that gives 72MB per block per validator, i.e. 12 MB/s.

[Chart: reality check on Kusama PoV sizes — screenshot from 2024-05-07, https://github.com/paritytech/polkadot-sdk/assets/54316454/bfed32d4-8623-48b0-9ec0-8b95dd2a9d8c]

Signed-off-by: Andrei Sandu <[email protected]>
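The back-of-envelope arithmetic above can be checked directly. A minimal sketch — the variable names are illustrative, not identifiers from this commit:

```rust
// Reproduces the bandwidth estimate from the commit description.
fn main() {
    let needed_approvals: u64 = 30; // validators recovering each PoV
    let pov_size_mb: u64 = 4; // the new 4MB threshold
    let max_candidate_depth: u64 = 3; // unincluded candidates in flight
    let backing_group_size: u64 = 5;
    let block_time_s: u64 = 6;

    // Data served by one backing group per relay-chain block.
    let per_block = needed_approvals * pov_size_mb * max_candidate_depth; // 360 MB
    // Shared across the group's validators.
    let per_validator = per_block / backing_group_size; // 72 MB
    // Sustained upload rate per validator at 6s block times.
    let rate = per_validator / block_time_s; // 12 MB/s
    println!("{per_block} MB/block, {per_validator} MB/validator, {rate} MB/s");
}
```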
1 parent 1c7a1a5 commit f469fbf

File tree

4 files changed: +33 −9 lines changed

polkadot/node/network/availability-recovery/src/lib.rs (+15 −7)
```diff
@@ -77,8 +77,10 @@ const LRU_SIZE: u32 = 16;
 
 const COST_INVALID_REQUEST: Rep = Rep::CostMajor("Peer sent unparsable request");
 
-/// PoV size limit in bytes for which prefer fetching from backers.
-const SMALL_POV_LIMIT: usize = 128 * 1024;
+/// PoV size limit in bytes below which we prefer fetching from backers. (Conservative; Polkadot for now.)
+pub(crate) const CONSERVATIVE_FETCH_CHUNKS_THRESHOLD: usize = 1 * 1024 * 1024;
+/// PoV size limit in bytes below which we prefer fetching from backers. (Kusama and all testnets.)
+pub const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024;
 
 #[derive(Clone, PartialEq)]
 /// The strategy we use to recover the PoV.
```
```diff
@@ -448,7 +450,7 @@ async fn handle_recover<Context>(
 	if let Some(backing_validators) = session_info.validator_groups.get(backing_group) {
 		let mut small_pov_size = true;
 
-		if let RecoveryStrategyKind::BackersFirstIfSizeLower(small_pov_limit) =
+		if let RecoveryStrategyKind::BackersFirstIfSizeLower(fetch_chunks_threshold) =
 			recovery_strategy_kind
 		{
 			// Get our own chunk size to get an estimate of the PoV size.
```
```diff
@@ -457,13 +459,13 @@ async fn handle_recover<Context>(
 			if let Ok(Some(chunk_size)) = chunk_size {
 				let pov_size_estimate =
 					chunk_size.saturating_mul(session_info.validators.len()) / 3;
-				small_pov_size = pov_size_estimate < small_pov_limit;
+				small_pov_size = pov_size_estimate < fetch_chunks_threshold;
 
 				gum::trace!(
 					target: LOG_TARGET,
 					?candidate_hash,
 					pov_size_estimate,
-					small_pov_limit,
+					fetch_chunks_threshold,
 					enabled = small_pov_size,
 					"Prefer fetch from backing group",
 				);
```
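The division by three in the estimate reflects the erasure coding: the PoV is encoded into one chunk per validator, and roughly a third of the chunks carry enough data to reconstruct it, so our own chunk size times `n_validators / 3` approximates the full PoV size. A standalone restatement — the function names here are illustrative, not the subsystem's API:

```rust
/// Estimate the PoV size from one chunk, mirroring the logic above:
/// n chunks total, ~n/3 of them suffice to rebuild the PoV.
fn pov_size_estimate(chunk_size: usize, n_validators: usize) -> usize {
    chunk_size.saturating_mul(n_validators) / 3
}

/// Fetch the whole PoV from backers only when the estimate is under the
/// configured threshold; otherwise recover from chunks.
fn prefer_backers(chunk_size: usize, n_validators: usize, threshold: usize) -> bool {
    pov_size_estimate(chunk_size, n_validators) < threshold
}
```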
```diff
@@ -547,11 +549,14 @@ impl AvailabilityRecoverySubsystem {
 	/// which never requests the `AvailabilityStoreSubsystem` subsystem and only checks the POV hash
 	/// instead of reencoding the available data.
 	pub fn for_collator(
+		fetch_chunks_threshold: Option<usize>,
 		req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
 		metrics: Metrics,
 	) -> Self {
 		Self {
-			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(SMALL_POV_LIMIT),
+			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(
+				fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD),
+			),
 			bypass_availability_store: true,
 			post_recovery_check: PostRecoveryCheck::PovHash,
 			req_receiver,
```
```diff
@@ -591,11 +596,14 @@ impl AvailabilityRecoverySubsystem {
 	/// Create a new instance of `AvailabilityRecoverySubsystem` which requests chunks if PoV is
 	/// above a threshold.
 	pub fn with_chunks_if_pov_large(
+		fetch_chunks_threshold: Option<usize>,
 		req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
 		metrics: Metrics,
 	) -> Self {
 		Self {
-			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(SMALL_POV_LIMIT),
+			recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(
+				fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD),
+			),
 			bypass_availability_store: false,
 			post_recovery_check: PostRecoveryCheck::Reencode,
 			req_receiver,
```
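Both constructors now take `Option<usize>`: `Some(threshold)` overrides, `None` falls back to the conservative 1MB default. A minimal sketch of that contract, with the two constants mirroring the diff above:

```rust
const CONSERVATIVE_FETCH_CHUNKS_THRESHOLD: usize = 1 * 1024 * 1024; // 1MB, Polkadot
const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024; // 4MB, Kusama + testnets

/// The default-resolution step both constructors perform.
fn effective_threshold(fetch_chunks_threshold: Option<usize>) -> usize {
    fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD)
}

fn main() {
    assert_eq!(effective_threshold(None), CONSERVATIVE_FETCH_CHUNKS_THRESHOLD);
    assert_eq!(effective_threshold(Some(FETCH_CHUNKS_THRESHOLD)), FETCH_CHUNKS_THRESHOLD);
}
```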

polkadot/node/network/availability-recovery/src/tests.rs (+4 −2)
```diff
@@ -906,6 +906,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 	let test_state = TestState::default();
 	let req_protocol_names = ReqProtocolNames::new(&GENESIS_HASH, None);
 	let subsystem = AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+		Some(FETCH_CHUNKS_THRESHOLD),
 		request_receiver(&req_protocol_names),
 		Metrics::new_dummy(),
 	);
```
```diff
@@ -942,7 +943,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 		AllMessages::AvailabilityStore(
 			AvailabilityStoreMessage::QueryChunkSize(_, tx)
 		) => {
-			let _ = tx.send(Some(1000000));
+			let _ = tx.send(Some(crate::FETCH_CHUNKS_THRESHOLD + 1));
 		}
 	);
```

```diff
@@ -987,7 +988,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 		AllMessages::AvailabilityStore(
 			AvailabilityStoreMessage::QueryChunkSize(_, tx)
 		) => {
-			let _ = tx.send(Some(1000000));
+			let _ = tx.send(Some(crate::FETCH_CHUNKS_THRESHOLD + 1));
 		}
 	);
```

```diff
@@ -1015,6 +1016,7 @@ fn fast_path_backing_group_recovers_if_pov_small() {
 	let test_state = TestState::default();
 	let req_protocol_names = ReqProtocolNames::new(&GENESIS_HASH, None);
 	let subsystem = AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+		Some(FETCH_CHUNKS_THRESHOLD),
 		request_receiver(&req_protocol_names),
 		Metrics::new_dummy(),
 	);
```

polkadot/node/service/src/lib.rs (+7 −0)
```diff
@@ -750,6 +750,7 @@ pub fn new_full<
 		prepare_workers_hard_max_num,
 	}: NewFullParams<OverseerGenerator>,
 ) -> Result<NewFull, Error> {
+	use polkadot_availability_recovery::FETCH_CHUNKS_THRESHOLD;
 	use polkadot_node_network_protocol::request_response::IncomingRequest;
 	use sc_network_sync::WarpSyncParams;
```

```diff
@@ -988,6 +989,11 @@ pub fn new_full<
 			stagnant_check_interval: Default::default(),
 			stagnant_check_mode: chain_selection_subsystem::StagnantCheckMode::PruneOnly,
 		};
+
+		// Kusama + testnets get a higher threshold; we are conservative on Polkadot for now.
+		let fetch_chunks_threshold =
+			if config.chain_spec.is_polkadot() { None } else { Some(FETCH_CHUNKS_THRESHOLD) };
+
 		Some(ExtendedOverseerGenArgs {
 			keystore,
 			parachains_db,
```
```diff
@@ -1001,6 +1007,7 @@ pub fn new_full<
 			dispute_req_receiver,
 			dispute_coordinator_config,
 			chain_selection_config,
+			fetch_chunks_threshold,
 		})
 	};
```

polkadot/node/service/src/overseer.rs (+7 −0)
```diff
@@ -133,6 +133,10 @@ pub struct ExtendedOverseerGenArgs {
 	pub dispute_coordinator_config: DisputeCoordinatorConfig,
 	/// Configuration for the chain selection subsystem.
 	pub chain_selection_config: ChainSelectionConfig,
+	/// Optional availability recovery fetch chunks threshold. If the PoV size is lower
+	/// than this value, we always try to recover availability from backers.
+	/// The parameter lives here so that each chain can use a different value.
+	pub fetch_chunks_threshold: Option<usize>,
 }
 
 /// Obtain a prepared validator `Overseer`, that is initialized with all default values.
```
```diff
@@ -166,6 +170,7 @@ pub fn validator_overseer_builder<Spawner, RuntimeClient>(
 		dispute_req_receiver,
 		dispute_coordinator_config,
 		chain_selection_config,
+		fetch_chunks_threshold,
 	}: ExtendedOverseerGenArgs,
 ) -> Result<
 	InitializedOverseerBuilder<
```
```diff
@@ -240,6 +245,7 @@ where
 			Metrics::register(registry)?,
 		))
 		.availability_recovery(AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+			fetch_chunks_threshold,
 			available_data_req_receiver,
 			Metrics::register(registry)?,
 		))
```
```diff
@@ -421,6 +427,7 @@ where
 		))
 		.availability_distribution(DummySubsystem)
 		.availability_recovery(AvailabilityRecoverySubsystem::for_collator(
+			None,
 			available_data_req_receiver,
 			Metrics::register(registry)?,
 		))
```
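Taken together, the wiring reduces to a per-chain choice made once in the service and threaded through the overseer args. An illustrative reduction, where `is_polkadot` stands in for the `config.chain_spec.is_polkadot()` call:

```rust
/// Illustrative: the per-chain gating done in service/src/lib.rs.
fn fetch_chunks_threshold(is_polkadot: bool) -> Option<usize> {
    const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024;
    // Polkadot passes `None` and so stays on the conservative 1MB default.
    if is_polkadot { None } else { Some(FETCH_CHUNKS_THRESHOLD) }
}
```

Collators always pass `None` to `for_collator`, so they keep the conservative 1MB threshold regardless of chain.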
