Skip to content

Commit d622f38

Browse files
S3 Destination Uses New Load CDK Interface (temporarily disabled)
1 parent d266981 commit d622f38

File tree

29 files changed

+857
-172
lines changed

29 files changed

+857
-172
lines changed

airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/config/SyncBeanFactory.kt

+1-2
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import io.airbyte.cdk.load.message.PipelineEvent
1616
import io.airbyte.cdk.load.message.StreamKey
1717
import io.airbyte.cdk.load.pipeline.BatchUpdate
1818
import io.airbyte.cdk.load.state.ReservationManager
19-
import io.airbyte.cdk.load.state.Reserved
2019
import io.airbyte.cdk.load.task.implementor.FileAggregateMessage
2120
import io.airbyte.cdk.load.task.implementor.FileTransferQueueMessage
2221
import io.airbyte.cdk.load.write.LoadStrategy
@@ -120,7 +119,7 @@ class SyncBeanFactory {
120119
@Named("recordQueue")
121120
fun recordQueue(
122121
loadStrategy: LoadStrategy? = null,
123-
): PartitionedQueue<Reserved<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>> {
122+
): PartitionedQueue<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>> {
124123
return PartitionedQueue(
125124
Array(loadStrategy?.inputPartitions ?: 1) {
126125
ChannelMessageQueue(Channel(Channel.UNLIMITED))

airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/message/PipelineEvent.kt

+9-5
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,19 @@ import io.airbyte.cdk.load.state.CheckpointId
1010
/** Used internally by the CDK to pass messages between steps in the loader pipeline. */
1111
sealed interface PipelineEvent<K : WithStream, T>
1212

13+
/**
14+
* A message that contains a keyed payload. The key is used to manage the state of the payload's
15+
* corresponding [io.airbyte.cdk.load.pipeline.BatchAccumulator]. [checkpointCounts] is used by the
16+
* CDK to perform state message bookkeeping. [postProcessingCallback] is for releasing resources
17+
* associated with the message.
18+
*/
1319
class PipelineMessage<K : WithStream, T>(
1420
val checkpointCounts: Map<CheckpointId, Long>,
1521
val key: K,
16-
val value: T
22+
val value: T,
23+
val postProcessingCallback: suspend () -> Unit = {},
1724
) : PipelineEvent<K, T>
1825

19-
/**
20-
* We send the end message on the stream and not the key, because there's no way to partition an
21-
* empty message.
22-
*/
26+
/** Broadcast at end-of-stream to all partitions to signal that the stream has ended. */
2327
class PipelineEndOfStream<K : WithStream, T>(val stream: DestinationStream.Descriptor) :
2428
PipelineEvent<K, T>

airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/pipeline/BatchAccumulator.kt

+32-5
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,38 @@ package io.airbyte.cdk.load.pipeline
77
import io.airbyte.cdk.load.message.WithStream
88

99
/**
10-
* [BatchAccumulator] is used internally by the CDK to implement RecordLoaders. Connector devs
11-
* should never need to implement this interface.
10+
* [BatchAccumulator] is used internally by the CDK to implement
11+
* [io.airbyte.cdk.load.write.LoadStrategy]s. Connector devs should never need to implement this
12+
* interface.
13+
*
14+
* It is the glue that connects a specific step in a specific pipeline to the generic pipeline on
15+
* the back end. (For example, in a three-stage pipeline like bulk load, step 1 is to create a part,
16+
* step 2 is to upload it, and step 3 is to load it from object storage into a table.)
17+
*
18+
* - [S] is a state type that will be threaded through accumulator calls.
19+
* - [K] is a key type associated with the input data. (NOTE: Currently, there is no support for
20+
* key-mapping, so the key is always [io.airbyte.cdk.load.message.StreamKey]). Specifically, state
21+
* will always be managed per-key.
22+
* - [T] is the input data type
23+
* - [U] is the output data type
24+
*
25+
* The first time data is seen for a given key, [start] is called (with the partition number). The
26+
* state returned by [start] will be passed per input to [accept].
27+
*
28+
* If [accept] returns a non-null output, that output will be forwarded to the next stage (if
29+
* applicable) and/or trigger bookkeeping (iff the output type implements
30+
* [io.airbyte.cdk.load.message.WithBatchState]).
31+
*
32+
* If [accept] returns a non-null state, that state will be passed to the next call to [accept]. If
33+
* [accept] returns a null state, the state will be discarded and a new one will be created on the
34+
* next input by a new call to [start].
35+
*
36+
* When the input stream is exhausted, [finish] will be called with any remaining state iff at least
37+
* one input was seen for that key. This means that [finish] will not be called on empty keys or on
38+
* keys where the last call to [accept] yielded a null (finished) state.
1239
*/
1340
interface BatchAccumulator<S, K : WithStream, T, U> {
14-
fun start(key: K, part: Int): S
15-
fun accept(record: T, state: S): Pair<S, U?>
16-
fun finish(state: S): U
41+
suspend fun start(key: K, part: Int): S
42+
suspend fun accept(input: T, state: S): Pair<S?, U?>
43+
suspend fun finish(state: S): U
1744
}

airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/pipeline/DirectLoadPipelineStep.kt

+1-3
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import io.airbyte.cdk.load.message.PartitionedQueue
99
import io.airbyte.cdk.load.message.PipelineEvent
1010
import io.airbyte.cdk.load.message.QueueWriter
1111
import io.airbyte.cdk.load.message.StreamKey
12-
import io.airbyte.cdk.load.state.Reserved
1312
import io.airbyte.cdk.load.task.internal.LoadPipelineStepTask
1413
import io.airbyte.cdk.load.write.DirectLoader
1514
import io.airbyte.cdk.load.write.DirectLoaderFactory
@@ -24,8 +23,7 @@ import jakarta.inject.Singleton
2423
class DirectLoadPipelineStep<S : DirectLoader>(
2524
val accumulator: DirectLoadRecordAccumulator<S, StreamKey>,
2625
@Named("recordQueue")
27-
val inputQueue:
28-
PartitionedQueue<Reserved<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>>,
26+
val inputQueue: PartitionedQueue<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>,
2927
@Named("batchStateUpdateQueue") val batchQueue: QueueWriter<BatchUpdate>,
3028
@Value("\${airbyte.destination.core.record-batch-size-override:null}")
3129
val batchSizeOverride: Long? = null,

airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/pipeline/DirectLoadRecordAccumulator.kt

+7-7
Original file line numberDiff line numberDiff line change
@@ -26,23 +26,23 @@ data class DirectLoadAccResult(override val state: Batch.State) : WithBatchState
2626
class DirectLoadRecordAccumulator<S : DirectLoader, K : WithStream>(
2727
val directLoaderFactory: DirectLoaderFactory<S>
2828
) : BatchAccumulator<S, K, DestinationRecordAirbyteValue, DirectLoadAccResult> {
29-
override fun start(key: K, part: Int): S {
29+
override suspend fun start(key: K, part: Int): S {
3030
return directLoaderFactory.create(key.stream, part)
3131
}
3232

33-
override fun accept(
34-
record: DestinationRecordAirbyteValue,
33+
override suspend fun accept(
34+
input: DestinationRecordAirbyteValue,
3535
state: S
36-
): Pair<S, DirectLoadAccResult?> {
37-
state.accept(record).let {
36+
): Pair<S?, DirectLoadAccResult?> {
37+
state.accept(input).let {
3838
return when (it) {
3939
is Incomplete -> Pair(state, null)
40-
is Complete -> Pair(state, DirectLoadAccResult(Batch.State.COMPLETE))
40+
is Complete -> Pair(null, DirectLoadAccResult(Batch.State.COMPLETE))
4141
}
4242
}
4343
}
4444

45-
override fun finish(state: S): DirectLoadAccResult {
45+
override suspend fun finish(state: S): DirectLoadAccResult {
4646
state.finish()
4747
return DirectLoadAccResult(Batch.State.COMPLETE)
4848
}

airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/pipeline/InputPartitioner.kt

+3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ interface InputPartitioner {
1717
fun getPartition(record: DestinationRecordAirbyteValue, numParts: Int): Int
1818
}
1919

20+
/**
21+
* The default input partitioner, which partitions by the stream name. TODO: Should be round-robin?
22+
*/
2023
@Singleton
2124
@Secondary
2225
class ByStreamInputPartitioner : InputPartitioner {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
/*
2+
* Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3+
*/
4+
5+
package io.airbyte.cdk.load.pipeline
6+
7+
import io.airbyte.cdk.load.message.DestinationRecordAirbyteValue
8+
import kotlin.math.abs
9+
import kotlin.random.Random
10+
11+
/**
12+
* Declare a singleton of this type to have input distributed evenly across the input partitions.
13+
* (The default is [ByStreamInputPartitioner].)
14+
*/
15+
open class RoundRobinInputPartitioner(private val rotateEveryNRecords: Int = 10_000) :
16+
InputPartitioner {
17+
private var nextPartition =
18+
Random(System.currentTimeMillis()).nextInt(Int.MAX_VALUE / rotateEveryNRecords) *
19+
rotateEveryNRecords
20+
21+
override fun getPartition(record: DestinationRecordAirbyteValue, numParts: Int): Int {
22+
val part = nextPartition++ / rotateEveryNRecords
23+
return if (part == Int.MIN_VALUE) { // avoid overflow
24+
0
25+
} else {
26+
abs(part) % numParts
27+
}
28+
}
29+
}

airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/task/DestinationTaskLauncher.kt

+1-1
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ class DefaultDestinationTaskLauncher<K : WithStream>(
145145
// New interface shim
146146
@Named("recordQueue")
147147
private val recordQueueForPipeline:
148-
PartitionedQueue<Reserved<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>>,
148+
PartitionedQueue<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>,
149149
@Named("batchStateUpdateQueue") private val batchUpdateQueue: ChannelMessageQueue<BatchUpdate>,
150150
private val loadPipeline: LoadPipeline?,
151151
private val partitioner: InputPartitioner,

airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/task/internal/InputConsumerTask.kt

+7-7
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ class DefaultInputConsumerTask(
8080
// Required by new interface
8181
@Named("recordQueue")
8282
private val recordQueueForPipeline:
83-
PartitionedQueue<Reserved<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>>,
83+
PartitionedQueue<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>,
8484
private val loadPipeline: LoadPipeline? = null,
8585
private val partitioner: InputPartitioner,
8686
private val openStreamQueue: QueueWriter<DestinationStream>
@@ -165,20 +165,20 @@ class DefaultInputConsumerTask(
165165
mapOf(manager.getCurrentCheckpointId() to 1),
166166
StreamKey(stream),
167167
record
168-
)
168+
) { reserved.release() }
169169
val partition = partitioner.getPartition(record, recordQueueForPipeline.partitions)
170-
recordQueueForPipeline.publish(reserved.replace(pipelineMessage), partition)
170+
recordQueueForPipeline.publish(pipelineMessage, partition)
171171
}
172172
is DestinationRecordStreamComplete -> {
173173
manager.markEndOfStream(true)
174174
log.info { "Read COMPLETE for stream $stream" }
175-
recordQueueForPipeline.broadcast(reserved.replace(PipelineEndOfStream(stream)))
175+
recordQueueForPipeline.broadcast(PipelineEndOfStream(stream))
176176
reserved.release()
177177
}
178178
is DestinationRecordStreamIncomplete -> {
179179
manager.markEndOfStream(false)
180180
log.info { "Read INCOMPLETE for stream $stream" }
181-
recordQueueForPipeline.broadcast(reserved.replace(PipelineEndOfStream(stream)))
181+
recordQueueForPipeline.broadcast(PipelineEndOfStream(stream))
182182
reserved.release()
183183
}
184184
is DestinationFile -> {
@@ -310,7 +310,7 @@ interface InputConsumerTaskFactory {
310310

311311
// Required by new interface
312312
recordQueueForPipeline:
313-
PartitionedQueue<Reserved<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>>,
313+
PartitionedQueue<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>,
314314
loadPipeline: LoadPipeline?,
315315
partitioner: InputPartitioner,
316316
openStreamQueue: QueueWriter<DestinationStream>,
@@ -333,7 +333,7 @@ class DefaultInputConsumerTaskFactory(
333333

334334
// Required by new interface
335335
recordQueueForPipeline:
336-
PartitionedQueue<Reserved<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>>,
336+
PartitionedQueue<PipelineEvent<StreamKey, DestinationRecordAirbyteValue>>,
337337
loadPipeline: LoadPipeline?,
338338
partitioner: InputPartitioner,
339339
openStreamQueue: QueueWriter<DestinationStream>,

airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/task/internal/LoadPipelineStepTask.kt

+53-24
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ import io.airbyte.cdk.load.pipeline.BatchUpdate
1818
import io.airbyte.cdk.load.pipeline.OutputPartitioner
1919
import io.airbyte.cdk.load.pipeline.PipelineFlushStrategy
2020
import io.airbyte.cdk.load.state.CheckpointId
21-
import io.airbyte.cdk.load.state.Reserved
2221
import io.airbyte.cdk.load.task.OnEndOfSync
2322
import io.airbyte.cdk.load.task.Task
2423
import io.airbyte.cdk.load.task.TerminalCondition
@@ -34,7 +33,7 @@ data class RangeState<S>(
3433
/** A long-running task that actually implements a load pipeline step. */
3534
class LoadPipelineStepTask<S : AutoCloseable, K1 : WithStream, T, K2 : WithStream, U : Any>(
3635
private val batchAccumulator: BatchAccumulator<S, K1, T, U>,
37-
private val inputFlow: Flow<Reserved<PipelineEvent<K1, T>>>,
36+
private val inputFlow: Flow<PipelineEvent<K1, T>>,
3837
private val batchUpdateQueue: QueueWriter<BatchUpdate>,
3938
private val outputPartitioner: OutputPartitioner<K1, T, K2, U>?,
4039
private val outputQueue: PartitionedQueue<PipelineEvent<K2, U>>?,
@@ -44,11 +43,11 @@ class LoadPipelineStepTask<S : AutoCloseable, K1 : WithStream, T, K2 : WithStrea
4443
override val terminalCondition: TerminalCondition = OnEndOfSync
4544

4645
override suspend fun execute() {
47-
inputFlow.fold(mutableMapOf<K1, RangeState<S>>()) { stateStore, reservation ->
46+
inputFlow.fold(mutableMapOf<K1, RangeState<S>>()) { stateStore, input ->
4847
try {
49-
when (val input = reservation.value) {
48+
when (input) {
5049
is PipelineMessage -> {
51-
// Fetch and update the local state associated with the current batch.
50+
// Get or create the accumulator state associated w/ the input key.
5251
val state =
5352
stateStore
5453
.getOrPut(input.key) {
@@ -57,43 +56,73 @@ class LoadPipelineStepTask<S : AutoCloseable, K1 : WithStream, T, K2 : WithStrea
5756
)
5857
}
5958
.let { it.copy(inputCount = it.inputCount + 1) }
60-
val (newState, output) =
59+
60+
// Accumulate the input and get the new state and output.
61+
val (newStateMaybe, outputMaybe) =
6162
batchAccumulator.accept(
6263
input.value,
6364
state.state,
6465
)
65-
reservation.release() // TODO: Accumulate and release when persisted
66+
/** TODO: Make this impossible at the return type level */
67+
if (newStateMaybe == null && outputMaybe == null) {
68+
throw IllegalStateException(
69+
"BatchAccumulator must return a new state or an output"
70+
)
71+
}
72+
73+
// Update bookkeeping metadata
74+
input
75+
.postProcessingCallback() // TODO: Accumulate and release when persisted
6676
input.checkpointCounts.forEach {
6777
state.checkpointCounts.merge(it.key, it.value) { old, new -> old + new }
6878
}
6979

70-
// If the accumulator did not produce a result, check if we should flush.
71-
// If so, use the result of a finish call as the output.
72-
val finalOutput =
73-
output
74-
?: if (flushStrategy?.shouldFlush(state.inputCount) == true) {
75-
batchAccumulator.finish(newState)
80+
// Finalize the state and output
81+
val (finalState, finalOutput) =
82+
if (outputMaybe == null) {
83+
// Possibly force an output (and if so, discard the state)
84+
if (flushStrategy?.shouldFlush(state.inputCount) == true) {
85+
val finalOutput = batchAccumulator.finish(newStateMaybe!!)
86+
Pair(null, finalOutput)
7687
} else {
77-
null
88+
Pair(newStateMaybe, null)
7889
}
90+
} else {
91+
// Otherwise, just use what we were given
92+
Pair(newStateMaybe, outputMaybe)
93+
}
7994

80-
if (finalOutput != null) {
81-
// Publish the emitted output and evict the state.
82-
handleOutput(input.key, state.checkpointCounts, finalOutput)
83-
stateStore.remove(input.key)
95+
// Publish the output if there is one & reset the input count
96+
val inputCount =
97+
if (finalOutput != null) {
98+
// Publish the emitted output and clear the accumulated checkpoint counts.
99+
handleOutput(input.key, state.checkpointCounts, finalOutput)
100+
state.checkpointCounts.clear()
101+
0
102+
} else {
103+
state.inputCount
104+
}
105+
106+
// Update the state if `accept` returned a new state, otherwise evict.
107+
if (finalState != null) {
108+
// If accept returned a new state, update the state store.
109+
stateStore[input.key] =
110+
state.copy(state = finalState, inputCount = inputCount)
84111
} else {
85-
// If there's no output yet, just update the local state.
86-
stateStore[input.key] = RangeState(newState, state.checkpointCounts)
112+
stateStore.remove(input.key)
87113
}
114+
88115
stateStore
89116
}
90117
is PipelineEndOfStream -> {
91118
// Give any key associated with the stream a chance to finish
92119
val keysToRemove = stateStore.keys.filter { it.stream == input.stream }
93120
keysToRemove.forEach { key ->
94121
stateStore.remove(key)?.let { stored ->
95-
val output = batchAccumulator.finish(stored.state)
96-
handleOutput(key, stored.checkpointCounts, output)
122+
if (stored.inputCount > 0) {
123+
val output = batchAccumulator.finish(stored.state)
124+
handleOutput(key, stored.checkpointCounts, output)
125+
}
97126
}
98127
}
99128

@@ -122,7 +151,7 @@ class LoadPipelineStepTask<S : AutoCloseable, K1 : WithStream, T, K2 : WithStrea
122151
// Only publish the output if there's a next step.
123152
outputQueue?.let {
124153
val outputKey = outputPartitioner!!.getOutputKey(inputKey, output)
125-
val message = PipelineMessage(checkpointCounts, outputKey, output)
154+
val message = PipelineMessage(checkpointCounts.toMap(), outputKey, output)
126155
val outputPart = outputPartitioner.getPart(outputKey, it.partitions)
127156
it.publish(message, outputPart)
128157
}
@@ -132,7 +161,7 @@ class LoadPipelineStepTask<S : AutoCloseable, K1 : WithStream, T, K2 : WithStrea
132161
val update =
133162
BatchStateUpdate(
134163
stream = inputKey.stream,
135-
checkpointCounts = checkpointCounts,
164+
checkpointCounts = checkpointCounts.toMap(),
136165
state = output.state
137166
)
138167
batchUpdateQueue.publish(update)

0 commit comments

Comments
 (0)