
Commit 7da16c8

WIP: Load CDK BulkLoad interface + MSSQL V2 Usage
1 parent: b9fcc52 · commit: 7da16c8

18 files changed: +588 −58 lines


airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/task/DestinationTaskLauncher.kt (+39 −19)

```diff
@@ -197,23 +197,6 @@ class DefaultDestinationTaskLauncher<K : WithStream>(
     }
 
     override suspend fun run() {
-        // Start the input consumer ASAP
-        log.info { "Starting input consumer task" }
-        val inputConsumerTask =
-            inputConsumerTaskFactory.make(
-                catalog = catalog,
-                inputFlow = inputFlow,
-                recordQueueSupplier = recordQueueSupplier,
-                checkpointQueue = checkpointQueue,
-                fileTransferQueue = fileTransferQueue,
-                destinationTaskLauncher = this,
-                recordQueueForPipeline = recordQueueForPipeline,
-                loadPipeline = loadPipeline,
-                partitioner = partitioner,
-                openStreamQueue = openStreamQueue,
-            )
-        launch(inputConsumerTask)
-
         // Launch the client interface setup task
         log.info { "Starting startup task" }
         val setupTask = setupTaskFactory.make(this)
@@ -225,12 +208,29 @@ class DefaultDestinationTaskLauncher<K : WithStream>(
         }
 
         if (loadPipeline != null) {
-            log.info { "Setting up load pipeline" }
-            loadPipeline.start { launch(it) }
+            log.info { "Setup load pipeline" }
+            loadPipeline.start { task -> launch(task, withExceptionHandling = true) }
             log.info { "Launching update batch task" }
             val updateBatchTask = updateBatchTaskFactory.make(this)
             launch(updateBatchTask)
         } else {
+            // Start the input consumer ASAP
+            log.info { "Starting input consumer task" }
+            val inputConsumerTask =
+                inputConsumerTaskFactory.make(
+                    catalog = catalog,
+                    inputFlow = inputFlow,
+                    recordQueueSupplier = recordQueueSupplier,
+                    checkpointQueue = checkpointQueue,
+                    fileTransferQueue = fileTransferQueue,
+                    destinationTaskLauncher = this,
+                    recordQueueForPipeline = recordQueueForPipeline,
+                    loadPipeline = loadPipeline,
+                    partitioner = partitioner,
+                    openStreamQueue = openStreamQueue,
+                )
+            launch(inputConsumerTask)
+
             // TODO: pluggable file transfer
             if (!fileTransferEnabled) {
                 // Start a spill-to-disk task for each record stream
@@ -289,6 +289,26 @@ class DefaultDestinationTaskLauncher<K : WithStream>(
             catalog.streams.forEach { openStreamQueue.publish(it) }
             log.info { "Closing open stream queue" }
             openStreamQueue.close()
+        } else {
+            // When the pipeline is enabled, input consuming for
+            // each stream will wait on stream start to complete,
+            // but not on setup. This is the simplest way to make
+            // it do that.
+            log.info { "Setup complete, starting input consumer task" }
+            val inputConsumerTask =
+                inputConsumerTaskFactory.make(
+                    catalog = catalog,
+                    inputFlow = inputFlow,
+                    recordQueueSupplier = recordQueueSupplier,
+                    checkpointQueue = checkpointQueue,
+                    fileTransferQueue = fileTransferQueue,
+                    destinationTaskLauncher = this,
+                    recordQueueForPipeline = recordQueueForPipeline,
+                    loadPipeline = loadPipeline,
+                    partitioner = partitioner,
+                    openStreamQueue = openStreamQueue,
+                )
+            launch(inputConsumerTask)
         }
     }
```
airbyte-cdk/bulk/core/load/src/main/kotlin/io/airbyte/cdk/load/write/StreamStateStore.kt (+1 −1)

```diff
@@ -21,7 +21,7 @@ class StreamStateStore<S> {
         store[stream] = state
     }
 
-    fun get(stream: DestinationStream.Descriptor): S? {
+    operator fun get(stream: DestinationStream.Descriptor): S? {
        return store[stream]
    }
 }
```
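
The one-word change enables Kotlin's indexing syntax at call sites. A minimal illustration (the call sites are hypothetical, not from this commit):

```kotlin
// Before: explicit method call
val state = streamStateStore.get(stream.descriptor)
// After: the `operator` modifier additionally allows bracket syntax,
// mirroring the `store[stream] = state` write path above. The explicit
// .get(...) call still compiles as well.
val sameState = streamStateStore[stream.descriptor]
```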
(new file — build script; the file path was not captured in this view) (+6 −0)

```diff
@@ -0,0 +1,6 @@
+dependencies {
+    implementation project(':airbyte-cdk:bulk:core:bulk-cdk-core-base')
+    implementation project(':airbyte-cdk:bulk:core:bulk-cdk-core-load')
+
+    api project(':airbyte-cdk:bulk:toolkits:bulk-cdk-toolkit-load-object-storage')
+}
```
(new file: BulkLoadAccumulator.kt — full path not captured) (+45 −0)

```diff
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.load.pipeline.db
+
+import io.airbyte.cdk.load.file.object_storage.RemoteObject
+import io.airbyte.cdk.load.message.Batch
+import io.airbyte.cdk.load.message.WithBatchState
+import io.airbyte.cdk.load.message.WithStream
+import io.airbyte.cdk.load.pipeline.BatchAccumulator
+import io.airbyte.cdk.load.pipline.object_storage.LoadedObject
+import io.airbyte.cdk.load.write.db.BulkLoad
+import io.airbyte.cdk.load.write.db.BulkLoadFactory
+import io.micronaut.context.annotation.Requires
+import jakarta.inject.Singleton
+
+@Singleton
+@Requires(bean = BulkLoadFactory::class)
+class BulkLoadAccumulator<K : WithStream, T : RemoteObject<*>>(
+    val bulkLoad: BulkLoadFactory<K, T>
+) :
+    BatchAccumulator<
+        BulkLoadAccumulator<K, T>.State, K, LoadedObject<T>, BulkLoadAccumulator.LoadResult> {
+    inner class State(val bulkLoad: BulkLoad<K, T>) : AutoCloseable {
+        override fun close() {
+            bulkLoad.close()
+        }
+    }
+
+    data object LoadResult : WithBatchState {
+        override val state: Batch.State = Batch.State.COMPLETE
+    }
+
+    override suspend fun start(key: K, part: Int): State {
+        return State(bulkLoad.create(key))
+    }
+
+    override suspend fun accept(input: LoadedObject<T>, state: State): Pair<State?, LoadResult> {
+        state.bulkLoad.load(input.remoteObject)
+        return state to LoadResult
+    }
+
+    override suspend fun finish(state: State): LoadResult = LoadResult
+}
```
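
For orientation, a hedged sketch of how this accumulator is driven. The real driver is the LoadPipelineStepTask wired up in the next file; this loop is hypothetical and inferred only from the signatures above:

```kotlin
import io.airbyte.cdk.load.file.object_storage.RemoteObject
import io.airbyte.cdk.load.message.WithStream
import io.airbyte.cdk.load.pipeline.db.BulkLoadAccumulator
import io.airbyte.cdk.load.pipline.object_storage.LoadedObject

// Hypothetical driver for one key, assuming the start → accept* → finish
// lifecycle implied by the BatchAccumulator signatures above.
suspend fun <K : WithStream, T : RemoteObject<*>> driveOneKey(
    acc: BulkLoadAccumulator<K, T>,
    key: K,
    objects: List<LoadedObject<T>>,
) {
    val state = acc.start(key, part = 0) // creates a BulkLoad via the factory
    state.use {
        objects.forEach { obj -> acc.accept(obj, it) } // each calls bulkLoad.load(...)
        acc.finish(it) // terminal LoadResult, marked Batch.State.COMPLETE
    }
}
```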
(new file: BulkLoadIntoTableStep.kt — full path not captured) (+44 −0)

```diff
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.load.pipeline.db
+
+import io.airbyte.cdk.load.file.object_storage.RemoteObject
+import io.airbyte.cdk.load.message.PartitionedQueue
+import io.airbyte.cdk.load.message.PipelineEvent
+import io.airbyte.cdk.load.message.QueueWriter
+import io.airbyte.cdk.load.message.WithStream
+import io.airbyte.cdk.load.pipeline.BatchUpdate
+import io.airbyte.cdk.load.pipeline.LoadPipelineStep
+import io.airbyte.cdk.load.pipline.object_storage.LoadedObject
+import io.airbyte.cdk.load.task.internal.LoadPipelineStepTask
+import io.airbyte.cdk.load.write.db.BulkLoadFactory
+import io.micronaut.context.annotation.Requires
+import jakarta.inject.Named
+import jakarta.inject.Singleton
+
+@Singleton
+@Requires(bean = BulkLoadFactory::class)
+class BulkLoadIntoTableStep<K : WithStream, T : RemoteObject<*>>(
+    val bulkLoadAccumulator: BulkLoadAccumulator<K, T>,
+    val bulkLoad: BulkLoadFactory<K, T>,
+    @Named("objectLoaderOutputQueue")
+    val bulkLoadObjectQueue: PartitionedQueue<PipelineEvent<K, LoadedObject<T>>>,
+    @Named("batchStateUpdateQueue") val batchUpdateQueue: QueueWriter<BatchUpdate>,
+) : LoadPipelineStep {
+    override val numWorkers: Int = bulkLoad.maxNumConcurrentLoads
+
+    /** TODO: This should just be a task: no need for a whole accumulator pipeline here */
+    override fun taskForPartition(partition: Int): LoadPipelineStepTask<*, *, *, *, *> {
+        return LoadPipelineStepTask(
+            bulkLoadAccumulator,
+            bulkLoadObjectQueue.consume(partition),
+            batchUpdateQueue,
+            null,
+            null as PartitionedQueue<PipelineEvent<K, BulkLoadAccumulator.LoadResult>>?,
+            null,
+            partition
+        )
+    }
+}
```
(new file: BulkLoadObjectQueueFactory.kt — full path not captured) (+35 −0)

```diff
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.load.pipeline.db
+
+import io.airbyte.cdk.load.file.object_storage.RemoteObject
+import io.airbyte.cdk.load.message.ChannelMessageQueue
+import io.airbyte.cdk.load.message.PartitionedQueue
+import io.airbyte.cdk.load.message.PipelineEvent
+import io.airbyte.cdk.load.message.WithStream
+import io.airbyte.cdk.load.pipline.object_storage.LoadedObject
+import io.airbyte.cdk.load.write.db.BulkLoadFactory
+import io.micronaut.context.annotation.Factory
+import io.micronaut.context.annotation.Requires
+import io.micronaut.context.annotation.Secondary
+import jakarta.inject.Named
+import jakarta.inject.Singleton
+import kotlinx.coroutines.channels.Channel
+
+@Factory
+@Requires(bean = BulkLoadFactory::class)
+class BulkLoadObjectQueueFactory<K : WithStream, T : RemoteObject<*>>(
+    val bulkLoad: BulkLoadFactory<K, T>
+) {
+    @Singleton
+    @Named("objectLoaderOutputQueue")
+    @Secondary
+    fun bulkLoadObjectQueue(): PartitionedQueue<PipelineEvent<K, LoadedObject<T>>> =
+        PartitionedQueue(
+            (0 until bulkLoad.maxNumConcurrentLoads)
+                .map { ChannelMessageQueue<PipelineEvent<K, LoadedObject<T>>>(Channel(1)) }
+                .toTypedArray()
+        )
+}
```
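
A design note inferred from the code (the commit does not comment on it): each partition is backed by a Channel(1), so at most one finished object can be queued per concurrent loader. If table loads lag behind uploads, publishing to this queue suspends, which applies backpressure to the upload step rather than buffering staged objects without bound.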
(new file: BulkLoadPipeline.kt — full path not captured) (+25 −0)

```diff
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.load.pipeline.db
+
+import io.airbyte.cdk.load.file.object_storage.RemoteObject
+import io.airbyte.cdk.load.message.WithStream
+import io.airbyte.cdk.load.pipeline.LoadPipeline
+import io.airbyte.cdk.load.pipline.object_storage.ObjectLoaderPartStep
+import io.airbyte.cdk.load.pipline.object_storage.ObjectLoaderPipeline
+import io.airbyte.cdk.load.pipline.object_storage.ObjectLoaderUploadStep
+import io.airbyte.cdk.load.write.db.BulkLoadFactory
+import io.micronaut.context.annotation.Replaces
+import io.micronaut.context.annotation.Requires
+import jakarta.inject.Singleton
+
+@Singleton
+@Requires(bean = BulkLoadFactory::class)
+@Replaces(ObjectLoaderPipeline::class)
+class BulkLoadPipeline<K : WithStream, T : RemoteObject<*>>(
+    partStep: ObjectLoaderPartStep,
+    uploadStep: ObjectLoaderUploadStep<K, T>,
+    loadIntoTableStep: BulkLoadIntoTableStep<*, *>,
+) : LoadPipeline(listOf(partStep, uploadStep, loadIntoTableStep))
```
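
Worth noting (an inference from the annotations above, not commentary from the commit): because the class is annotated @Replaces(ObjectLoaderPipeline::class) and @Requires(bean = BulkLoadFactory::class), merely declaring a BulkLoadFactory bean swaps the whole pipeline from plain object loading to object loading plus table loads, reusing the existing part and upload steps unchanged.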
(new file: BulkLoad.kt — full path not captured) (+55 −0)

```diff
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.load.write.db
+
+import io.airbyte.cdk.load.file.object_storage.RemoteObject
+import io.airbyte.cdk.load.message.Batch
+import io.airbyte.cdk.load.message.WithStream
+import io.airbyte.cdk.load.write.object_storage.ObjectLoader
+
+/**
+ * [BulkLoad] is for the use case in which a destination first stages records in a temporary
+ * location, usually cloud object storage, then loads them into the destination database in bulk.
+ *
+ * To use, declare a singleton of type [BulkLoadFactory] and implement the [BulkLoadFactory.create]
+ * method.
+ *
+ * This strategy composes [ObjectLoader] with a post-processing step provided by the [load] method.
+ * As [ObjectLoader] makes loaded objects available, [load] is called on each one in sequence.
+ *
+ * [BulkLoadFactory.maxNumConcurrentLoads] determines the number of concurrent loads that can be in
+ * progress at once.
+ *
+ * The key type [K] determines how the destination will partition the work. By default,
+ * [io.airbyte.cdk.load.message.StreamKey] is provided and an interface using this type will just
+ * work. Specifically, no more than one [load] will ever be in progress per stream, but up to
+ * [BulkLoadFactory.maxNumConcurrentLoads] can be in progress at once.
+ *
+ * Additionally, the configuration values provided by [ObjectLoader] can be overridden on the
+ * factory and will work as documented.
+ *
+ * The factory method [BulkLoadFactory.create] will be called once per key, the first time that key
+ * is seen. The returned [BulkLoad] is guaranteed to be closed if it was created.
+ *
+ * To adjust this behavior, declare a named singleton "objectLoaderOutputPartitioner" using the
+ * desired key and/or partition strategy. (See
+ * [io.airbyte.cdk.load.pipline.object_storage.ObjectLoaderUploadStep])
+ *
+ * TODO: provide a method that allows the user to check for an available connection and defer work
+ * if it is not available.
+ */
+interface BulkLoad<K : WithStream, T : RemoteObject<*>> : AutoCloseable {
+    /** Called as each uploaded object becomes available. */
+    suspend fun load(remoteObject: T)
+}
+
+interface BulkLoadFactory<K : WithStream, T : RemoteObject<*>> : ObjectLoader {
+    val maxNumConcurrentLoads: Int
+    override val batchStateOnUpload: Batch.State
+        get() = Batch.State.PERSISTED
+
+    fun create(key: K): BulkLoad<K, T>
+}
```
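
To make the contract concrete, a hedged sketch of a connector-side implementation — not part of this commit. WarehouseClient and its copyIntoTable are invented stand-ins; only StreamKey, RemoteObject, BulkLoad, and BulkLoadFactory come from the CDK:

```kotlin
import io.airbyte.cdk.load.file.object_storage.RemoteObject
import io.airbyte.cdk.load.message.StreamKey
import io.airbyte.cdk.load.write.db.BulkLoad
import io.airbyte.cdk.load.write.db.BulkLoadFactory
import jakarta.inject.Singleton

// Hypothetical database client; a real connector supplies its own.
interface WarehouseClient {
    suspend fun copyIntoTable(key: StreamKey, obj: RemoteObject<*>)
}

@Singleton
class DemoBulkLoadFactory<T : RemoteObject<*>>(
    private val warehouse: WarehouseClient,
) : BulkLoadFactory<StreamKey, T> {
    // Allow up to four table loads in flight at once, across all streams.
    override val maxNumConcurrentLoads: Int = 4

    // Called once per stream key, the first time that key is seen.
    override fun create(key: StreamKey): BulkLoad<StreamKey, T> =
        object : BulkLoad<StreamKey, T> {
            // Invoked sequentially per stream as each staged object completes.
            override suspend fun load(remoteObject: T) {
                warehouse.copyIntoTable(key, remoteObject)
            }

            // Guaranteed to be closed if created; release per-stream resources here.
            override fun close() {}
        }
}
```

Because the key type here is StreamKey, the KDoc's guarantee applies as-is: no two load calls for the same stream overlap, while different streams load concurrently up to the configured limit.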
(new file: ObjectLoaderObjectPartitioner.kt — full path not captured) (+53 −0)

```diff
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.load.pipline.object_storage
+
+import io.airbyte.cdk.load.file.object_storage.Part
+import io.airbyte.cdk.load.file.object_storage.RemoteObject
+import io.airbyte.cdk.load.message.StreamKey
+import io.airbyte.cdk.load.pipeline.OutputPartitioner
+import io.airbyte.cdk.load.write.object_storage.ObjectLoader
+import io.micronaut.context.annotation.Requires
+import io.micronaut.context.annotation.Secondary
+import jakarta.inject.Named
+import jakarta.inject.Singleton
+import kotlin.math.abs
+import kotlin.random.Random
+
+/**
+ * The default output partitioner for the ObjectLoader pipeline. It will not actually be used unless
+ * an output queue for the loaded objects is provided. (See
+ * [io.airbyte.cdk.load.write.object_storage.ObjectLoader].)
+ *
+ * In that case, the default behavior will be to key and partition by stream. This means that the
+ * number of concurrent loaders can be chosen based on the available resources, but no individual
+ * stream will ever run concurrently.
+ */
+@Singleton
+@Secondary
+@Requires(bean = ObjectLoader::class)
+@Named("objectLoaderOutputPartitioner")
+class ObjectLoaderObjectPartitioner<T : RemoteObject<*>> :
+    OutputPartitioner<
+        ObjectKey,
+        Part,
+        StreamKey,
+        LoadedObject<T>,
+    > {
+    // TODO: Abstract this out to a round-robin partition generator
+    private var nextPartition = Random(System.currentTimeMillis()).nextInt(Int.MAX_VALUE)
+
+    override fun getPart(outputKey: StreamKey, numParts: Int): Int {
+        val part = nextPartition++
+        return if (part == Int.MIN_VALUE) {
+            0
+        } else {
+            abs(part) % numParts
+        }
+    }
+
+    override fun getOutputKey(inputKey: ObjectKey, output: LoadedObject<T>): StreamKey =
+        StreamKey(inputKey.stream)
+}
```
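
One subtlety in getPart worth spelling out (my gloss, not the author's): nextPartition starts non-negative and increments forever, so it eventually wraps to Int.MIN_VALUE, and in two's complement abs(Int.MIN_VALUE) is still Int.MIN_VALUE, which would make the modulo negative. The explicit guard maps that single value to partition 0. A minimal demonstration:

```kotlin
import kotlin.math.abs

fun main() {
    // Int.MIN_VALUE (-2^31) has no positive 32-bit counterpart,
    // so abs() returns it unchanged:
    check(abs(Int.MIN_VALUE) == Int.MIN_VALUE)
    // Without the guard, a negative partition index could result:
    check(abs(Int.MIN_VALUE) % 3 == -2)
}
```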
