package io.airbyte.integrations.destination.s3_v2

import io.airbyte.cdk.load.command.DestinationCatalog
import io.airbyte.cdk.load.file.object_storage.PathFactory
import io.airbyte.cdk.load.file.object_storage.StreamingUpload
import io.airbyte.cdk.load.file.s3.S3Client
import io.airbyte.cdk.load.file.s3.S3Object
import io.airbyte.cdk.load.task.SelfTerminating
import io.airbyte.cdk.load.task.TerminalCondition
import io.airbyte.cdk.load.write.WriteOpOverride
import io.github.oshai.kotlinlogging.KotlinLogging
import jakarta.inject.Singleton
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicLong
import kotlin.random.Random
import kotlin.time.measureTime
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.ExperimentalCoroutinesApi
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.withContext

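/**
 * Bypasses the regular write path to benchmark raw multipart-upload throughput against S3.
 * It streams a reused buffer of random bytes to "mock-perf-test" objects, using the part size,
 * object size, worker count, and upload approach from [S3V2Configuration], then logs the
 * achieved MB/s.
 */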
@Singleton
class S3V2WriteOpOverride(
    private val client: S3Client,
    private val catalog: DestinationCatalog,
    private val config: S3V2Configuration<*>,
    private val pathFactory: PathFactory,
) : WriteOpOverride {
    private val log = KotlinLogging.logger { }

    override val terminalCondition: TerminalCondition = SelfTerminating

    @OptIn(ExperimentalCoroutinesApi::class)
    override suspend fun execute() = coroutineScope {
        // Generate a single random part up front and reuse it for every upload, so the test
        // measures S3 throughput rather than data generation.
        val prng = Random(System.currentTimeMillis())
        val randomPart = prng.nextBytes(config.partSizeBytes.toInt())
        val randomString = randomPart.take(32).joinToString("") { "%02x".format(it) }
        val stream = catalog.streams.first()
        val objectKey = pathFactory.getFinalDirectory(stream) + "/mock-perf-test-$randomString"

        // Integer division can drop a remainder, so recompute the number of bytes that will
        // actually be uploaded across all workers.
        val numParts = (config.objectSizeBytes / config.partSizeBytes).toInt()
        val partsPerWorker = numParts / config.numUploadWorkers
        val actualSizeBytes = partsPerWorker * config.numUploadWorkers * config.partSizeBytes

        log.info {
            "root key=$objectKey; part_size=${config.partSizeBytes}b; num_parts=$numParts (per_worker=$partsPerWorker); total_size=${actualSizeBytes}b; num_workers=${config.numUploadWorkers}"
        }

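        // Two strategies are benchmarked: "one_object_per_worker" gives each coroutine its own
        // multipart upload, while "distributed_parts" shuffles all parts across coroutines so
        // several workers feed the same upload concurrently.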
        val duration = measureTime {
            log.info { "Starting upload to $objectKey using approach ${config.approach}" }
            if (config.approach == "one_object_per_worker") {
                withContext(Dispatchers.IO.limitedParallelism(config.numUploadWorkers)) {
                    (0 until config.numUploadWorkers).map { workerIndex ->
                        async {
                            // Each worker owns a separate object and uploads its parts sequentially.
                            val workerKey = "$objectKey-worker-$workerIndex"
                            log.info { "Starting upload to $workerKey" }
                            val upload = client.startStreamingUpload(workerKey)
                            repeat(partsPerWorker) { partIndex ->
                                log.info { "Uploading part ${partIndex + 1} of $workerKey" }
                                upload.uploadPart(randomPart, partIndex + 1)
                            }
                            log.info { "Completing upload to $workerKey" }
                            upload.complete()
                        }
                    }.awaitAll()
                }
            } else if (config.approach == "distributed_parts") {
                withContext(Dispatchers.IO.limitedParallelism(config.numUploadWorkers)) {
                    // Start one multipart upload per worker key, then shuffle every
                    // (key, upload, partNumber) tuple so parts of the same object are
                    // spread across workers.
                    val workerKeys = (0 until config.numUploadWorkers).map { "$objectKey-worker-$it" }
                    val keysWithUploads = workerKeys.map { Pair(it, client.startStreamingUpload(it)) }
                    val keysWithUploadsAndParts =
                        keysWithUploads
                            .flatMap { (key, upload) ->
                                (0 until partsPerWorker).map { Triple(key, upload, it + 1) }
                            }
                            .shuffled()
                    // Count down the remaining parts per key so that whichever worker uploads
                    // the last part completes that upload.
                    val keyCounts =
                        ConcurrentHashMap(
                            workerKeys.associateWith { AtomicLong(partsPerWorker.toLong()) }
                        )
                    (0 until config.numUploadWorkers).map { workerIndex ->
                        async {
                            val range =
                                keysWithUploadsAndParts.slice(
                                    workerIndex * partsPerWorker until (workerIndex + 1) * partsPerWorker
                                )
                            range.forEach { (key, upload, part) ->
                                log.info { "[$workerIndex] Uploading part $part of $key" }
                                upload.uploadPart(randomPart, part)
                                if (keyCounts[key]!!.decrementAndGet() == 0L) {
                                    log.info { "[$workerIndex] Completing upload to $key" }
                                    upload.complete()
                                }
                            }
                        }
                    }.awaitAll()
                }
            } else {
                error("Unknown approach: ${config.approach}")
            }
        }
        // Use milliseconds (floored at 1ms) so sub-second runs don't divide by zero and the
        // throughput keeps fractional-second precision.
        val elapsedSeconds = duration.inWholeMilliseconds.coerceAtLeast(1) / 1000.0
        val mbs = actualSizeBytes / elapsedSeconds / 1024 / 1024
        log.info {
            "Uploaded $actualSizeBytes bytes in $duration (${"%.2f".format(mbs)} MB/s)"
        }
    }
}