Skip to content

Commit ff6b1bb

Browse files
author
Marius Posta
authored
bulk-cdk: fix bugs surfaced by CAT tests (#44543)
## What
I tried to get the CAT tests running on airbyte-enterprise today. A few of the failures surfaced bugs in the Bulk CDK, which does not always emit STATE or TRACE ERROR messages when required during a READ.

## How
- Emit TRACE ERROR messages if the configured streams are bad.
- Emit at least one STATE message for each stream with an input state.

## Review guide
Commit by commit.

## User Impact
None.

## Can this PR be safely reverted and rolled back?
<!-- * If unsure, leave it blank. -->
- [x] YES 💚
- [ ] NO ❌
1 parent 5d1b1cd commit ff6b1bb

File tree

11 files changed

+281
-12
lines changed

11 files changed

+281
-12
lines changed

airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/read/FeedReader.kt

+15-5
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ class FeedReader(
4141
log.info {
4242
"no more partitions to read for '${feed.label}' in round $partitionsCreatorID"
4343
}
44+
// Publish a checkpoint if applicable.
45+
maybeCheckpoint()
46+
// Publish stream completion.
4447
emitStreamStatus(AirbyteStreamStatusTraceMessage.AirbyteStreamStatus.COMPLETE)
4548
break
4649
}
@@ -279,18 +282,25 @@ class FeedReader(
279282
}
280283
} finally {
281284
// Publish a checkpoint if applicable.
282-
val stateMessages: List<AirbyteStateMessage> = root.stateManager.checkpoint()
283-
if (stateMessages.isNotEmpty()) {
284-
log.info { "checkpoint of ${stateMessages.size} state message(s)" }
285-
stateMessages.forEach(root.outputConsumer::accept)
286-
}
285+
maybeCheckpoint()
287286
}
288287
}
289288
}
290289

291290
private suspend fun ctx(nameSuffix: String): CoroutineContext =
292291
coroutineContext + ThreadRenamingCoroutineName("${feed.label}-$nameSuffix") + Dispatchers.IO
293292

293+
private fun maybeCheckpoint() {
294+
val stateMessages: List<AirbyteStateMessage> = root.stateManager.checkpoint()
295+
if (stateMessages.isEmpty()) {
296+
return
297+
}
298+
log.info { "checkpoint of ${stateMessages.size} state message(s)" }
299+
for (stateMessage in stateMessages) {
300+
root.outputConsumer.accept(stateMessage)
301+
}
302+
}
303+
294304
private fun emitStreamStatus(status: AirbyteStreamStatusTraceMessage.AirbyteStreamStatus) {
295305
if (feed is Stream) {
296306
root.outputConsumer.accept(

airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/read/StateManager.kt

+15-4
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,19 @@ class StateManager(
9696
initialState: OpaqueStateValue?,
9797
private val isCheckpointUnique: Boolean = true,
9898
) : StateManagerScopedToFeed {
99-
private var current: OpaqueStateValue? = initialState
100-
private var pending: OpaqueStateValue? = initialState
101-
private var pendingNumRecords: Long = 0L
99+
private var current: OpaqueStateValue?
100+
private var pending: OpaqueStateValue?
101+
private var isPending: Boolean
102+
private var pendingNumRecords: Long
103+
104+
init {
105+
synchronized(this) {
106+
current = initialState
107+
pending = initialState
108+
isPending = initialState != null
109+
pendingNumRecords = 0L
110+
}
111+
}
102112

103113
override fun current(): OpaqueStateValue? = synchronized(this) { current }
104114

@@ -108,13 +118,14 @@ class StateManager(
108118
) {
109119
synchronized(this) {
110120
pending = state
121+
isPending = true
111122
pendingNumRecords += numRecords
112123
}
113124
}
114125

115126
fun swap(): Pair<OpaqueStateValue?, Long>? {
116127
synchronized(this) {
117-
if (isCheckpointUnique && pendingNumRecords == 0L && pending == current) {
128+
if (isCheckpointUnique && !isPending) {
118129
return null
119130
}
120131
val returnValue: Pair<OpaqueStateValue?, Long> = pending to pendingNumRecords

airbyte-cdk/bulk/core/extract/src/main/kotlin/io/airbyte/cdk/read/StateManagerFactory.kt

+25
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,15 @@ import io.airbyte.cdk.output.FieldTypeMismatch
2222
import io.airbyte.cdk.output.InvalidIncrementalSyncMode
2323
import io.airbyte.cdk.output.InvalidPrimaryKey
2424
import io.airbyte.cdk.output.MultipleStreamsFound
25+
import io.airbyte.cdk.output.OutputConsumer
2526
import io.airbyte.cdk.output.StreamHasNoFields
2627
import io.airbyte.cdk.output.StreamNotFound
28+
import io.airbyte.protocol.models.v0.AirbyteErrorTraceMessage
2729
import io.airbyte.protocol.models.v0.AirbyteStream
30+
import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair
2831
import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog
2932
import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream
33+
import io.airbyte.protocol.models.v0.StreamDescriptor
3034
import io.airbyte.protocol.models.v0.SyncMode
3135
import jakarta.inject.Singleton
3236

@@ -37,6 +41,7 @@ import jakarta.inject.Singleton
3741
@Singleton
3842
class StateManagerFactory(
3943
val metadataQuerierFactory: MetadataQuerier.Factory<SourceConfiguration>,
44+
val outputConsumer: OutputConsumer,
4045
val handler: CatalogValidationFailureHandler,
4146
) {
4247
/** Generates a [StateManager] instance based on the provided inputs. */
@@ -101,14 +106,28 @@ class StateManagerFactory(
101106
val jsonSchemaProperties: JsonNode = stream.jsonSchema["properties"]
102107
val name: String = stream.name!!
103108
val namespace: String? = stream.namespace
109+
val streamDescriptor = StreamDescriptor().withName(name).withNamespace(namespace)
110+
val streamLabel: String = AirbyteStreamNameNamespacePair(name, namespace).toString()
104111
when (metadataQuerier.streamNames(namespace).filter { it == name }.size) {
105112
0 -> {
106113
handler.accept(StreamNotFound(name, namespace))
114+
outputConsumer.accept(
115+
AirbyteErrorTraceMessage()
116+
.withStreamDescriptor(streamDescriptor)
117+
.withFailureType(AirbyteErrorTraceMessage.FailureType.CONFIG_ERROR)
118+
.withMessage("Stream '$streamLabel' not found or not accessible in source.")
119+
)
107120
return null
108121
}
109122
1 -> Unit
110123
else -> {
111124
handler.accept(MultipleStreamsFound(name, namespace))
125+
outputConsumer.accept(
126+
AirbyteErrorTraceMessage()
127+
.withStreamDescriptor(streamDescriptor)
128+
.withFailureType(AirbyteErrorTraceMessage.FailureType.CONFIG_ERROR)
129+
.withMessage("Multiple streams '$streamLabel' found in source.")
130+
)
112131
return null
113132
}
114133
}
@@ -153,6 +172,12 @@ class StateManagerFactory(
153172
}
154173
if (streamFields.isEmpty()) {
155174
handler.accept(StreamHasNoFields(name, namespace))
175+
outputConsumer.accept(
176+
AirbyteErrorTraceMessage()
177+
.withStreamDescriptor(streamDescriptor)
178+
.withFailureType(AirbyteErrorTraceMessage.FailureType.CONFIG_ERROR)
179+
.withMessage("Stream '$streamLabel' has no accessible fields.")
180+
)
156181
return null
157182
}
158183

airbyte-cdk/bulk/core/extract/src/testFixtures/kotlin/io/airbyte/cdk/command/SyncsTestFixture.kt

+26-2
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,11 @@ data object SyncsTestFixture {
9595
connectionSupplier: Supplier<Connection>,
9696
prelude: (Connection) -> Unit,
9797
configuredCatalog: ConfiguredAirbyteCatalog,
98+
initialState: List<AirbyteStateMessage> = listOf(),
9899
vararg afterRead: AfterRead,
99100
) {
100101
connectionSupplier.get().use(prelude)
101-
var state: List<AirbyteStateMessage> = listOf()
102+
var state: List<AirbyteStateMessage> = initialState
102103
for (step in afterRead) {
103104
val readOutput: BufferingOutputConsumer =
104105
CliRunner.runSource("read", configPojo, configuredCatalog, state)
@@ -113,13 +114,15 @@ data object SyncsTestFixture {
113114
connectionSupplier: Supplier<Connection>,
114115
prelude: (Connection) -> Unit,
115116
configuredCatalogResource: String,
117+
initialStateResource: String?,
116118
vararg afterRead: AfterRead,
117119
) {
118120
testReads(
119121
configPojo,
120122
connectionSupplier,
121123
prelude,
122124
configuredCatalogFromResource(configuredCatalogResource),
125+
initialStateFromResource(initialStateResource),
123126
*afterRead,
124127
)
125128
}
@@ -169,6 +172,14 @@ data object SyncsTestFixture {
169172
ConfiguredAirbyteCatalog::class.java,
170173
)
171174

175+
fun initialStateFromResource(initialStateResource: String?): List<AirbyteStateMessage> =
176+
if (initialStateResource == null) {
177+
listOf()
178+
} else {
179+
val initialStateJson: String = ResourceUtils.readResource(initialStateResource)
180+
ValidatedJsonUtils.parseList(AirbyteStateMessage::class.java, initialStateJson)
181+
}
182+
172183
interface AfterRead {
173184
fun validate(actualOutput: BufferingOutputConsumer)
174185

@@ -182,7 +193,7 @@ data object SyncsTestFixture {
182193
object : AfterRead {
183194
override fun validate(actualOutput: BufferingOutputConsumer) {
184195
// State messages are timing-sensitive and therefore non-deterministic.
185-
// Ignore them.
196+
// Ignore them for now.
186197
val expectedWithoutStates: List<AirbyteMessage> =
187198
expectedMessages
188199
.filterNot { it.type == AirbyteMessage.Type.STATE }
@@ -193,6 +204,19 @@ data object SyncsTestFixture {
193204
.filterNot { it.type == AirbyteMessage.Type.STATE }
194205
.sortedBy { Jsons.writeValueAsString(it) }
195206
Assertions.assertIterableEquals(expectedWithoutStates, actualWithoutStates)
207+
// Check for state message counts (null if no state messages).
208+
val expectedCount: Double? =
209+
expectedMessages
210+
.filter { it.type == AirbyteMessage.Type.STATE }
211+
.mapNotNull { it.state?.sourceStats?.recordCount }
212+
.reduceRightOrNull { a: Double, b: Double -> a + b }
213+
val actualCount: Double? =
214+
actualOutput
215+
.messages()
216+
.filter { it.type == AirbyteMessage.Type.STATE }
217+
.mapNotNull { it.state?.sourceStats?.recordCount }
218+
.reduceRightOrNull { a: Double, b: Double -> a + b }
219+
Assertions.assertEquals(expectedCount, actualCount)
196220
}
197221

198222
override fun update(connection: Connection) {

airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/kotlin/io/airbyte/cdk/h2source/H2SourceIntegrationTest.kt

+44
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,50 @@ class H2SourceIntegrationTest {
146146
}
147147
}
148148

149+
@Test
150+
fun testReadStreamStateTooFarAhead() {
151+
H2TestFixture().use { h2: H2TestFixture ->
152+
val configPojo =
153+
H2SourceConfigurationJsonObject().apply {
154+
port = h2.port
155+
database = h2.database
156+
resumablePreferred = true
157+
}
158+
SyncsTestFixture.testReads(
159+
configPojo,
160+
h2::createConnection,
161+
Companion::prelude,
162+
"h2source/incremental-only-catalog.json",
163+
"h2source/state-too-far-ahead.json",
164+
SyncsTestFixture.AfterRead.Companion.fromExpectedMessages(
165+
"h2source/expected-messages-stream-too-far-ahead.json",
166+
),
167+
)
168+
}
169+
}
170+
171+
@Test
172+
fun testReadBadCatalog() {
173+
H2TestFixture().use { h2: H2TestFixture ->
174+
val configPojo =
175+
H2SourceConfigurationJsonObject().apply {
176+
port = h2.port
177+
database = h2.database
178+
resumablePreferred = true
179+
}
180+
SyncsTestFixture.testReads(
181+
configPojo,
182+
h2::createConnection,
183+
Companion::prelude,
184+
"h2source/bad-catalog.json",
185+
initialStateResource = null,
186+
SyncsTestFixture.AfterRead.Companion.fromExpectedMessages(
187+
"h2source/expected-messages-stream-bad-catalog.json",
188+
),
189+
)
190+
}
191+
}
192+
149193
companion object {
150194
@JvmStatic
151195
fun prelude(connection: Connection) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"streams": [
3+
{
4+
"stream": {
5+
"name": "FOO",
6+
"json_schema": {
7+
"type": "object",
8+
"properties": {
9+
"BAR": {
10+
"type": "string"
11+
}
12+
}
13+
},
14+
"supported_sync_modes": ["full_refresh", "incremental"],
15+
"source_defined_cursor": false,
16+
"default_cursor_field": [],
17+
"source_defined_primary_key": [],
18+
"is_resumable": false,
19+
"namespace": "PUBLIC"
20+
},
21+
"sync_mode": "incremental",
22+
"cursor_field": ["BAR"],
23+
"destination_sync_mode": "overwrite",
24+
"primary_key": []
25+
}
26+
]
27+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
[
2+
{
3+
"type": "LOG",
4+
"log": {
5+
"level": "WARN",
6+
"message": "StreamNotFound(streamName=FOO, streamNamespace=PUBLIC)"
7+
}
8+
},
9+
{
10+
"type": "TRACE",
11+
"trace": {
12+
"type": "ERROR",
13+
"emitted_at": 3.1336416e12,
14+
"error": {
15+
"stream_descriptor": {
16+
"name": "FOO",
17+
"namespace": "PUBLIC"
18+
},
19+
"message": "Stream 'PUBLIC_FOO' not found or not accessible in source.",
20+
"failure_type": "config_error"
21+
}
22+
}
23+
}
24+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
[
2+
{
3+
"type": "TRACE",
4+
"trace": {
5+
"type": "STREAM_STATUS",
6+
"emitted_at": 3.1336416e12,
7+
"stream_status": {
8+
"stream_descriptor": {
9+
"name": "EVENTS",
10+
"namespace": "PUBLIC"
11+
},
12+
"status": "STARTED"
13+
}
14+
}
15+
},
16+
{
17+
"type": "STATE",
18+
"state": {
19+
"type": "STREAM",
20+
"stream": {
21+
"stream_descriptor": {
22+
"name": "EVENTS",
23+
"namespace": "PUBLIC"
24+
},
25+
"stream_state": {
26+
"primary_key": {},
27+
"cursors": {
28+
"TS": "2024-04-30T00:00:00.000000-04:00"
29+
}
30+
}
31+
},
32+
"sourceStats": {
33+
"recordCount": 0.0
34+
}
35+
}
36+
},
37+
{
38+
"type": "TRACE",
39+
"trace": {
40+
"type": "STREAM_STATUS",
41+
"emitted_at": 3.1336416e12,
42+
"stream_status": {
43+
"stream_descriptor": {
44+
"name": "EVENTS",
45+
"namespace": "PUBLIC"
46+
},
47+
"status": "COMPLETE"
48+
}
49+
}
50+
}
51+
]

airbyte-cdk/bulk/toolkits/extract-jdbc/src/test/resources/h2source/expected-messages-stream-warm-start.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
}
5858
},
5959
"sourceStats": {
60-
"recordCount": 2.0
60+
"recordCount": 1.0
6161
}
6262
}
6363
},

0 commit comments

Comments
 (0)