Skip to content

Commit 939bb2e

Browse files
authored
refactor(source-s3): migrate to CDK v3 (#41986)
1 parent 354da60 commit 939bb2e

File tree

6 files changed

+153
-140
lines changed

6 files changed

+153
-140
lines changed

airbyte-integrations/connectors/source-s3/integration_tests/cloud_spec.json

+16-10
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@
3131
},
3232
"globs": {
3333
"title": "Globs",
34+
"description": "The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look <a href=\"https://en.wikipedia.org/wiki/Glob_(programming)\">here</a>.",
3435
"default": ["**"],
3536
"order": 1,
36-
"description": "The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look <a href=\"https://en.wikipedia.org/wiki/Glob_(programming)\">here</a>.",
3737
"type": "array",
3838
"items": {
3939
"type": "string"
@@ -59,8 +59,8 @@
5959
"primary_key": {
6060
"title": "Primary Key",
6161
"description": "The column or columns (for a composite key) that serves as the unique identifier of a record. If empty, the primary key will default to the parser's default primary key.",
62-
"type": "string",
63-
"airbyte_hidden": true
62+
"airbyte_hidden": true,
63+
"type": "string"
6464
},
6565
"days_to_sync_if_history_is_full": {
6666
"title": "Days To Sync If History Is Full",
@@ -295,20 +295,20 @@
295295
"type": "string"
296296
},
297297
"skip_unprocessable_files": {
298-
"type": "boolean",
299-
"default": true,
300298
"title": "Skip Unprocessable Files",
301299
"description": "If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.",
302-
"always_show": true
300+
"default": true,
301+
"always_show": true,
302+
"type": "boolean"
303303
},
304304
"strategy": {
305-
"type": "string",
305+
"title": "Parsing Strategy",
306+
"description": "The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
307+
"default": "auto",
306308
"always_show": true,
307309
"order": 0,
308-
"default": "auto",
309-
"title": "Parsing Strategy",
310310
"enum": ["auto", "fast", "ocr_only", "hi_res"],
311-
"description": "The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf"
311+
"type": "string"
312312
},
313313
"processing": {
314314
"title": "Processing",
@@ -346,6 +346,12 @@
346346
"description": "When enabled, syncs will not validate or structure records against the stream's schema.",
347347
"default": false,
348348
"type": "boolean"
349+
},
350+
"recent_n_files_to_read_for_schema_discovery": {
351+
"title": "Files To Read For Schema Discovery",
352+
"description": "The number of recent files which will be used to discover the schema for this stream.",
353+
"exclusiveMinimum": 0,
354+
"type": "integer"
349355
}
350356
},
351357
"required": ["name", "format"]

airbyte-integrations/connectors/source-s3/integration_tests/spec.json

+22-16
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@
3131
},
3232
"globs": {
3333
"title": "Globs",
34+
"description": "The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look <a href=\"https://en.wikipedia.org/wiki/Glob_(programming)\">here</a>.",
3435
"default": ["**"],
3536
"order": 1,
36-
"description": "The pattern used to specify which files should be selected from the file system. For more information on glob pattern matching look <a href=\"https://en.wikipedia.org/wiki/Glob_(programming)\">here</a>.",
3737
"type": "array",
3838
"items": {
3939
"type": "string"
@@ -59,8 +59,8 @@
5959
"primary_key": {
6060
"title": "Primary Key",
6161
"description": "The column or columns (for a composite key) that serves as the unique identifier of a record. If empty, the primary key will default to the parser's default primary key.",
62-
"type": "string",
63-
"airbyte_hidden": true
62+
"airbyte_hidden": true,
63+
"type": "string"
6464
},
6565
"days_to_sync_if_history_is_full": {
6666
"title": "Days To Sync If History Is Full",
@@ -295,20 +295,20 @@
295295
"type": "string"
296296
},
297297
"skip_unprocessable_files": {
298-
"type": "boolean",
299-
"default": true,
300298
"title": "Skip Unprocessable Files",
301299
"description": "If true, skip files that cannot be parsed and pass the error message along as the _ab_source_file_parse_error field. If false, fail the sync.",
302-
"always_show": true
300+
"default": true,
301+
"always_show": true,
302+
"type": "boolean"
303303
},
304304
"strategy": {
305-
"type": "string",
305+
"title": "Parsing Strategy",
306+
"description": "The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
307+
"default": "auto",
306308
"always_show": true,
307309
"order": 0,
308-
"default": "auto",
309-
"title": "Parsing Strategy",
310310
"enum": ["auto", "fast", "ocr_only", "hi_res"],
311-
"description": "The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf"
311+
"type": "string"
312312
},
313313
"processing": {
314314
"title": "Processing",
@@ -346,6 +346,12 @@
346346
"description": "When enabled, syncs will not validate or structure records against the stream's schema.",
347347
"default": false,
348348
"type": "boolean"
349+
},
350+
"recent_n_files_to_read_for_schema_discovery": {
351+
"title": "Files To Read For Schema Discovery",
352+
"description": "The number of recent files which will be used to discover the schema for this stream.",
353+
"exclusiveMinimum": 0,
354+
"type": "integer"
349355
}
350356
},
351357
"required": ["name", "format"]
@@ -364,19 +370,19 @@
364370
"order": 2,
365371
"type": "string"
366372
},
373+
"role_arn": {
374+
"title": "AWS Role ARN",
375+
"description": "Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. Set the External ID to the Airbyte workspace ID, which can be found in the URL of this page.",
376+
"order": 6,
377+
"type": "string"
378+
},
367379
"aws_secret_access_key": {
368380
"title": "AWS Secret Access Key",
369381
"description": "In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper permissions. If accessing publicly available data, this field is not necessary.",
370382
"airbyte_secret": true,
371383
"order": 3,
372384
"type": "string"
373385
},
374-
"role_arn": {
375-
"title": "AWS Role ARN",
376-
"description": "Specifies the Amazon Resource Name (ARN) of an IAM role that you want to use to perform operations requested using this profile. Set the External ID to the Airbyte workspace ID, which can be found in the URL of this page.",
377-
"order": 6,
378-
"type": "string"
379-
},
380386
"endpoint": {
381387
"title": "Endpoint",
382388
"description": "Endpoint to an S3 compatible service. Leave empty to use AWS.",

airbyte-integrations/connectors/source-s3/metadata.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ data:
1010
connectorSubtype: file
1111
connectorType: source
1212
definitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2
13-
dockerImageTag: 4.6.3
13+
dockerImageTag: 4.7.0
1414
dockerRepository: airbyte/source-s3
1515
documentationUrl: https://docs.airbyte.com/integrations/sources/s3
1616
githubIssueLabel: source-s3

airbyte-integrations/connectors/source-s3/poetry.lock

+13-13
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

airbyte-integrations/connectors/source-s3/pyproject.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ requires = [ "poetry-core>=1.0.0",]
33
build-backend = "poetry.core.masonry.api"
44

55
[tool.poetry]
6-
version = "4.6.3"
6+
version = "4.7.0"
77
name = "source-s3"
88
description = "Source implementation for S3."
99
authors = [ "Airbyte <contact@airbyte.io>",]
@@ -30,7 +30,7 @@ source-s3 = "source_s3.run:run"
3030

3131
[tool.poetry.dependencies.airbyte-cdk]
3232
extras = [ "file-based",]
33-
version = "^2"
33+
version = "^3"
3434

3535
[tool.poetry.dependencies.smart-open]
3636
extras = [ "s3",]

0 commit comments

Comments
 (0)