|
16 | 16 | ClassificationHandler,
|
17 | 17 | classification_workunit_processor,
|
18 | 18 | )
|
| 19 | +from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage |
19 | 20 | from datahub.ingestion.source.common.subtypes import (
|
20 | 21 | DatasetContainerSubTypes,
|
21 | 22 | DatasetSubTypes,
|
|
35 | 36 | )
|
36 | 37 | from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader
|
37 | 38 | from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler
|
| 39 | +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery |
38 | 40 | from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
|
39 | 41 | from datahub.ingestion.source.snowflake.snowflake_schema import (
|
40 | 42 | SCHEMA_PARALLELISM,
|
|
65 | 67 | get_domain_wu,
|
66 | 68 | )
|
67 | 69 | from datahub.ingestion.source_report.ingestion_stage import (
|
| 70 | + EXTERNAL_TABLE_DDL_LINEAGE, |
68 | 71 | METADATA_EXTRACTION,
|
69 | 72 | PROFILING,
|
70 | 73 | )
|
|
96 | 99 | TimeType,
|
97 | 100 | )
|
98 | 101 | from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
|
99 |
| -from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator |
| 102 | +from datahub.sql_parsing.sql_parsing_aggregator import ( |
| 103 | + KnownLineageMapping, |
| 104 | + SqlParsingAggregator, |
| 105 | +) |
100 | 106 | from datahub.utilities.registries.domain_registry import DomainRegistry
|
101 | 107 | from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
|
102 | 108 |
|
@@ -180,7 +186,8 @@ def __init__(
|
180 | 186 |
|
181 | 187 | # These are populated as side-effects of get_workunits_internal.
|
182 | 188 | self.databases: List[SnowflakeDatabase] = []
|
183 |
| - self.aggregator: Optional[SqlParsingAggregator] = aggregator |
| 189 | + |
| 190 | + self.aggregator = aggregator |
184 | 191 |
|
185 | 192 | def get_connection(self) -> SnowflakeConnection:
|
186 | 193 | return self.connection
|
@@ -212,6 +219,19 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
|
212 | 219 | self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION)
|
213 | 220 | yield from self._process_database(snowflake_db)
|
214 | 221 |
|
| 222 | + self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE) |
| 223 | + discovered_tables: List[str] = [ |
| 224 | + self.identifiers.get_dataset_identifier( |
| 225 | + table_name, schema.name, db.name |
| 226 | + ) |
| 227 | + for db in self.databases |
| 228 | + for schema in db.schemas |
| 229 | + for table_name in schema.tables |
| 230 | + ] |
| 231 | + if self.aggregator: |
| 232 | + for entry in self._external_tables_ddl_lineage(discovered_tables): |
| 233 | + self.aggregator.add(entry) |
| 234 | + |
215 | 235 | except SnowflakePermissionError as e:
|
216 | 236 | self.structured_reporter.failure(
|
217 | 237 | GENERIC_PERMISSION_ERROR_KEY,
|
@@ -1082,3 +1102,33 @@ def get_fk_constraints_for_table(
|
1082 | 1102 |
|
1083 | 1103 | # Access to table but none of its constraints - is this possible ?
|
1084 | 1104 | return constraints.get(table_name, [])
|
| 1105 | + |
| 1106 | + # Handles the case for explicitly created external tables. |
| 1107 | + # NOTE: Snowflake does not log this information to the access_history table. |
def _external_tables_ddl_lineage(
    self, discovered_tables: List[str]
) -> Iterable[KnownLineageMapping]:
    """Yield S3 -> Snowflake lineage mappings for discovered external tables.

    Runs the query returned by ``SnowflakeQuery.show_external_tables()`` on
    ``self.connection``. For every returned row whose dataset identifier is
    in ``discovered_tables`` and whose ``location`` starts with ``s3://``,
    a ``KnownLineageMapping`` is yielded with the S3 location as the
    upstream and the external table as the downstream.

    Args:
        discovered_tables: Dataset identifiers, in the form produced by
            ``self.identifiers.get_dataset_identifier``, that lineage
            should be restricted to. Rows for other tables are skipped.

    Yields:
        One ``KnownLineageMapping`` per matching S3-backed external table.
        ``self.report.num_external_table_edges_scanned`` is incremented
        exactly once per yielded mapping.

    Note:
        Any ``Exception`` raised while querying or processing rows is
        reported through ``self.structured_reporter.warning`` instead of
        being propagated; mappings yielded before the failure are kept.
    """
    # Membership is tested once per result row, so build the lookup set once
    # instead of scanning the list every time.
    discovered = set(discovered_tables)
    external_tables_query: str = SnowflakeQuery.show_external_tables()
    try:
        for db_row in self.connection.query(external_tables_query):
            key = self.identifiers.get_dataset_identifier(
                db_row["name"], db_row["schema_name"], db_row["database_name"]
            )

            if key not in discovered:
                continue
            # Only S3 locations are turned into lineage edges.
            if not db_row["location"].startswith("s3://"):
                continue

            yield KnownLineageMapping(
                upstream_urn=make_s3_urn_for_lineage(
                    db_row["location"], self.config.env
                ),
                downstream_urn=self.identifiers.gen_dataset_urn(key),
            )
            # Count each emitted edge once; the previous code incremented the
            # counter a second time at the end of the loop body.
            self.report.num_external_table_edges_scanned += 1
    except Exception as e:
        self.structured_reporter.warning(
            "External table ddl lineage extraction failed",
            exc=e,
        )
0 commit comments