1
1
import logging
2
2
import traceback
3
- from typing import Callable , Iterable , List , Optional , Tuple
3
+ from typing import Callable , Dict , Iterable , List , Optional , Tuple , Union
4
4
5
5
import redshift_connector
6
6
7
+ from datahub .emitter import mce_builder
7
8
from datahub .ingestion .api .common import PipelineContext
8
9
from datahub .ingestion .api .workunit import MetadataWorkUnit
9
10
from datahub .ingestion .source .redshift .config import LineageMode , RedshiftConfig
15
16
from datahub .ingestion .source .redshift .redshift_schema import (
16
17
LineageRow ,
17
18
RedshiftDataDictionary ,
19
+ RedshiftSchema ,
20
+ RedshiftTable ,
21
+ RedshiftView ,
18
22
)
19
23
from datahub .ingestion .source .redshift .report import RedshiftReport
20
24
from datahub .ingestion .source .state .redundant_run_skip_handler import (
@@ -40,13 +44,14 @@ def __init__(
40
44
database : str ,
41
45
redundant_run_skip_handler : Optional [RedundantLineageRunSkipHandler ] = None ,
42
46
):
47
+ self .platform = "redshift"
43
48
self .config = config
44
49
self .report = report
45
50
self .context = context
46
51
47
52
self .database = database
48
53
self .aggregator = SqlParsingAggregator (
49
- platform = "redshift" ,
54
+ platform = self . platform ,
50
55
platform_instance = self .config .platform_instance ,
51
56
env = self .config .env ,
52
57
generate_lineage = True ,
@@ -70,7 +75,27 @@ def __init__(
70
75
self .report .lineage_end_time ,
71
76
) = self ._lineage_v1 .get_time_window ()
72
77
73
- def build (self , connection : redshift_connector .Connection ) -> None :
78
+ def build (
79
+ self ,
80
+ connection : redshift_connector .Connection ,
81
+ all_tables : Dict [str , Dict [str , List [Union [RedshiftView , RedshiftTable ]]]],
82
+ db_schemas : Dict [str , Dict [str , RedshiftSchema ]],
83
+ ) -> None :
84
+ # Assume things not in `all_tables` as temp tables.
85
+ known_urns = set (
86
+ DatasetUrn .create_from_ids (
87
+ self .platform ,
88
+ f"{ db } .{ schema } .{ table .name } " ,
89
+ env = self .config .env ,
90
+ platform_instance = self .config .platform_instance ,
91
+ ).urn ()
92
+ for db , schemas in all_tables .items ()
93
+ for schema , tables in schemas .items ()
94
+ for table in tables
95
+ )
96
+ self .aggregator .is_temp_table = lambda urn : urn not in known_urns
97
+
98
+ # Handle all the temp tables up front.
74
99
if self .config .resolve_temp_table_in_lineage :
75
100
for temp_row in self ._lineage_v1 .get_temp_tables (connection = connection ):
76
101
self .aggregator .add_observed_query (
@@ -173,7 +198,8 @@ def build(self, connection: redshift_connector.Connection) -> None:
173
198
connection = connection ,
174
199
)
175
200
176
- # TODO add lineage for external tables
201
+ # Populate lineage for external tables.
202
+ self ._process_external_tables (all_tables = all_tables , db_schemas = db_schemas )
177
203
178
204
def _populate_lineage_agg (
179
205
self ,
@@ -214,13 +240,13 @@ def _process_sql_parser_lineage(self, lineage_row: LineageRow) -> None:
214
240
215
241
def _process_stl_scan_lineage (self , lineage_row : LineageRow ) -> None :
216
242
target = DatasetUrn .create_from_ids (
217
- "redshift" ,
243
+ self . platform ,
218
244
f"{ self .database } .{ lineage_row .target_schema } .{ lineage_row .target_table } " ,
219
245
env = self .config .env ,
220
246
platform_instance = self .config .platform_instance ,
221
247
)
222
248
source = DatasetUrn .create_from_ids (
223
- "redshift" ,
249
+ self . platform ,
224
250
f"{ self .database } .{ lineage_row .source_schema } .{ lineage_row .source_table } " ,
225
251
env = self .config .env ,
226
252
platform_instance = self .config .platform_instance ,
@@ -246,7 +272,7 @@ def _process_view_lineage(self, lineage_row: LineageRow) -> None:
246
272
f"{ self .database } .{ lineage_row .target_schema } .{ lineage_row .target_table } "
247
273
)
248
274
target = DatasetUrn .create_from_ids (
249
- "redshift" ,
275
+ self . platform ,
250
276
target_name ,
251
277
env = self .config .env ,
252
278
platform_instance = self .config .platform_instance ,
@@ -275,7 +301,7 @@ def _process_copy_command(self, lineage_row: LineageRow) -> None:
275
301
if not lineage_row .target_schema or not lineage_row .target_table :
276
302
return
277
303
target = DatasetUrn .create_from_ids (
278
- "redshift" ,
304
+ self . platform ,
279
305
f"{ self .database } .{ lineage_row .target_schema } .{ lineage_row .target_table } " ,
280
306
env = self .config .env ,
281
307
platform_instance = self .config .platform_instance ,
@@ -299,7 +325,7 @@ def _process_unload_command(self, lineage_row: LineageRow) -> None:
299
325
if not lineage_row .source_schema or not lineage_row .source_table :
300
326
return
301
327
source = DatasetUrn .create_from_ids (
302
- "redshift" ,
328
+ self . platform ,
303
329
f"{ self .database } .{ lineage_row .source_schema } .{ lineage_row .source_table } " ,
304
330
env = self .config .env ,
305
331
platform_instance = self .config .platform_instance ,
@@ -309,6 +335,41 @@ def _process_unload_command(self, lineage_row: LineageRow) -> None:
309
335
upstream_urn = source .urn (), downstream_urn = output_urn
310
336
)
311
337
338
+ def _process_external_tables (
339
+ self ,
340
+ all_tables : Dict [str , Dict [str , List [Union [RedshiftView , RedshiftTable ]]]],
341
+ db_schemas : Dict [str , Dict [str , RedshiftSchema ]],
342
+ ) -> None :
343
+ for schema_name , tables in all_tables [self .database ].items ():
344
+ for table in tables :
345
+ if table .type == "EXTERNAL_TABLE" :
346
+ schema = db_schemas [self .database ][schema_name ]
347
+
348
+ # external_db_params = schema.option
349
+ upstream_platform = schema .type .lower ()
350
+
351
+ table_urn = mce_builder .make_dataset_urn_with_platform_instance (
352
+ self .platform ,
353
+ f"{ self .database } .{ schema_name } .{ table .name } " ,
354
+ platform_instance = self .config .platform_instance ,
355
+ env = self .config .env ,
356
+ )
357
+ upstream_urn = mce_builder .make_dataset_urn_with_platform_instance (
358
+ upstream_platform ,
359
+ f"{ schema .external_database } .{ table .name } " ,
360
+ platform_instance = (
361
+ self .config .platform_instance_map .get (upstream_platform )
362
+ if self .config .platform_instance_map
363
+ else None
364
+ ),
365
+ env = self .config .env ,
366
+ )
367
+
368
+ self .aggregator .add_known_lineage_mapping (
369
+ upstream_urn = upstream_urn ,
370
+ downstream_urn = table_urn ,
371
+ )
372
+
312
373
def generate (self ) -> Iterable [MetadataWorkUnit ]:
313
374
for mcp in self .aggregator .gen_metadata ():
314
375
yield mcp .as_workunit ()
0 commit comments