@@ -75,14 +75,16 @@ def __init__(
75
75
self .report .lineage_end_time ,
76
76
) = self ._lineage_v1 .get_time_window ()
77
77
78
+ self .known_urns = set () # will be set later
79
+
78
80
def build (
79
81
self ,
80
82
connection : redshift_connector .Connection ,
81
83
all_tables : Dict [str , Dict [str , List [Union [RedshiftView , RedshiftTable ]]]],
82
84
db_schemas : Dict [str , Dict [str , RedshiftSchema ]],
83
85
) -> None :
84
86
# Assume things not in `all_tables` as temp tables.
85
- known_urns = set (
87
+ self . known_urns = set (
86
88
DatasetUrn .create_from_ids (
87
89
self .platform ,
88
90
f"{ db } .{ schema } .{ table .name } " ,
@@ -93,7 +95,7 @@ def build(
93
95
for schema , tables in schemas .items ()
94
96
for table in tables
95
97
)
96
- self .aggregator .is_temp_table = lambda urn : urn not in known_urns
98
+ self .aggregator .is_temp_table = lambda urn : urn not in self . known_urns
97
99
98
100
# Handle all the temp tables up front.
99
101
if self .config .resolve_temp_table_in_lineage :
@@ -238,13 +240,26 @@ def _process_sql_parser_lineage(self, lineage_row: LineageRow) -> None:
238
240
query_timestamp = lineage_row .timestamp ,
239
241
)
240
242
241
- def _process_stl_scan_lineage (self , lineage_row : LineageRow ) -> None :
243
def _make_filtered_target(self, lineage_row: LineageRow) -> Optional[DatasetUrn]:
    """Build the target dataset URN for a lineage row, filtered by known URNs.

    The URN is created from ``self.database`` plus the row's
    ``target_schema`` and ``target_table``, using the configured env and
    platform instance.

    Returns:
        The target ``DatasetUrn`` when its ``urn()`` value is present in
        ``self.known_urns``; otherwise ``None`` (a debug message is logged),
        so callers can skip emitting lineage for that row.
    """
    target = DatasetUrn.create_from_ids(
        self.platform,
        f"{self.database}.{lineage_row.target_schema}.{lineage_row.target_table}",
        env=self.config.env,
        platform_instance=self.config.platform_instance,
    )

    # Compute the URN once; it is used for both the membership test and the log.
    target_urn = target.urn()
    if target_urn not in self.known_urns:
        logger.debug(
            f"Skipping lineage for {target_urn} as it is not in known_urns"
        )
        # Explicit None keeps every return consistent with the Optional[...] type.
        return None

    return target
257
+
258
+ def _process_stl_scan_lineage (self , lineage_row : LineageRow ) -> None :
259
+ target = self ._make_filtered_target (lineage_row )
260
+ if not target :
261
+ return
262
+
248
263
source = DatasetUrn .create_from_ids (
249
264
self .platform ,
250
265
f"{ self .database } .{ lineage_row .source_schema } .{ lineage_row .source_table } " ,
@@ -268,15 +283,9 @@ def _process_view_lineage(self, lineage_row: LineageRow) -> None:
268
283
if ddl is None :
269
284
return
270
285
271
- target_name = (
272
- f"{ self .database } .{ lineage_row .target_schema } .{ lineage_row .target_table } "
273
- )
274
- target = DatasetUrn .create_from_ids (
275
- self .platform ,
276
- target_name ,
277
- env = self .config .env ,
278
- platform_instance = self .config .platform_instance ,
279
- )
286
+ target = self ._make_filtered_target (lineage_row )
287
+ if not target :
288
+ return
280
289
281
290
self .aggregator .add_view_definition (
282
291
view_urn = target ,
@@ -300,12 +309,9 @@ def _process_copy_command(self, lineage_row: LineageRow) -> None:
300
309
301
310
if not lineage_row .target_schema or not lineage_row .target_table :
302
311
return
303
- target = DatasetUrn .create_from_ids (
304
- self .platform ,
305
- f"{ self .database } .{ lineage_row .target_schema } .{ lineage_row .target_table } " ,
306
- env = self .config .env ,
307
- platform_instance = self .config .platform_instance ,
308
- )
312
+ target = self ._make_filtered_target (lineage_row )
313
+ if not target :
314
+ return
309
315
310
316
self .aggregator .add_known_lineage_mapping (
311
317
upstream_urn = s3_urn , downstream_urn = target .urn ()
@@ -330,6 +336,11 @@ def _process_unload_command(self, lineage_row: LineageRow) -> None:
330
336
env = self .config .env ,
331
337
platform_instance = self .config .platform_instance ,
332
338
)
339
+ if source .urn () not in self .known_urns :
340
+ logger .debug (
341
+ f"Skipping unload lineage for { source .urn ()} as it is not in known_urns"
342
+ )
343
+ return
333
344
334
345
self .aggregator .add_known_lineage_mapping (
335
346
upstream_urn = source .urn (), downstream_urn = output_urn
0 commit comments