@@ -301,13 +301,11 @@ def _get_fine_grained_lineages(
301
301
storage_urn : str ,
302
302
dataset_schema : SchemaMetadataClass ,
303
303
storage_schema : SchemaMetadataClass ,
304
- ) -> Optional [ List [ FineGrainedLineageClass ] ]:
304
+ ) -> Iterable [ FineGrainedLineageClass ]:
305
305
"""Generate column-level lineage between dataset and storage"""
306
306
307
307
if not self .config .include_column_lineage :
308
- return None
309
-
310
- fine_grained_lineages : List [FineGrainedLineageClass ] = []
308
+ return
311
309
312
310
for dataset_field in dataset_schema .fields :
313
311
dataset_path = dataset_field .fieldPath
@@ -320,82 +318,82 @@ def _get_fine_grained_lineages(
320
318
321
319
if matching_field :
322
320
if self .config .hive_storage_lineage_direction == "upstream" :
323
- fine_grained_lineages .append (
324
- FineGrainedLineageClass (
325
- upstreamType = FineGrainedLineageUpstreamTypeClass .FIELD_SET ,
326
- upstreams = [
327
- make_schema_field_urn (
328
- parent_urn = storage_urn ,
329
- field_path = matching_field .fieldPath ,
330
- )
331
- ],
332
- downstreamType = FineGrainedLineageDownstreamTypeClass .FIELD ,
333
- downstreams = [
334
- make_schema_field_urn (
335
- parent_urn = dataset_urn ,
336
- field_path = dataset_path ,
337
- )
338
- ],
339
- )
321
+ yield FineGrainedLineageClass (
322
+ upstreamType = FineGrainedLineageUpstreamTypeClass .FIELD_SET ,
323
+ upstreams = [
324
+ make_schema_field_urn (
325
+ parent_urn = storage_urn ,
326
+ field_path = matching_field .fieldPath ,
327
+ )
328
+ ],
329
+ downstreamType = FineGrainedLineageDownstreamTypeClass .FIELD ,
330
+ downstreams = [
331
+ make_schema_field_urn (
332
+ parent_urn = dataset_urn ,
333
+ field_path = dataset_path ,
334
+ )
335
+ ],
340
336
)
341
337
else :
342
- fine_grained_lineages .append (
343
- FineGrainedLineageClass (
344
- upstreamType = FineGrainedLineageUpstreamTypeClass .FIELD_SET ,
345
- upstreams = [
346
- make_schema_field_urn (
347
- parent_urn = dataset_urn ,
348
- field_path = dataset_path ,
349
- )
350
- ],
351
- downstreamType = FineGrainedLineageDownstreamTypeClass .FIELD ,
352
- downstreams = [
353
- make_schema_field_urn (
354
- parent_urn = storage_urn ,
355
- field_path = matching_field .fieldPath ,
356
- )
357
- ],
358
- )
338
+ yield FineGrainedLineageClass (
339
+ upstreamType = FineGrainedLineageUpstreamTypeClass .FIELD_SET ,
340
+ upstreams = [
341
+ make_schema_field_urn (
342
+ parent_urn = dataset_urn ,
343
+ field_path = dataset_path ,
344
+ )
345
+ ],
346
+ downstreamType = FineGrainedLineageDownstreamTypeClass .FIELD ,
347
+ downstreams = [
348
+ make_schema_field_urn (
349
+ parent_urn = storage_urn ,
350
+ field_path = matching_field .fieldPath ,
351
+ )
352
+ ],
359
353
)
360
354
361
- return fine_grained_lineages if fine_grained_lineages else None
362
-
363
355
def _create_lineage_mcp (
364
356
self ,
365
357
source_urn : str ,
366
358
target_urn : str ,
367
- fine_grained_lineages : Optional [List [FineGrainedLineageClass ]] = None ,
368
- ) -> MetadataChangeProposalWrapper :
359
+ fine_grained_lineages : Optional [Iterable [FineGrainedLineageClass ]] = None ,
360
+ ) -> Iterable [ MetadataWorkUnit ] :
369
361
"""Create lineage MCP between source and target datasets"""
370
362
363
+ lineages_list = (
364
+ list (fine_grained_lineages ) if fine_grained_lineages is not None else None
365
+ )
366
+
371
367
upstream_lineage = UpstreamLineageClass (
372
368
upstreams = [
373
369
UpstreamClass (dataset = source_urn , type = DatasetLineageTypeClass .COPY )
374
370
],
375
- fineGrainedLineages = fine_grained_lineages ,
371
+ fineGrainedLineages = lineages_list ,
376
372
)
377
373
378
- return MetadataChangeProposalWrapper (
379
- entityUrn = target_urn , aspect = upstream_lineage
374
+ yield MetadataWorkUnit (
375
+ id = f"{ source_urn } -{ target_urn } -lineage" ,
376
+ mcp = MetadataChangeProposalWrapper (
377
+ entityUrn = target_urn , aspect = upstream_lineage
378
+ ),
380
379
)
381
380
382
381
def get_storage_dataset_mcp (
383
382
self ,
384
383
storage_location : str ,
385
384
platform_instance : Optional [str ] = None ,
386
385
schema_metadata : Optional [SchemaMetadataClass ] = None ,
387
- ) -> Optional [ List [ MetadataChangeProposalWrapper ] ]:
386
+ ) -> Iterable [ MetadataWorkUnit ]:
388
387
"""
389
388
Generate MCPs for storage dataset if needed.
390
389
This creates the storage dataset entity in DataHub.
391
390
"""
392
391
393
- platform_instance = None
394
392
storage_info = StoragePathParser .parse_storage_location (
395
393
storage_location ,
396
394
)
397
395
if not storage_info :
398
- return None
396
+ return
399
397
400
398
platform , path = storage_info
401
399
platform_name = StoragePathParser .get_platform_name (platform )
@@ -414,52 +412,50 @@ def get_storage_dataset_mcp(
414
412
platform_instance = platform_instance ,
415
413
)
416
414
417
- mcps = []
418
-
415
+ # Dataset properties
419
416
props = DatasetPropertiesClass (name = path )
420
-
421
- mcps . append (
422
- MetadataChangeProposalWrapper (
417
+ yield MetadataWorkUnit (
418
+ id = f"storage- { storage_urn } -props" ,
419
+ mcp = MetadataChangeProposalWrapper (
423
420
entityUrn = storage_urn ,
424
421
aspect = props ,
425
- )
422
+ ),
426
423
)
427
424
428
- # Add platform instance
425
+ # Platform instance
429
426
platform_instance_aspect = self ._make_dataset_platform_instance (
430
427
platform = platform_name ,
431
428
instance = platform_instance ,
432
429
)
433
-
434
- mcps . append (
435
- MetadataChangeProposalWrapper (
430
+ yield MetadataWorkUnit (
431
+ id = f"storage- { storage_urn } -platform" ,
432
+ mcp = MetadataChangeProposalWrapper (
436
433
entityUrn = storage_urn , aspect = platform_instance_aspect
437
- )
434
+ ),
438
435
)
439
436
440
- # Add schema if available
437
+ # Schema if available
441
438
if schema_metadata :
442
439
storage_schema = SchemaMetadataClass (
443
440
schemaName = f"{ platform .value } _schema" ,
444
441
platform = f"urn:li:dataPlatform:{ platform .value } " ,
445
442
version = 0 ,
446
- fields = schema_metadata .fields , # Use the same fields as the table
443
+ fields = schema_metadata .fields ,
447
444
hash = "" ,
448
445
platformSchema = OtherSchemaClass (rawSchema = "" ),
449
446
)
450
-
451
- mcps . append (
452
- MetadataChangeProposalWrapper (
447
+ yield MetadataWorkUnit (
448
+ id = f"storage- { storage_urn } -schema" ,
449
+ mcp = MetadataChangeProposalWrapper (
453
450
entityUrn = storage_urn , aspect = storage_schema
454
- )
451
+ ),
455
452
)
456
453
457
- return mcps
458
454
except Exception as e :
459
455
logger .error (
460
456
f"Failed to create storage dataset MCPs for { storage_location } : { e } "
461
457
)
462
- return None
458
+ return
463
459
464
460
def get_lineage_mcp (
465
461
self ,
@@ -502,17 +498,11 @@ def get_lineage_mcp(
502
498
platform_instance = self .config .storage_platform_instance .lower ()
503
499
504
500
# Create storage dataset entity
505
- storage_mcps = self .get_storage_dataset_mcp (
501
+ yield from self .get_storage_dataset_mcp (
506
502
storage_location = storage_location ,
507
503
platform_instance = platform_instance ,
508
504
schema_metadata = dataset_schema ,
509
505
)
510
- if storage_mcps :
511
- for mcp in storage_mcps :
512
- yield MetadataWorkUnit (
513
- id = f"storage-{ storage_urn } " ,
514
- mcp = mcp ,
515
- )
516
506
517
507
# Get storage schema if available (implement based on storage system)
518
508
storage_schema = (
@@ -522,31 +512,28 @@ def get_lineage_mcp(
522
512
)
523
513
524
514
# Generate fine-grained lineage if schemas available
525
- fine_grained_lineages = None
526
- if dataset_schema and storage_schema :
527
- fine_grained_lineages = self ._get_fine_grained_lineages (
515
+ fine_grained_lineages = (
516
+ None
517
+ if not (dataset_schema and storage_schema )
518
+ else self ._get_fine_grained_lineages (
528
519
dataset_urn , storage_urn , dataset_schema , storage_schema
529
520
)
521
+ )
530
522
531
523
# Create lineage MCP
532
524
if self .config .hive_storage_lineage_direction == "upstream" :
533
- mcp = self ._create_lineage_mcp (
525
+ yield from self ._create_lineage_mcp (
534
526
source_urn = storage_urn ,
535
527
target_urn = dataset_urn ,
536
528
fine_grained_lineages = fine_grained_lineages ,
537
529
)
538
530
else :
539
- mcp = self ._create_lineage_mcp (
531
+ yield from self ._create_lineage_mcp (
540
532
source_urn = dataset_urn ,
541
533
target_urn = storage_urn ,
542
534
fine_grained_lineages = fine_grained_lineages ,
543
535
)
544
536
545
- yield MetadataWorkUnit (
546
- id = f"{ dataset_urn } -{ storage_urn } -lineage" ,
547
- mcp = mcp ,
548
- )
549
-
550
537
def _get_storage_schema (
551
538
self ,
552
539
storage_location : str ,
0 commit comments