remove siblings& update golden riles

yoonhyejin · yoonhyejin · commit 99f77d142636 · 2025-03-14T17:39:02.000+09:00
diff --git a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py
@@ -39,7 +39,6 @@
     StatefulIngestionConfigBase,
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
 from datahub.metadata.schema_classes import (
     AuditStampClass,
     ContainerClass,
@@ -63,11 +62,13 @@
     TagAssociationClass,
     TagPropertiesClass,
     TimeStampClass,
+    UpstreamClass,
+    UpstreamLineageClass,
     VersionPropertiesClass,
     VersionTagClass,
     _Aspect,
 )
-from datahub.metadata.urns import DataPlatformUrn, DatasetUrn, MlModelUrn, VersionSetUrn
+from datahub.metadata.urns import DataPlatformUrn, MlModelUrn, VersionSetUrn
 from datahub.sdk.container import Container
 from datahub.sdk.dataset import Dataset
 
@@ -207,14 +208,13 @@ def _create_workunit(self, urn: str, aspect: _Aspect) -> MetadataWorkUnit:
     def _get_experiment_workunits(self) -> Iterable[MetadataWorkUnit]:
         experiments = self._get_mlflow_experiments()
         for experiment in experiments:
-            if experiment.name == "lineage_platform_dataset_lineage_experiment":
-                yield from self._get_experiment_container_workunit(experiment)
+            yield from self._get_experiment_container_workunit(experiment)
 
-                runs = self._get_mlflow_runs_from_experiment(experiment)
-                if runs:
-                    for run in runs:
-                        yield from self._get_run_workunits(experiment, run)
-                        yield from self._get_dataset_input_workunits(run)
+            runs = self._get_mlflow_runs_from_experiment(experiment)
+            if runs:
+                for run in runs:
+                    yield from self._get_run_workunits(experiment, run)
+                    yield from self._get_dataset_input_workunits(run)
 
     def _get_experiment_custom_properties(self, experiment):
         experiment_custom_props = getattr(experiment, "tags", {}) or {}
@@ -271,7 +271,6 @@ def _get_dataset_schema(self, schema: str) -> Optional[List[Tuple[str, str]]]:
             print("Failed to parse schema JSON")
             return None
 
-        # Check for mlflow_colspec and extract field information
         if "mlflow_colspec" in schema_dict:
             try:
                 return [
@@ -281,12 +280,9 @@ def _get_dataset_schema(self, schema: str) -> Optional[List[Tuple[str, str]]]:
             except (KeyError, TypeError):
                 return None
 
-        # If we reach here, schema doesn't have the expected structure
         return None
 
     def _get_dataset_platform_from_source_type(self, source_type):
-        # source_type_to_platform = {}
-        # TODO: add ingestion config for this
         if source_type == "gs":
             return "gcs"
         return source_type
@@ -318,40 +314,36 @@ def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
                 dataset_reference_urns.append(str(local_dataset_reference.urn))
 
             else:
-                # workaround for setting siblings
-                hosted_dataset_reference_urn = DatasetUrn.create_from_ids(
-                    platform_id=self.platform, table_name=dataset.name, env="PROD"
-                )
                 hosted_dataset = Dataset(
                     platform=self._get_dataset_platform_from_source_type(source_type),
                     name=dataset.name,
                     schema=formatted_schema,
                     custom_properties=dataset_tags,
-                    extra_aspects=[
-                        Siblings(
-                            primary=True, siblings=[str(hosted_dataset_reference_urn)]
-                        )
-                    ],
                 )
-                # create dataset reference
                 hosted_dataset_reference = Dataset(
                     platform=self.platform,
                     name=dataset.name,
                     schema=formatted_schema,
                     custom_properties=dataset_tags,
-                    extra_aspects=[
-                        Siblings(primary=False, siblings=[str(hosted_dataset.urn)])
-                    ],
+                    upstreams=UpstreamLineageClass(
+                        upstreams=[
+                            UpstreamClass(dataset=str(hosted_dataset.urn), type="COPY")
+                        ]
+                    ),
                 )
                 dataset_reference_urns.append(str(hosted_dataset_reference.urn))
 
                 yield from hosted_dataset.as_workunits()
                 yield from hosted_dataset_reference.as_workunits()
 
         if dataset_reference_urns:
+            input_edges = [
+                EdgeClass(destinationUrn=dataset_referece_urn)
+                for dataset_referece_urn in dataset_reference_urns
+            ]
             yield MetadataChangeProposalWrapper(
                 entityUrn=str(run_urn),
-                aspect=DataProcessInstanceInputClass(inputs=dataset_reference_urns),
+                aspect=DataProcessInstanceInputClass(inputs=[], inputEdges=input_edges),
             ).as_workunit()
 
     def _get_run_workunits(
@@ -405,12 +397,7 @@ def _get_run_workunits(
             model_version_urn = self._make_ml_model_urn(model_versions[0])
             yield MetadataChangeProposalWrapper(
                 entityUrn=str(data_process_instance.urn),
-                aspect=DataProcessInstanceOutputClass(
-                    outputs=[],
-                    outputEdges=[
-                        EdgeClass(destinationUrn=model_version_urn),
-                    ],
-                ),
+                aspect=DataProcessInstanceOutputClass(outputs=[model_version_urn]),
             ).as_workunit()
 
         metrics = self._get_run_metrics(run)
diff --git a/metadata-ingestion/tests/integration/mlflow/mlflow_mcps_golden.json b/metadata-ingestion/tests/integration/mlflow/mlflow_mcps_golden.json
@@ -308,11 +308,8 @@
     "aspectName": "dataProcessInstanceOutput",
     "aspect": {
         "json": {
-            "outputs": [],
-            "outputEdges": [
-                {
-                    "destinationUrn": "urn:li:mlModel:(urn:li:dataPlatform:mlflow,test-model_1,PROD)"
-                }
+            "outputs": [
+                "urn:li:mlModel:(urn:li:dataPlatform:mlflow,test-model_1,PROD)"
             ]
         }
     },

Original file line number	Diff line number	Diff line change
`@@ -308,11 +308,8 @@`
`308`	`308`	`"aspectName": "dataProcessInstanceOutput",`
`309`	`309`	`"aspect": {`
`310`	`310`	`"json": {`
`311`		`- "outputs": [],`
`312`		`- "outputEdges": [`
`313`		`- {`
`314`		`- "destinationUrn": "urn:li:mlModel:(urn:li:dataPlatform:mlflow,test-model_1,PROD)"`
`315`		`- }`
	`311`	`+ "outputs": [`
	`312`	`+ "urn:li:mlModel:(urn:li:dataPlatform:mlflow,test-model_1,PROD)"`
`316`	`313`	`]`
`317`	`314`	`}`
`318`	`315`	`},`