Skip to content

Commit 99f77d1

Browse files
committed
remove siblings & update golden files
1 parent b9fca52 commit 99f77d1

File tree

2 files changed

+22
-38
lines changed

2 files changed

+22
-38
lines changed

metadata-ingestion/src/datahub/ingestion/source/mlflow.py

+20-33
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
StatefulIngestionConfigBase,
4040
StatefulIngestionSourceBase,
4141
)
42-
from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
4342
from datahub.metadata.schema_classes import (
4443
AuditStampClass,
4544
ContainerClass,
@@ -63,11 +62,13 @@
6362
TagAssociationClass,
6463
TagPropertiesClass,
6564
TimeStampClass,
65+
UpstreamClass,
66+
UpstreamLineageClass,
6667
VersionPropertiesClass,
6768
VersionTagClass,
6869
_Aspect,
6970
)
70-
from datahub.metadata.urns import DataPlatformUrn, DatasetUrn, MlModelUrn, VersionSetUrn
71+
from datahub.metadata.urns import DataPlatformUrn, MlModelUrn, VersionSetUrn
7172
from datahub.sdk.container import Container
7273
from datahub.sdk.dataset import Dataset
7374

@@ -207,14 +208,13 @@ def _create_workunit(self, urn: str, aspect: _Aspect) -> MetadataWorkUnit:
207208
def _get_experiment_workunits(self) -> Iterable[MetadataWorkUnit]:
208209
experiments = self._get_mlflow_experiments()
209210
for experiment in experiments:
210-
if experiment.name == "lineage_platform_dataset_lineage_experiment":
211-
yield from self._get_experiment_container_workunit(experiment)
211+
yield from self._get_experiment_container_workunit(experiment)
212212

213-
runs = self._get_mlflow_runs_from_experiment(experiment)
214-
if runs:
215-
for run in runs:
216-
yield from self._get_run_workunits(experiment, run)
217-
yield from self._get_dataset_input_workunits(run)
213+
runs = self._get_mlflow_runs_from_experiment(experiment)
214+
if runs:
215+
for run in runs:
216+
yield from self._get_run_workunits(experiment, run)
217+
yield from self._get_dataset_input_workunits(run)
218218

219219
def _get_experiment_custom_properties(self, experiment):
220220
experiment_custom_props = getattr(experiment, "tags", {}) or {}
@@ -271,7 +271,6 @@ def _get_dataset_schema(self, schema: str) -> Optional[List[Tuple[str, str]]]:
271271
print("Failed to parse schema JSON")
272272
return None
273273

274-
# Check for mlflow_colspec and extract field information
275274
if "mlflow_colspec" in schema_dict:
276275
try:
277276
return [
@@ -281,12 +280,9 @@ def _get_dataset_schema(self, schema: str) -> Optional[List[Tuple[str, str]]]:
281280
except (KeyError, TypeError):
282281
return None
283282

284-
# If we reach here, schema doesn't have the expected structure
285283
return None
286284

287285
def _get_dataset_platform_from_source_type(self, source_type):
288-
# source_type_to_platform = {}
289-
# TODO: add ingestion config for this
290286
if source_type == "gs":
291287
return "gcs"
292288
return source_type
@@ -318,40 +314,36 @@ def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
318314
dataset_reference_urns.append(str(local_dataset_reference.urn))
319315

320316
else:
321-
# workaround for setting siblings
322-
hosted_dataset_reference_urn = DatasetUrn.create_from_ids(
323-
platform_id=self.platform, table_name=dataset.name, env="PROD"
324-
)
325317
hosted_dataset = Dataset(
326318
platform=self._get_dataset_platform_from_source_type(source_type),
327319
name=dataset.name,
328320
schema=formatted_schema,
329321
custom_properties=dataset_tags,
330-
extra_aspects=[
331-
Siblings(
332-
primary=True, siblings=[str(hosted_dataset_reference_urn)]
333-
)
334-
],
335322
)
336-
# create dataset reference
337323
hosted_dataset_reference = Dataset(
338324
platform=self.platform,
339325
name=dataset.name,
340326
schema=formatted_schema,
341327
custom_properties=dataset_tags,
342-
extra_aspects=[
343-
Siblings(primary=False, siblings=[str(hosted_dataset.urn)])
344-
],
328+
upstreams=UpstreamLineageClass(
329+
upstreams=[
330+
UpstreamClass(dataset=str(hosted_dataset.urn), type="COPY")
331+
]
332+
),
345333
)
346334
dataset_reference_urns.append(str(hosted_dataset_reference.urn))
347335

348336
yield from hosted_dataset.as_workunits()
349337
yield from hosted_dataset_reference.as_workunits()
350338

351339
if dataset_reference_urns:
340+
input_edges = [
341+
EdgeClass(destinationUrn=dataset_referece_urn)
342+
for dataset_referece_urn in dataset_reference_urns
343+
]
352344
yield MetadataChangeProposalWrapper(
353345
entityUrn=str(run_urn),
354-
aspect=DataProcessInstanceInputClass(inputs=dataset_reference_urns),
346+
aspect=DataProcessInstanceInputClass(inputs=[], inputEdges=input_edges),
355347
).as_workunit()
356348

357349
def _get_run_workunits(
@@ -405,12 +397,7 @@ def _get_run_workunits(
405397
model_version_urn = self._make_ml_model_urn(model_versions[0])
406398
yield MetadataChangeProposalWrapper(
407399
entityUrn=str(data_process_instance.urn),
408-
aspect=DataProcessInstanceOutputClass(
409-
outputs=[],
410-
outputEdges=[
411-
EdgeClass(destinationUrn=model_version_urn),
412-
],
413-
),
400+
aspect=DataProcessInstanceOutputClass(outputs=[model_version_urn]),
414401
).as_workunit()
415402

416403
metrics = self._get_run_metrics(run)

metadata-ingestion/tests/integration/mlflow/mlflow_mcps_golden.json

+2-5
Original file line numberDiff line numberDiff line change
@@ -308,11 +308,8 @@
308308
"aspectName": "dataProcessInstanceOutput",
309309
"aspect": {
310310
"json": {
311-
"outputs": [],
312-
"outputEdges": [
313-
{
314-
"destinationUrn": "urn:li:mlModel:(urn:li:dataPlatform:mlflow,test-model_1,PROD)"
315-
}
311+
"outputs": [
312+
"urn:li:mlModel:(urn:li:dataPlatform:mlflow,test-model_1,PROD)"
316313
]
317314
}
318315
},

0 commit comments

Comments
 (0)