Skip to content

Commit 26cdef0

Browse files
committed
add comments
1 parent efad2f7 commit 26cdef0

File tree

1 file changed

+6
-2
lines changed
  • metadata-ingestion/src/datahub/ingestion/source

1 file changed

+6
-2
lines changed

metadata-ingestion/src/datahub/ingestion/source/mlflow.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -279,10 +279,11 @@ def _get_dataset_schema(self, schema: str) -> Optional[List[Tuple[str, str]]]:
279279
]
280280
except (KeyError, TypeError):
281281
return None
282-
282+
# If the schema could not be parsed into the expected format, return None
283283
return None
284284

285285
def _get_dataset_platform_from_source_type(self, source_type):
286+
# Manually map the MLflow source type to the DataHub platform name (e.g. "gs" -> "gcs")
286287
if source_type == "gs":
287288
return "gcs"
288289
return source_type
@@ -301,8 +302,10 @@ def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
301302
platform = self._get_dataset_platform_from_source_type(source_type)
302303
custom_properties = dataset_tags
303304
formatted_schema = self._get_dataset_schema(dataset.schema)
305+
# If the schema could not be formatted, pass the raw schema as a custom property
304306
if formatted_schema is None:
305307
custom_properties["schema"] = dataset.schema
308+
# If the dataset is local or code, we create a local dataset reference
306309
if source_type in ("local", "code"):
307310
local_dataset_reference = Dataset(
308311
platform=platform,
@@ -312,7 +315,7 @@ def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
312315
)
313316
yield from local_dataset_reference.as_workunits()
314317
dataset_reference_urns.append(str(local_dataset_reference.urn))
315-
318+
# Otherwise, we create a hosted dataset reference and a hosted dataset
316319
else:
317320
hosted_dataset = Dataset(
318321
platform=self._get_dataset_platform_from_source_type(source_type),
@@ -336,6 +339,7 @@ def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
336339
yield from hosted_dataset.as_workunits()
337340
yield from hosted_dataset_reference.as_workunits()
338341

342+
# Add the dataset references as upstreams for the run
339343
if dataset_reference_urns:
340344
input_edges = [
341345
EdgeClass(destinationUrn=dataset_referece_urn)

0 commit comments

Comments
 (0)