Skip to content

Commit 1ed361d

Browse files
committed
Adding version set properly
1 parent b5e5101 commit 1ed361d

File tree

3 files changed

+85
-337
lines changed

3 files changed

+85
-337
lines changed

metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml

+1
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ sink:
1414
type: "datahub-rest"
1515
config:
1616
server: "http://localhost:8080"
17+
token: "${DATAHUB_GMS_TOKEN}"  # supply via environment; never commit a real access token

metadata-ingestion/src/datahub/ingestion/source/vertexai.py

+49-7
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from datahub._codegen.aspect import _Aspect
2626
from datahub.configuration.source_common import EnvConfigMixin
2727
from datahub.emitter.mcp import MetadataChangeProposalWrapper
28-
from datahub.emitter.mcp_builder import ProjectIdKey, gen_containers
28+
from datahub.emitter.mcp_builder import ContainerKey, ProjectIdKey, gen_containers
2929
from datahub.ingestion.api.common import PipelineContext
3030
from datahub.ingestion.api.decorators import (
3131
SupportStatus,
@@ -41,6 +41,8 @@
4141
from datahub.ingestion.source.mlflow import ContainerKeyWithId
4242
from datahub.metadata._schema_classes import MLHyperParamClass, MLMetricClass
4343
from datahub.metadata._urns.urn_defs import DataPlatformUrn
44+
from datahub.metadata._schema_classes import VersionPropertiesClass, MetadataAttributionClass
45+
from datahub.metadata.urns import VersionSetUrn,MlModelUrn
4446
from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import (
4547
MLTrainingRunProperties,
4648
)
@@ -54,6 +56,8 @@
5456
DataProcessInstanceRunResultClass,
5557
DataProcessRunStatusClass,
5658
DatasetPropertiesClass,
59+
MLHyperParamClass,
60+
MLMetricClass,
5761
MLModelDeploymentPropertiesClass,
5862
MLModelGroupPropertiesClass,
5963
MLModelPropertiesClass,
@@ -63,6 +67,7 @@
6367
TimeStampClass,
6468
VersionTagClass,
6569
)
70+
from datahub.metadata.urns import DataPlatformUrn
6671
from datahub.utilities.str_enum import StrEnum
6772
from datahub.utilities.time import datetime_to_ts_millis
6873

@@ -155,6 +160,7 @@ def __init__(self, ctx: PipelineContext, config: VertexAIConfig):
155160
self.client = aiplatform
156161
self.endpoints: Optional[Dict[str, List[Endpoint]]] = None
157162
self.datasets: Optional[Dict[str, VertexAiResourceNoun]] = None
163+
self.experiments: Optional[List[Experiment]] = None
158164

159165
def get_report(self) -> SourceReport:
160166
return self.report
@@ -173,9 +179,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
173179
# Fetch and Ingest Training Jobs
174180
yield from auto_workunit(self._get_training_jobs_mcps())
175181
# Fetch and Ingest Experiments
176-
yield from self._get_experiments_workunits()
182+
# yield from self._get_experiments_workunits()
177183
# Fetch and Ingest Experiment Runs
178-
yield from auto_workunit(self._get_experiment_runs_mcps())
184+
# yield from auto_workunit(self._get_experiment_runs_mcps())
179185

180186
def _get_experiments_workunits(self) -> Iterable[MetadataWorkUnit]:
181187
# List all experiments
@@ -225,12 +231,22 @@ def _get_experiment_run_metrics(self, run: ExperimentRun) -> List[MLMetricClass]
225231
MLMetricClass(name=k, value=str(v)) for k, v in run.get_metrics().items()
226232
]
227233

228-
def _get_create_time_from_run(self, run: ExperimentRun) -> Optional[int]:
234+
def _get_run_create_time_duration(self, run: ExperimentRun) -> (Optional[int], Optional[int]):
229235
executions = run.get_executions()
230236
if len(executions) == 0:
231-
return None
237+
return None, None
232238
min_create_time = min([exec.create_time for exec in executions])
233-
return int(min_create_time.timestamp() * 1000)
239+
max_upload_time = max([exec.update_time for exec in executions])
240+
create_time_millis = int(min_create_time.timestamp() * 1000)
241+
duration = max_upload_time.timestamp() * 1000 - create_time_millis
242+
return create_time_millis, duration
243+
244+
def _get_run_duration_millis(self, run: ExperimentRun) -> Optional[int]:
245+
executions = run.get_executions()
246+
if len(executions) == 0:
247+
return None
248+
max_upload_time = max([exec.update_time for exec in executions])
249+
234250

235251
def _get_run_result_status(self, status: str) -> Union[str, RunResultTypeClass]:
236252
if status == "COMPLETE":
@@ -268,7 +284,7 @@ def _gen_experiment_run_mcps(
268284
)
269285
run_urn = builder.make_data_process_instance_urn(run_name)
270286

271-
created_time = self._get_create_time_from_run(run)
287+
created_time, duration = self._get_run_create_time_duration(run)
272288
created_actor = f"urn:li:platformResource:{self.platform}"
273289

274290
aspects: List[_Aspect] = list()
@@ -298,8 +314,10 @@ def _gen_experiment_run_mcps(
298314
)
299315
)
300316

317+
state = run.get_state()
301318
run_result_type = self._get_run_result_status(run.get_state())
302319
if isinstance(run_result_type, RunResultTypeClass) and created_time is not None:
320+
303321
aspects.append(
304322
DataProcessInstanceRunEventClass(
305323
status=DataProcessRunStatusClass.STARTED,
@@ -308,6 +326,7 @@ def _gen_experiment_run_mcps(
308326
type=self._get_run_result_status(run.get_state()),
309327
nativeResultType=self.platform,
310328
),
329+
durationMillis=duration,
311330
)
312331
)
313332

@@ -729,6 +748,7 @@ def _gen_ml_model_mcps(
729748
model_version_name = f"{model_name}_{model_version.version_id}"
730749
model_urn = self._make_ml_model_urn(model_version, model_name=model_name)
731750

751+
732752
yield from MetadataChangeProposalWrapper.construct_many(
733753
entityUrn=model_urn,
734754
aspects=[
@@ -768,9 +788,31 @@ def _gen_ml_model_mcps(
768788
container=self._get_project_container().as_urn(),
769789
),
770790
SubTypesClass(typeNames=[MLTypes.MODEL]),
791+
VersionPropertiesClass(
792+
version=VersionTagClass(
793+
versionTag=str(model_version.version_id),
794+
metadataAttribution=MetadataAttributionClass(
795+
time=int(model_version.version_create_time.timestamp() * 1000),
796+
actor="urn:li:corpuser:datahub",
797+
),
798+
),
799+
versionSet=str(self._get_version_set_urn(model)),
800+
sortId=str(model_version.version_id).zfill(10),
801+
# aliases=[
802+
# VersionTagClass(versionTag=alias) for alias in model_version.aliases
803+
# ],
804+
)
771805
],
772806
)
773807

808+
def _get_version_set_urn(self, model: Model) -> VersionSetUrn:
    """
    Build the version-set URN for a model.

    The URN id is a DataHub GUID derived from only the platform and
    ``model.name``; the URN's entity type is the ML model entity type.

    :param model: model whose ``name`` feeds the GUID.
    :return: the constructed ``VersionSetUrn``.
    """
    guid = builder.datahub_guid({"platform": self.platform, "name": model.name})
    return VersionSetUrn(id=guid, entity_type=MlModelUrn.ENTITY_TYPE)
815+
774816
def _search_endpoint(self, model: Model) -> List[Endpoint]:
775817
"""
776818
Search for an endpoint associated with the model.

0 commit comments

Comments
 (0)