Commit e52a4de

fix(ingest/databricks): Fix profiling (#12060)
1 parent 1570139 commit e52a4de

File tree

4 files changed: +462 -1 lines changed

metadata-ingestion/src/datahub/emitter/rest_emitter.py (+16 -1)

@@ -291,6 +291,7 @@ def emit_mcps(
         mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
     ) -> int:
+        logger.debug("Attempting to emit batch mcps")
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
@@ -303,15 +304,22 @@ def emit_mcps(
         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
+            logger.debug(
+                f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')})"
+            )
 
             if (
                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
             ):
+                logger.debug("Decided to create new chunk")
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
+        logger.debug(
+            f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
+        )
 
         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -338,8 +346,15 @@ def emit_usage(self, usageStats: UsageAggregation) -> None:
 
     def _emit_generic(self, url: str, payload: str) -> None:
         curl_command = make_curl_command(self._session, "POST", url, payload)
+        payload_size = len(payload)
+        if payload_size > INGEST_MAX_PAYLOAD_BYTES:
+            # Since we know the total payload size here, we could simply avoid sending such a payload at all and report a warning; with the current approach we are going to cause the whole ingestion to fail.
+            logger.warning(
+                f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
+            )
         logger.debug(
-            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
+            "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
+            payload_size,
             curl_command,
         )
         try:
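
For context on the batching behavior this logging instruments: emit_mcps serializes each MCP and packs the serialized objects into chunks bounded both by a total byte budget (INGEST_MAX_PAYLOAD_BYTES) and by a maximum number of items per batch (BATCH_INGEST_MAX_PAYLOAD_LENGTH), starting a new chunk whenever the next object would overflow either limit. A minimal standalone sketch of that packing rule; the function name and limit values below are illustrative, not the emitter's actual API:

import json
from typing import Any, Dict, List

# Illustrative limits only; the real constants live in datahub.emitter.rest_emitter.
MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
MAX_ITEMS_PER_CHUNK = 200


def chunk_mcp_objs(mcp_objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    """Pack serialized MCP objects into chunks that respect both size and count limits."""
    chunks: List[List[Dict[str, Any]]] = [[]]
    current_size = 0
    for obj in mcp_objs:
        obj_size = len(json.dumps(obj))
        # Start a new chunk if this object would push the current one over either limit.
        if current_size + obj_size > MAX_PAYLOAD_BYTES or len(chunks[-1]) >= MAX_ITEMS_PER_CHUNK:
            chunks.append([])
            current_size = 0
        chunks[-1].append(obj)
        current_size += obj_size
    return chunks
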
metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py (new file, +96)
@@ -0,0 +1,96 @@
+import json
+import logging
+from typing import Iterable, List
+
+from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetProfileClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class EnsureAspectSizeProcessor:
+    def __init__(
+        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+    ):
+        self.report = report
+        self.payload_constraint = payload_constraint
+
+    def ensure_dataset_profile_size(
+        self, dataset_urn: str, profile: DatasetProfileClass
+    ) -> None:
+        """
+        This is a rather arbitrary approach to ensuring the dataset profile aspect does not exceed the allowed size;
+        it might be adjusted in the future.
+        """
+        sample_fields_size = 0
+        if profile.fieldProfiles:
+            logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
+            for field in profile.fieldProfiles:
+                if field.sampleValues:
+                    values_len = 0
+                    for value in field.sampleValues:
+                        if value:
+                            values_len += len(value)
+                    logger.debug(
+                        f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
+                    )
+                    if sample_fields_size + values_len > self.payload_constraint:
+                        field.sampleValues = []
+                        self.report.warning(
+                            title="Dataset profile truncated due to size constraint",
+                            message="Dataset profile contained too much data and would have caused ingestion to fail",
+                            context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
+                        )
+                    else:
+                        sample_fields_size += values_len
+                else:
+                    logger.debug(f"Field {field.fieldPath} has no sample values")
+
+    def ensure_schema_metadata_size(
+        self, dataset_urn: str, schema: SchemaMetadataClass
+    ) -> None:
+        """
+        This is a rather arbitrary approach to ensuring the schema metadata aspect does not exceed the allowed size;
+        it might be adjusted in the future.
+        """
+        total_fields_size = 0
+        logger.debug(f"Amount of schema fields: {len(schema.fields)}")
+        accepted_fields: List[SchemaFieldClass] = []
+        for field in schema.fields:
+            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
+            logger.debug(f"Field {field.fieldPath} takes total {field_size}")
+            if total_fields_size + field_size < self.payload_constraint:
+                accepted_fields.append(field)
+                total_fields_size += field_size
+            else:
+                self.report.warning(
+                    title="Schema truncated due to size constraint",
+                    message="Dataset schema contained too much data and would have caused ingestion to fail",
+                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
+                )
+
+        schema.fields = accepted_fields
+
+    def ensure_aspect_size(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        We have a hard limit of 16 MB on aspect size. Some aspects can exceed that value, causing an exception on the
+        GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
+        """
+        for wu in stream:
+            logger.debug(f"Ensuring size of workunit: {wu.id}")
+
+            if schema := wu.get_aspect_of_type(SchemaMetadataClass):
+                self.ensure_schema_metadata_size(wu.get_urn(), schema)
+            elif profile := wu.get_aspect_of_type(DatasetProfileClass):
+                self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            yield wu
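
The truncation logic above can also be exercised directly against a single profile aspect. A minimal sketch, assuming EnsureAspectSizeProcessor is imported from the new module; the tiny payload_constraint, field names, sample values, and URN are made up for illustration:

from datahub.ingestion.api.source import SourceReport
from datahub.metadata.schema_classes import DatasetFieldProfileClass, DatasetProfileClass

report = SourceReport()
# Hypothetical: a 100-byte budget so the second field's samples no longer fit.
processor = EnsureAspectSizeProcessor(report, payload_constraint=100)
profile = DatasetProfileClass(
    timestampMillis=0,
    fieldProfiles=[
        DatasetFieldProfileClass(fieldPath="col_a", sampleValues=["x" * 80]),
        DatasetFieldProfileClass(fieldPath="col_b", sampleValues=["y" * 80]),
    ],
)
processor.ensure_dataset_profile_size(
    "urn:li:dataset:(urn:li:dataPlatform:databricks,example.table,PROD)", profile
)
# col_a's samples fit within the budget; col_b's are dropped and a warning is recorded on the report.
assert profile.fieldProfiles[1].sampleValues == []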

metadata-ingestion/src/datahub/ingestion/source/unity/source.py (+4)

@@ -26,6 +26,9 @@
     gen_containers,
 )
 from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -260,6 +263,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
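
Registering the processor here works because a MetadataWorkUnitProcessor is, in essence, a callable that consumes a stream of work units and yields a (possibly modified) stream back, and the processors in this list are applied in order over everything the source emits. A simplified sketch of that chaining idea, not DataHub's actual pipeline code:

from functools import reduce
from typing import Callable, Iterable, List, TypeVar

T = TypeVar("T")


def apply_processors(
    stream: Iterable[T], processors: List[Callable[[Iterable[T]], Iterable[T]]]
) -> Iterable[T]:
    """Wrap the stream with each processor in turn, so later processors see earlier output."""
    return reduce(lambda acc, processor: processor(acc), processors, stream)


# Hypothetical usage mirroring the list above:
# processed = apply_processors(raw_workunits, [stale_entity_handler, size_processor.ensure_aspect_size])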
