import json
import logging
from typing import Iterable, List

from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
from datahub.emitter.serialization_helper import pre_json_transform
from datahub.ingestion.api.source import SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import (
    DatasetProfileClass,
    SchemaFieldClass,
    SchemaMetadataClass,
)

logger = logging.getLogger(__name__)


class EnsureAspectSizeProcessor:
    def __init__(
        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
    ):
        self.report = report
        self.payload_constraint = payload_constraint

    def ensure_dataset_profile_size(
        self, dataset_urn: str, profile: DatasetProfileClass
    ) -> None:
        """
        This is a fairly arbitrary approach to ensuring the dataset profile aspect does
        not exceed the allowed size; it might be adjusted in the future.
        """
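        # Walk the field profiles in order, accumulating the size of their sample values.
        # If adding a field's sample values would push the running total past the payload
        # limit, that field's sample values are dropped (the total only grows for fields
        # that are kept) instead of failing the whole ingestion.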
        sample_fields_size = 0
        if profile.fieldProfiles:
            logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
            for field in profile.fieldProfiles:
                if field.sampleValues:
                    values_len = 0
                    for value in field.sampleValues:
                        if value:
                            values_len += len(value)
                    logger.debug(
                        f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
                    )
                    if sample_fields_size + values_len > self.payload_constraint:
                        field.sampleValues = []
                        self.report.warning(
                            title="Dataset profile truncated due to size constraint",
                            message="Dataset profile contained too much data and would have caused ingestion to fail",
                            context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
                        )
                    else:
                        sample_fields_size += values_len
                else:
                    logger.debug(f"Field {field.fieldPath} has no sample values")

    def ensure_schema_metadata_size(
        self, dataset_urn: str, schema: SchemaMetadataClass
    ) -> None:
        """
        This is a fairly arbitrary approach to ensuring the schema metadata aspect does
        not exceed the allowed size; it might be adjusted in the future.
        """
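        # Keep schema fields in order while their cumulative serialized size stays under
        # the payload limit; any field that would push the total past the limit is dropped
        # and a warning is reported, rather than failing the whole ingestion.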
        total_fields_size = 0
        logger.debug(f"Number of schema fields: {len(schema.fields)}")
        accepted_fields: List[SchemaFieldClass] = []
        for field in schema.fields:
            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
            logger.debug(f"Field {field.fieldPath} takes total {field_size}")
            if total_fields_size + field_size < self.payload_constraint:
                accepted_fields.append(field)
                total_fields_size += field_size
            else:
                self.report.warning(
                    title="Schema truncated due to size constraint",
                    message="Dataset schema contained too much data and would have caused ingestion to fail",
                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
                )

        schema.fields = accepted_fields

    def ensure_aspect_size(
        self,
        stream: Iterable[MetadataWorkUnit],
    ) -> Iterable[MetadataWorkUnit]:
        """
        There is a hard limit of 16 MB on aspect size. Some aspects can exceed that value,
        causing an exception on the GMS side and failure of the entire ingestion.
        This processor will attempt to trim suspected aspects.
        """
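        # Inspect each workunit for aspects known to grow large (schema metadata and
        # dataset profiles), trim them in place if needed, and pass the workunit through.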
        for wu in stream:
            logger.debug(f"Ensuring size of workunit: {wu.id}")

            if schema := wu.get_aspect_of_type(SchemaMetadataClass):
                self.ensure_schema_metadata_size(wu.get_urn(), schema)
            elif profile := wu.get_aspect_of_type(DatasetProfileClass):
                self.ensure_dataset_profile_size(wu.get_urn(), profile)
            yield wu
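

# Minimal usage sketch (an illustration, not part of the original module): wrap a
# source's workunit stream with `ensure_aspect_size` so oversized aspects are trimmed
# before they reach the sink. `source_workunits` below is a hypothetical iterable of
# MetadataWorkUnit objects produced by a source.
#
#   report = SourceReport()
#   processor = EnsureAspectSizeProcessor(report)
#   for wu in processor.ensure_aspect_size(source_workunits):
#       ...  # forward the (possibly trimmed) workunit onward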