Skip to content

Commit 943edc3

Browse files
feat(ingest/mongodb): improve sorting when downsampling collection schema (#9856)
1 parent: fbc291c · commit: 943edc3

File tree

3 files changed

+722
-7
lines changed

3 files changed

+722
-7
lines changed

metadata-ingestion/src/datahub/ingestion/source/mongodb.py

+6 -7
Original file line number | Diff line number | Diff line change
@@ -419,11 +419,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
419419
key=dataset_urn,
420420
reason=f"Downsampling the collection schema because it has {collection_schema_size} fields. Threshold is {max_schema_size}",
421421
)
422-
collection_fields = sorted(
423-
collection_schema.values(),
424-
key=lambda x: (x["count"], x["delimited_name"]),
425-
reverse=True,
426-
)[0:max_schema_size]
427422
# Add this information to the custom properties so user can know they are looking at downsampled schema
428423
dataset_properties.customProperties[
429424
"schema.downsampled"
@@ -437,8 +432,12 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
437432
)
438433
# append each schema field (sort so output is consistent)
439434
for schema_field in sorted(
440-
collection_fields, key=lambda x: x["delimited_name"]
441-
):
435+
collection_fields,
436+
key=lambda x: (
437+
-x["count"],
438+
x["delimited_name"],
439+
), # Negate `count` for descending order, `delimited_name` stays the same for ascending
440+
)[0:max_schema_size]:
442441
field = SchemaField(
443442
fieldPath=schema_field["delimited_name"],
444443
nativeDataType=self.get_pymongo_type_string(

0 commit comments

Comments
 (0)