@@ -1,6 +1,6 @@
import logging
from dataclasses import dataclass, field
- from typing import Any, Counter, Dict, Iterable, List, Optional, Type, Union
+ from typing import Any, Counter, Dict, Iterable, List, Optional, Tuple, Type, Union

import boto3
import pydantic
@@ -61,6 +61,7 @@
PAGE_SIZE = 100
MAX_SCHEMA_SIZE = 300
MAX_PRIMARY_KEYS_SIZE = 100
+ FIELD_DELIMITER = "."

logger: logging.Logger = logging.getLogger(__name__)

@@ -285,13 +286,13 @@ def construct_schema_from_dynamodb(
        dynamodb_client: BaseClient,
        region: str,
        table_name: str,
-     ) -> Dict[str, SchemaDescription]:
+     ) -> Dict[Tuple[str, ...], SchemaDescription]:
        """
        This will use the dynamodb client to scan the given table to retrieve items with pagination,
        and construct the schema of this table by reading the attributes of the retrieved items
        """
        paginator = dynamodb_client.get_paginator("scan")
-         schema: Dict[str, SchemaDescription] = {}
+         schema: Dict[Tuple[str, ...], SchemaDescription] = {}
        """
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html#DynamoDB.Paginator.Scan
        Note that the behavior of the pagination does not align with the documentation according to https://stackoverflow.com/questions/39201093/how-to-use-boto3-pagination
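A minimal sketch of the pagination pattern this docstring describes, using a hypothetical region and table name (not the source's actual ingestion loop): boto3's "scan" paginator yields pages of items in DynamoDB's attribute-value format, and the caller accumulates the Items from each page.

```python
import boto3

# Hypothetical region/table, purely for illustration.
dynamodb_client = boto3.client("dynamodb", region_name="us-west-2")
paginator = dynamodb_client.get_paginator("scan")

items = []
for page in paginator.paginate(
    TableName="book_table",
    PaginationConfig={"MaxItems": 1000, "PageSize": 100},  # cf. PAGE_SIZE above
):
    items.extend(page["Items"])  # each item looks like {"attr_name": {"S": "value"}, ...}
```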
@@ -323,7 +324,7 @@ def include_table_item_to_schema(
        dynamodb_client: Any,
        region: str,
        table_name: str,
-         schema: Dict[str, SchemaDescription],
+         schema: Dict[Tuple[str, ...], SchemaDescription],
    ) -> None:
        """
        It will look up in the config include_table_item dict to see if "region.table_name" exists as key,
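For context, the include_table_item config mentioned in this docstring maps "region.table_name" keys to the primary keys of specific items to pull into the schema; the value shape below is an assumption for illustration and is not shown in this diff.

```python
# Hypothetical config snippet (shape assumed, not taken from this diff):
# keys are "<region>.<table_name>", values are primary keys of items to fetch,
# expressed in DynamoDB's attribute-value format.
include_table_item = {
    "us-west-2.book_table": [
        {"partition_key": {"S": "book-1"}, "sort_key": {"N": "5"}},
    ]
}
```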
@@ -358,7 +359,9 @@ def include_table_item_to_schema(
        self.construct_schema_from_items(items, schema)

    def construct_schema_from_items(
-         slef, items: List[Dict[str, Dict]], schema: Dict[str, SchemaDescription]
+         self,
+         items: List[Dict[str, Dict]],
+         schema: Dict[Tuple[str, ...], SchemaDescription],
    ) -> None:
        """
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html#DynamoDB.Client.scan
@@ -367,35 +370,58 @@ def construct_schema_from_items(
        we are writing our own construct schema method, take the attribute name as key and SchemaDescription as value
        """
        for document in items:
-             # the key is the attribute name and the value is a dict with only one entry,
-             # whose key is the data type and value is the data
-             for key, value in document.items():
-                 if value is not None:
-                     data_type = list(value.keys())[0]
-                     if key not in schema:
-                         schema[key] = {
-                             "types": Counter(data_type),
-                             "count": 1,
-                             # It seems we don't have collapsed field name so we are using attribute name here
-                             "delimited_name": key,
-                             "type": data_type,
-                             "nullable": False,
-                         }
-                     else:
-                         # update the type count
-                         schema[key]["types"].update({data_type: 1})
-                         schema[key]["count"] += 1
-                         # if we found an attribute name with different attribute type, we consider this attribute type as "mixed"
-                         field_types = schema[key]["types"]
-                         if len(field_types.keys()) > 1:
-                             schema[key]["type"] = "mixed"
+             self.append_schema(schema, document)
+ 
+     def append_schema(
+         self,
+         schema: Dict[Tuple[str, ...], SchemaDescription],
+         document: Dict[str, Dict],
+         parent_field_path: Tuple[str, ...] = (),
+     ) -> None:
+         # the key is the attribute name and the value is a dict with only one entry,
+         # whose key is the data type and value is the data and we will recursively expand
+         # map data type to get flattened field
+         for key, value in document.items():
+             if value is not None:
+                 data_type = list(value.keys())[0]
+                 attribute_value = value[data_type]
+                 current_field_path = parent_field_path + (key,)
+                 # Handle nested maps by recursive calls
+                 if data_type == "M":
+                     logger.debug(
+                         f"expanding nested fields for map, current_field_path: {current_field_path}"
+                     )
+                     self.append_schema(schema, attribute_value, current_field_path)
+ 
+                 if current_field_path not in schema:
+                     schema[current_field_path] = {
+                         "types": Counter({data_type: 1}),
+                         "count": 1,
+                         # It seems we don't have collapsed field name so we are using attribute name here
+                         "delimited_name": FIELD_DELIMITER.join(current_field_path),
+                         "type": data_type,
+                         "nullable": False,
+                     }
+                 else:
+                     schema[current_field_path]["types"].update({data_type: 1})
+                     schema[current_field_path]["count"] += 1
+                     # if we found an attribute name with different attribute type, we consider this attribute type as "mixed"
+                     if len(schema[current_field_path]["types"]) > 1:
+                         schema[current_field_path]["type"] = "mixed"
+                 schema[current_field_path]["nullable"] |= (
+                     attribute_value is None
+                 )  # Mark as nullable if null encountered
+                 types = schema[current_field_path]["types"]
+                 logger.debug(
+                     f"append schema with field_path: {current_field_path} and type: {types}"
+                 )

    def construct_schema_metadata(
        self,
        table_name: str,
        dataset_urn: str,
        dataset_properties: DatasetPropertiesClass,
-         schema: Dict[str, SchemaDescription],
+         schema: Dict[Tuple[str, ...], SchemaDescription],
        primary_key_dict: Dict[str, str],
    ) -> SchemaMetadata:
        """ "
@@ -407,20 +433,23 @@ def construct_schema_metadata(
        canonical_schema: List[SchemaField] = []
        schema_size = len(schema.values())
        table_fields = list(schema.values())
-
        if schema_size > MAX_SCHEMA_SIZE:
            # downsample the schema, using frequency as the sort key
            self.report.report_warning(
                key=dataset_urn,
                reason=f"Downsampling the table schema because MAX_SCHEMA_SIZE threshold is {MAX_SCHEMA_SIZE}",
            )
+
            # Add this information to the custom properties so user can know they are looking at down sampled schema
            dataset_properties.customProperties["schema.downsampled"] = "True"
            dataset_properties.customProperties["schema.totalFields"] = f"{schema_size}"
-         # append each schema field (sort so output is consistent)
+         # append each schema field, schema will be sorted by count descending and delimited_name ascending and sliced to only include MAX_SCHEMA_SIZE items
        for schema_field in sorted(
            table_fields,
-             key=lambda x: x["delimited_name"],
+             key=lambda x: (
+                 -x["count"],
+                 x["delimited_name"],
+             ),  # Negate `count` for descending order, `delimited_name` stays the same for ascending
        )[0:MAX_SCHEMA_SIZE]:
            field_path = schema_field["delimited_name"]
            native_data_type = self.get_native_type(schema_field["type"], table_name)
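A small, self-contained illustration of the new sort-and-slice behavior (hypothetical field descriptions, with MAX_SCHEMA_SIZE shrunk so the slice is visible): the most frequently seen fields come first, ties break alphabetically by delimited_name, and only the first MAX_SCHEMA_SIZE entries are kept.

```python
# Hypothetical, trimmed-down field descriptions.
table_fields = [
    {"delimited_name": "zeta", "count": 10},
    {"delimited_name": "alpha", "count": 3},
    {"delimited_name": "beta", "count": 10},
]
MAX_SCHEMA_SIZE = 2  # shrunk here so the downsampling is visible

top_fields = sorted(
    table_fields,
    key=lambda x: (-x["count"], x["delimited_name"]),  # count desc, then name asc
)[0:MAX_SCHEMA_SIZE]
# top_fields == [{"delimited_name": "beta", "count": 10}, {"delimited_name": "zeta", "count": 10}]
```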