4
4
import json
5
5
import logging
6
6
import os
7
+ from collections import defaultdict
8
+ from dataclasses import dataclass
7
9
from json .decoder import JSONDecodeError
8
10
from typing import (
9
11
TYPE_CHECKING ,
31
33
ConfigurationError ,
32
34
OperationalError ,
33
35
)
36
+ from datahub .emitter .aspect import JSON_CONTENT_TYPE
34
37
from datahub .emitter .generic_emitter import Emitter
35
38
from datahub .emitter .mcp import MetadataChangeProposalWrapper
36
39
from datahub .emitter .request_helper import make_curl_command
@@ -143,10 +146,31 @@ def build_session(self) -> requests.Session:
143
146
return session
144
147
145
148
149
@dataclass
class _Chunk:
    """Accumulates pre-serialized items that will be sent as one request body.

    ``total_bytes`` is the sum of the UTF-8 encoded sizes of the items only;
    the brackets and commas that ``join`` adds are not included in it.
    """

    # Serialized items, in insertion order.
    items: List[str]
    # Running sum of the UTF-8 encoded size of every entry in ``items``.
    total_bytes: int = 0

    def add_item(self, item: str) -> bool:
        """Append ``item`` and add its UTF-8 encoded size to ``total_bytes``.

        No size limit is enforced here: deciding when a chunk is full is left
        to the caller, so an item is accepted whether or not the chunk is
        empty.

        :param item: already-serialized item to store
        :return: always True
        """
        # Measure before mutating so a failed encode leaves the chunk untouched.
        item_bytes = len(item.encode())
        self.items.append(item)
        self.total_bytes += item_bytes
        return True

    @staticmethod
    def join(chunk: "_Chunk") -> str:
        """Return the chunk's items comma-joined and wrapped in ``[`` ``]``."""
        return "[" + ",".join(chunk.items) + "]"
167
+
168
+
146
169
class DataHubRestEmitter (Closeable , Emitter ):
147
170
_gms_server : str
148
171
_token : Optional [str ]
149
172
_session : requests .Session
173
+ _openapi_ingestion : bool
150
174
151
175
def __init__ (
152
176
self ,
@@ -162,6 +186,7 @@ def __init__(
162
186
ca_certificate_path : Optional [str ] = None ,
163
187
client_certificate_path : Optional [str ] = None ,
164
188
disable_ssl_verification : bool = False ,
189
+ openapi_ingestion : bool = False ,
165
190
):
166
191
if not gms_server :
167
192
raise ConfigurationError ("gms server is required" )
@@ -174,9 +199,13 @@ def __init__(
174
199
self ._gms_server = fixup_gms_url (gms_server )
175
200
self ._token = token
176
201
self .server_config : Dict [str , Any ] = {}
177
-
202
+ self . _openapi_ingestion = openapi_ingestion
178
203
self ._session = requests .Session ()
179
204
205
+ logger .debug (
206
+ f"Using { 'OpenAPI' if openapi_ingestion else 'Restli' } for ingestion."
207
+ )
208
+
180
209
headers = {
181
210
"X-RestLi-Protocol-Version" : "2.0.0" ,
182
211
"X-DataHub-Py-Cli-Version" : nice_version_name (),
@@ -264,6 +293,43 @@ def to_graph(self) -> "DataHubGraph":
264
293
265
294
return DataHubGraph .from_emitter (self )
266
295
296
def _to_openapi_request(
    self,
    mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
    async_flag: Optional[bool] = None,
    async_default: bool = False,
) -> Optional[Tuple[str, List[Dict[str, Any]]]]:
    """Convert a single MCP into an OpenAPI v3 entity request.

    :param mcp: the proposal to convert
    :param async_flag: explicit async mode; when None, ``async_default`` is used
    :param async_default: async mode applied when ``async_flag`` is None
    :return: a ``(url, payload)`` tuple where ``payload`` is a one-element
        list, or None when the proposal has no aspect or no aspect name
    """
    # Nothing to send without both an aspect and its name.
    if not (mcp.aspect and mcp.aspectName):
        return None

    resolved_async_flag = async_flag if async_flag is not None else async_default
    async_param = "true" if resolved_async_flag else "false"
    # The URL must not contain any whitespace between its components.
    url = f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}?async={async_param}"

    if isinstance(mcp, MetadataChangeProposalWrapper):
        aspect_value = pre_json_transform(mcp.to_obj(simplified_structure=True))[
            "aspect"
        ]["json"]
    else:
        obj = mcp.aspect.to_obj()
        # A JSON-typed aspect carries its content as a string under "value";
        # decode it so the request holds the structured object instead.
        if obj.get("value") and obj.get("contentType") == JSON_CONTENT_TYPE:
            obj = json.loads(obj["value"])
        aspect_value = pre_json_transform(obj)

    system_metadata = mcp.systemMetadata.to_obj() if mcp.systemMetadata else None
    return (
        url,
        [
            {
                "urn": mcp.entityUrn,
                mcp.aspectName: {
                    "value": aspect_value,
                    "systemMetadata": system_metadata,
                },
            }
        ],
    )
332
+
267
333
def emit (
268
334
self ,
269
335
item : Union [
@@ -317,18 +383,24 @@ def emit_mcp(
317
383
mcp : Union [MetadataChangeProposal , MetadataChangeProposalWrapper ],
318
384
async_flag : Optional [bool ] = None ,
319
385
) -> None :
320
- url = f"{ self ._gms_server } /aspects?action=ingestProposal"
321
386
ensure_has_system_metadata (mcp )
322
387
323
- mcp_obj = pre_json_transform (mcp .to_obj ())
324
- payload_dict = {"proposal" : mcp_obj }
388
+ if self ._openapi_ingestion :
389
+ request = self ._to_openapi_request (mcp , async_flag , async_default = False )
390
+ if request :
391
+ self ._emit_generic (request [0 ], payload = request [1 ])
392
+ else :
393
+ url = f"{ self ._gms_server } /aspects?action=ingestProposal"
325
394
326
- if async_flag is not None :
327
- payload_dict [ "async" ] = "true" if async_flag else "false"
395
+ mcp_obj = pre_json_transform ( mcp . to_obj ())
396
+ payload_dict = { "proposal" : mcp_obj }
328
397
329
- payload = json .dumps (payload_dict )
398
+ if async_flag is not None :
399
+ payload_dict ["async" ] = "true" if async_flag else "false"
330
400
331
- self ._emit_generic (url , payload )
401
+ payload = json .dumps (payload_dict )
402
+
403
+ self ._emit_generic (url , payload )
332
404
333
405
def emit_mcps (
334
406
self ,
@@ -337,10 +409,75 @@ def emit_mcps(
337
409
) -> int :
338
410
if _DATAHUB_EMITTER_TRACE :
339
411
logger .debug (f"Attempting to emit MCP batch of size { len (mcps )} " )
340
- url = f" { self . _gms_server } /aspects?action=ingestProposalBatch"
412
+
341
413
for mcp in mcps :
342
414
ensure_has_system_metadata (mcp )
343
415
416
+ if self ._openapi_ingestion :
417
+ return self ._emit_openapi_mcps (mcps , async_flag )
418
+ else :
419
+ return self ._emit_restli_mcps (mcps , async_flag )
420
+
421
def _emit_openapi_mcps(
    self,
    mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
    async_flag: Optional[bool] = None,
) -> int:
    """
    Emit MCPs through the OpenAPI endpoint in bounded batches.

    Requests are grouped by their target URL. Within one URL a new chunk is
    started when the current chunk is non-empty and either:
      * adding the item would push the chunk's tracked byte total past
        INGEST_MAX_PAYLOAD_BYTES, or
      * the chunk already holds BATCH_INGEST_MAX_PAYLOAD_LENGTH items.
    An empty chunk always accepts its first item, whatever its size. Each
    item is serialized with json.dumps exactly once and the resulting
    strings are joined into the request body.

    :param mcps: metadata change proposals to transmit
    :param async_flag: async mode; when None, _to_openapi_request is called
        with async_default=True
    :return: number of requests
    """
    # Every URL starts out with a single empty chunk.
    chunks_by_url: Dict[str, List[_Chunk]] = defaultdict(
        lambda: [_Chunk(items=[])]
    )

    for proposal in mcps:
        request = self._to_openapi_request(proposal, async_flag, async_default=True)
        if not request:
            continue

        url, body = request
        url_chunks = chunks_by_url[url]
        tail = url_chunks[-1]

        # Serialize once; the same string is measured here and stored below.
        encoded = json.dumps(body[0])
        encoded_size = len(encoded.encode())

        # NOTE: total_bytes counts item bytes only, not the brackets/commas
        # that _Chunk.join adds around them.
        tail_is_full = (
            tail.total_bytes + encoded_size > INGEST_MAX_PAYLOAD_BYTES
            or len(tail.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
        )
        if tail.items and tail_is_full:
            tail = _Chunk(items=[])
            url_chunks.append(tail)

        tail.add_item(encoded)

    request_count = 0
    for url, url_chunks in chunks_by_url.items():
        for chunk in url_chunks:
            self._emit_generic(url, payload=_Chunk.join(chunk))
            request_count += 1

    return request_count
473
+
474
+ def _emit_restli_mcps (
475
+ self ,
476
+ mcps : Sequence [Union [MetadataChangeProposal , MetadataChangeProposalWrapper ]],
477
+ async_flag : Optional [bool ] = None ,
478
+ ) -> int :
479
+ url = f"{ self ._gms_server } /aspects?action=ingestProposalBatch"
480
+
344
481
mcp_objs = [pre_json_transform (mcp .to_obj ()) for mcp in mcps ]
345
482
346
483
# As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
@@ -392,7 +529,10 @@ def emit_usage(self, usageStats: UsageAggregation) -> None:
392
529
payload = json .dumps (snapshot )
393
530
self ._emit_generic (url , payload )
394
531
395
- def _emit_generic (self , url : str , payload : str ) -> None :
532
+ def _emit_generic (self , url : str , payload : Union [str , Any ]) -> requests .Response :
533
+ if not isinstance (payload , str ):
534
+ payload = json .dumps (payload )
535
+
396
536
curl_command = make_curl_command (self ._session , "POST" , url , payload )
397
537
payload_size = len (payload )
398
538
if payload_size > INGEST_MAX_PAYLOAD_BYTES :
@@ -408,6 +548,7 @@ def _emit_generic(self, url: str, payload: str) -> None:
408
548
try :
409
549
response = self ._session .post (url , data = payload )
410
550
response .raise_for_status ()
551
+ return response
411
552
except HTTPError as e :
412
553
try :
413
554
info : Dict = response .json ()
0 commit comments