 import logging
 import threading
 import uuid
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple

+from dateutil import parser as dateutil_parser
 from pyiceberg.catalog import Catalog
 from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
...
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
+    TimeStampClass,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -183,16 +185,9 @@ def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]:
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()

-        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
-            LOGGER.debug(f"Processing dataset for path {dataset_path}")
-            dataset_name = ".".join(dataset_path)
-            if not self.config.table_pattern.allowed(dataset_name):
-                # Dataset name is rejected by pattern, report as dropped.
-                self.report.report_dropped(dataset_name)
-                LOGGER.debug(
-                    f"Skipping table {dataset_name} due to not being allowed by the config pattern"
-                )
-                return
+        def _try_processing_dataset(
+            dataset_path: Tuple[str, ...], dataset_name: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
                     LOGGER.debug(
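The `hasattr(thread_local, "local_catalog")` check gives each worker thread its own catalog client rather than sharing one across threads. A minimal sketch of the pattern, with a hypothetical `make_catalog` factory standing in for the catalog construction the excerpt cuts off:

import threading

thread_local = threading.local()

def get_thread_catalog(make_catalog):
    # threading.local() attributes are per-thread: the first call on a given
    # thread builds a catalog, later calls on that thread reuse it, and
    # other threads never see it.
    if not hasattr(thread_local, "local_catalog"):
        thread_local.local_catalog = make_catalog()
    return thread_local.local_catalog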
@@ -248,10 +243,31 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
                 LOGGER.warning(
                     f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
                 )
+            except ValueError as e:
+                if "Could not initialize FileIO" not in str(e):
+                    raise
+                self.report.warning(
+                    "Could not initialize FileIO",
+                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                )
+
+        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+            try:
+                LOGGER.debug(f"Processing dataset for path {dataset_path}")
+                dataset_name = ".".join(dataset_path)
+                if not self.config.table_pattern.allowed(dataset_name):
+                    # Dataset name is rejected by pattern, report as dropped.
+                    self.report.report_dropped(dataset_name)
+                    LOGGER.debug(
+                        f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+                    )
+                    return
+
+                yield from _try_processing_dataset(dataset_path, dataset_name)
             except Exception as e:
                 self.report.report_failure(
                     "general",
-                    f"Failed to create workunit for dataset {dataset_name}: {e}",
+                    f"Failed to create workunit for dataset {dataset_path}: {e}",
                 )
                 LOGGER.exception(
                     f"Exception while processing table {dataset_path}, skipping it.",
@@ -288,6 +304,7 @@ def _create_iceberg_workunit(
         )

         # Dataset properties aspect.
+        additional_properties = {}
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
@@ -299,10 +316,27 @@ def _create_iceberg_workunit(
             custom_properties["manifest-list"] = (
                 table.current_snapshot().manifest_list
             )
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
         )
         dataset_snapshot.aspects.append(dataset_properties)
         # Dataset ownership aspect.
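Iceberg stores the `created-at` table property as an ISO-8601 string, while the `created`/`lastModified` fields of `DatasetPropertiesClass` take a `TimeStampClass` built from epoch milliseconds, hence the `dateutil_parser.isoparse` round-trip above. A self-contained sketch of just that conversion, using an example timestamp value:

from dateutil import parser as dateutil_parser

# Example ISO-8601 value, in the shape Iceberg writes under "created-at".
created_at = "2024-05-01T12:30:45.123+00:00"

dt = dateutil_parser.isoparse(created_at)  # timezone-aware datetime
epoch_millis = int(dt.timestamp() * 1000)  # what TimeStampClass expects
print(epoch_millis)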