10
10
NoSuchIcebergTableError ,
11
11
NoSuchNamespaceError ,
12
12
NoSuchPropertyException ,
13
+ NoSuchTableError ,
14
+ ServerError ,
13
15
)
14
16
from pyiceberg .io .pyarrow import PyArrowFileIO
15
17
from pyiceberg .partitioning import PartitionSpec
39
41
UUIDType ,
40
42
)
41
43
44
+ from datahub .configuration .common import AllowDenyPattern
42
45
from datahub .ingestion .api .common import PipelineContext
43
46
from datahub .ingestion .api .workunit import MetadataWorkUnit
44
47
from datahub .ingestion .source .iceberg .iceberg import (
62
65
)
63
66
64
67
65
def with_iceberg_source(processing_threads: int = 1, **kwargs: Any) -> IcebergSource:
    """Build an ``IcebergSource`` for tests, configured with a REST catalog named "test".

    Args:
        processing_threads: Passed through to ``IcebergSourceConfig``.
        **kwargs: Additional ``IcebergSourceConfig`` fields, forwarded unchanged.

    Returns:
        A source created with the run id ``"iceberg-source-test"``.
    """
    pipeline_context = PipelineContext(run_id="iceberg-source-test")
    source_config = IcebergSourceConfig(
        catalog={"test": {"type": "rest"}},
        processing_threads=processing_threads,
        **kwargs,
    )
    return IcebergSource(ctx=pipeline_context, config=source_config)
73
76
@@ -542,27 +545,27 @@ def __init__(self, tables: Dict[str, Dict[str, Callable[[], Table]]]):
542
545
"""
543
546
self .tables = tables
544
547
545
def list_namespaces(self) -> Iterable[Tuple[str]]:
    """Return each configured namespace as a one-element tuple.

    The namespaces are the top-level keys of ``self.tables``, yielded in
    the dictionary's iteration order.
    """
    # Iterating the dict yields its keys directly; a single comprehension
    # replaces the previous build-then-unpack double list construction.
    return [(namespace,) for namespace in self.tables]
547
550
548
551
def list_tables(self, namespace: Tuple[str]) -> Iterable[Tuple[str, str]]:
    """Return ``(namespace, table)`` pairs for every table in a namespace.

    Args:
        namespace: Namespace identifier as a tuple; only its first element
            is used to look up ``self.tables``.

    Raises:
        KeyError: If the namespace name is not a key of ``self.tables``.
    """
    namespace_name = namespace[0]
    return [(namespace_name, table) for table in self.tables[namespace_name]]
550
553
551
554
def load_table(self, dataset_path: Tuple[str, str]) -> Table:
    """Invoke and return the result of the table factory stored for a dataset path.

    Args:
        dataset_path: ``(namespace, table)``; element 0 selects the namespace
            in ``self.tables`` and element 1 selects the factory within it.
    """
    namespace_tables = self.tables[dataset_path[0]]
    table_factory = namespace_tables[dataset_path[1]]
    # Entries are zero-argument callables, so any exception a factory
    # raises propagates to the caller from here.
    return table_factory()
553
556
554
557
555
558
class MockCatalogExceptionListingTables(MockCatalog):
    """Mock catalog whose ``list_tables`` raises for two sentinel namespaces."""

    def list_tables(self, namespace: Tuple[str]) -> Iterable[Tuple[str, str]]:
        """List tables, raising for the sentinel namespaces.

        Args:
            namespace: Namespace identifier as a one-element tuple.

        Raises:
            NoSuchNamespaceError: If ``namespace`` is ``("no_such_namespace",)``.
            Exception: If ``namespace`` is ``("generic_exception",)``.
        """
        # The comparisons are against tuples, so the parameter is annotated
        # as a tuple rather than a plain string.
        if namespace == ("no_such_namespace",):
            raise NoSuchNamespaceError()
        if namespace == ("generic_exception",):
            raise Exception()
        return super().list_tables(namespace)
562
565
563
566
564
567
class MockCatalogExceptionListingNamespaces(MockCatalog):
    """Mock catalog whose namespace listing always fails."""

    def list_namespaces(self) -> Iterable[Tuple[str]]:
        """Unconditionally raise a bare ``Exception``; never returns."""
        raise Exception()
567
570
568
571
@@ -814,15 +817,157 @@ def test_proper_run_with_multiple_namespaces() -> None:
814
817
)
815
818
816
819
820
def test_filtering() -> None:
    """Check that namespace and table patterns limit which tables are ingested.

    The source allows only ``namespace1`` and denies table names matching
    ``.*abcd.*``. The test expects exactly two work units, for
    ``namespace1.table_xyz`` and ``namespace1.JKLtable``.
    """

    def _table_factory(namespace: str, name: str) -> Callable[[], Table]:
        # Identifier and location are derived from the same (namespace, name)
        # pair, so a fixture cannot carry another table's identifier.
        location = f"s3://abcdefg/{namespace}/{name}"
        return lambda: Table(
            identifier=(namespace, name),
            metadata=TableMetadataV2(
                partition_specs=[PartitionSpec(spec_id=0)],
                location=location,
                last_column_id=0,
                schemas=[Schema(schema_id=0)],
            ),
            metadata_location=location,
            io=PyArrowFileIO(),
            catalog=None,
        )

    source = with_iceberg_source(
        processing_threads=1,
        table_pattern=AllowDenyPattern(deny=[".*abcd.*"]),
        namespace_pattern=AllowDenyPattern(allow=["namespace1"]),
    )
    tables_by_namespace = {
        "namespace1": ["table_xyz", "JKLtable", "table_abcd", "aaabcd"],
        "namespace2": ["foo", "bar"],
        "namespace3": ["sales", "products"],
    }
    mock_catalog = MockCatalog(
        {
            namespace: {name: _table_factory(namespace, name) for name in names}
            for namespace, names in tables_by_namespace.items()
        }
    )
    with patch(
        "datahub.ingestion.source.iceberg.iceberg.IcebergSourceConfig.get_catalog"
    ) as get_catalog:
        get_catalog.return_value = mock_catalog
        wu: List[MetadataWorkUnit] = [*source.get_workunits_internal()]
        assert len(wu) == 2
        urns = []
        for unit in wu:
            assert isinstance(unit.metadata, MetadataChangeEvent)
            assert isinstance(unit.metadata.proposedSnapshot, DatasetSnapshotClass)
            urns.append(unit.metadata.proposedSnapshot.urn)
        TestCase().assertCountEqual(
            urns,
            [
                "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespace1.table_xyz,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespace1.JKLtable,PROD)",
            ],
        )
        assert source.report.tables_scanned == 2
951
+
952
+
817
953
def test_handle_expected_exceptions () -> None :
818
954
source = with_iceberg_source (processing_threads = 3 )
819
955
820
956
def _raise_no_such_property_exception ():
821
957
raise NoSuchPropertyException ()
822
958
823
- def _raise_no_such_table_exception ():
959
+ def _raise_no_such_iceberg_table_exception ():
824
960
raise NoSuchIcebergTableError ()
825
961
962
+ def _raise_file_not_found_error ():
963
+ raise FileNotFoundError ()
964
+
965
+ def _raise_no_such_table_exception ():
966
+ raise NoSuchTableError ()
967
+
968
+ def _raise_server_error ():
969
+ raise ServerError ()
970
+
826
971
mock_catalog = MockCatalog (
827
972
{
828
973
"namespaceA" : {
@@ -876,6 +1021,9 @@ def _raise_no_such_table_exception():
876
1021
),
877
1022
"table5" : _raise_no_such_property_exception ,
878
1023
"table6" : _raise_no_such_table_exception ,
1024
+ "table7" : _raise_file_not_found_error ,
1025
+ "table8" : _raise_no_such_iceberg_table_exception ,
1026
+ "table9" : _raise_server_error ,
879
1027
}
880
1028
}
881
1029
)
@@ -899,7 +1047,7 @@ def _raise_no_such_table_exception():
899
1047
"urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceA.table4,PROD)" ,
900
1048
],
901
1049
)
902
- assert source .report .warnings .total_elements == 2
1050
+ assert source .report .warnings .total_elements == 5
903
1051
assert source .report .failures .total_elements == 0
904
1052
assert source .report .tables_scanned == 4
905
1053
0 commit comments