1
1
import itertools
2
2
import logging
3
+ import time
3
4
from typing import Dict , Iterable , List , Optional , Union
4
5
5
6
from datahub .configuration .pattern_utils import is_schema_allowed
6
7
from datahub .emitter .mce_builder import (
7
- get_sys_time ,
8
8
make_data_platform_urn ,
9
9
make_dataset_urn_with_platform_instance ,
10
10
make_schema_field_urn ,
74
74
PROFILING ,
75
75
)
76
76
from datahub .metadata .com .linkedin .pegasus2avro .common import (
77
- AuditStamp ,
78
77
GlobalTags ,
79
78
Status ,
80
79
SubTypes ,
101
100
StringType ,
102
101
TimeType ,
103
102
)
104
- from datahub .metadata .com .linkedin .pegasus2avro .structured import (
105
- StructuredPropertyDefinition ,
106
- )
107
103
from datahub .metadata .com .linkedin .pegasus2avro .tag import TagProperties
108
104
from datahub .metadata .urns import (
109
- ContainerUrn ,
110
- DatasetUrn ,
111
- DataTypeUrn ,
112
- EntityTypeUrn ,
113
105
SchemaFieldUrn ,
114
106
StructuredPropertyUrn ,
115
107
)
@@ -191,7 +183,7 @@ def __init__(
191
183
self .domain_registry : Optional [DomainRegistry ] = domain_registry
192
184
self .classification_handler = ClassificationHandler (self .config , self .report )
193
185
self .tag_extractor = SnowflakeTagExtractor (
194
- config , self .data_dictionary , self .report
186
+ config , self .data_dictionary , self .report , identifiers
195
187
)
196
188
self .profiler : Optional [SnowflakeProfiler ] = profiler
197
189
self .snowsight_url_builder : Optional [SnowsightUrlBuilder ] = (
@@ -217,6 +209,16 @@ def snowflake_identifier(self, identifier: str) -> str:
217
209
return self .identifiers .snowflake_identifier (identifier )
218
210
219
211
def get_workunits_internal (self ) -> Iterable [MetadataWorkUnit ]:
212
+ if self .config .extract_tags_as_structured_properties :
213
+ logger .info ("Creating structured property templates for tags" )
214
+ yield from self .tag_extractor .create_structured_property_templates ()
215
+ # We have to wait until the cache is invalidated to make sure the structured property template is available
216
+ logger .info (
217
+ f"Waiting for { self .config .structured_properties_template_cache_invalidation_interval } seconds for structured properties cache to invalidate"
218
+ )
219
+ time .sleep (
220
+ self .config .structured_properties_template_cache_invalidation_interval
221
+ )
220
222
self .databases = []
221
223
for database in self .get_databases () or []:
222
224
self .report .report_entity_scanned (database .name , "database" )
@@ -698,6 +700,7 @@ def _process_view(
698
700
699
701
def _process_tag (self , tag : SnowflakeTag ) -> Iterable [MetadataWorkUnit ]:
700
702
use_sp = self .config .extract_tags_as_structured_properties
703
+
701
704
identifier = (
702
705
self .snowflake_identifier (tag .structured_property_identifier ())
703
706
if use_sp
@@ -708,10 +711,11 @@ def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
708
711
return
709
712
710
713
self .report .report_tag_processed (identifier )
714
+
711
715
if use_sp :
712
- yield from self . gen_tag_as_structured_property_workunits ( tag )
713
- else :
714
- yield from self .gen_tag_workunits (tag )
716
+ return
717
+
718
+ yield from self .gen_tag_workunits (tag )
715
719
716
720
def _format_tags_as_structured_properties (
717
721
self , tags : List [SnowflakeTag ]
@@ -732,6 +736,7 @@ def gen_dataset_workunits(
732
736
if table .tags :
733
737
for tag in table .tags :
734
738
yield from self ._process_tag (tag )
739
+
735
740
for column_name in table .column_tags :
736
741
for tag in table .column_tags [column_name ]:
737
742
yield from self ._process_tag (tag )
@@ -903,29 +908,6 @@ def gen_tag_workunits(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
903
908
entityUrn = tag_urn , aspect = tag_properties_aspect
904
909
).as_workunit ()
905
910
906
- def gen_tag_as_structured_property_workunits (
907
- self , tag : SnowflakeTag
908
- ) -> Iterable [MetadataWorkUnit ]:
909
- identifier = self .snowflake_identifier (tag .structured_property_identifier ())
910
- urn = StructuredPropertyUrn (identifier ).urn ()
911
- aspect = StructuredPropertyDefinition (
912
- qualifiedName = identifier ,
913
- displayName = tag .name ,
914
- valueType = DataTypeUrn ("datahub.string" ).urn (),
915
- entityTypes = [
916
- EntityTypeUrn (f"datahub.{ ContainerUrn .ENTITY_TYPE } " ).urn (),
917
- EntityTypeUrn (f"datahub.{ DatasetUrn .ENTITY_TYPE } " ).urn (),
918
- EntityTypeUrn (f"datahub.{ SchemaFieldUrn .ENTITY_TYPE } " ).urn (),
919
- ],
920
- lastModified = AuditStamp (
921
- time = get_sys_time (), actor = "urn:li:corpuser:datahub"
922
- ),
923
- )
924
- yield MetadataChangeProposalWrapper (
925
- entityUrn = urn ,
926
- aspect = aspect ,
927
- ).as_workunit ()
928
-
929
911
def gen_column_tags_as_structured_properties (
930
912
self , dataset_urn : str , table : Union [SnowflakeTable , SnowflakeView ]
931
913
) -> Iterable [MetadataWorkUnit ]:
0 commit comments