-
Notifications
You must be signed in to change notification settings - Fork 3.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(ingest/snowflake): Create all structured property templates before assignation #12469
Changes from 9 commits
b03d6f0
74603e2
180e3e4
3392fac
006a9c8
5900271
5c477b2
d3b679e
5c4d7ad
0296c03
3abcf0c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -285,6 +285,22 @@ def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]: | |
|
||
return secure_view_definitions | ||
|
||
def get_all_tags(self) -> List[SnowflakeTag]:
    """Return every tag reported by the ``SnowflakeQuery.get_all_tags()`` query.

    Each result row is mapped to a ``SnowflakeTag`` built from its
    ``TAG_DATABASE``, ``TAG_SCHEMA`` and ``TAG_NAME`` columns. The tag
    ``value`` is always set to the empty string; only the database, schema
    and name columns of each row are read.

    Returns:
        A list with one ``SnowflakeTag`` per row, in the order the rows are
        produced by the query cursor.
    """
    cur = self.connection.query(
        SnowflakeQuery.get_all_tags(),
    )

    return [
        SnowflakeTag(
            database=tag["TAG_DATABASE"],
            schema=tag["TAG_SCHEMA"],
            name=tag["TAG_NAME"],
            value="",
        )
        for tag in cur
    ]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This could be written more succinctly as a list comprehension. |
||
|
||
@serialized_lru_cache(maxsize=1) | ||
def get_tables_for_database( | ||
self, db_name: str | ||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -1,6 +1,9 @@ | ||||||||
import logging | ||||||||
from typing import Dict, List, Optional | ||||||||
from typing import Dict, Iterable, List, Optional | ||||||||
|
||||||||
from datahub.emitter.mce_builder import get_sys_time | ||||||||
from datahub.emitter.mcp import MetadataChangeProposalWrapper | ||||||||
from datahub.ingestion.api.workunit import MetadataWorkUnit | ||||||||
from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain | ||||||||
from datahub.ingestion.source.snowflake.snowflake_config import ( | ||||||||
SnowflakeV2Config, | ||||||||
|
@@ -12,7 +15,22 @@ | |||||||
SnowflakeTag, | ||||||||
_SnowflakeTagCache, | ||||||||
) | ||||||||
from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin | ||||||||
from datahub.ingestion.source.snowflake.snowflake_utils import ( | ||||||||
SnowflakeCommonMixin, | ||||||||
SnowflakeIdentifierBuilder, | ||||||||
) | ||||||||
from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp | ||||||||
from datahub.metadata.com.linkedin.pegasus2avro.structured import ( | ||||||||
StructuredPropertyDefinition, | ||||||||
) | ||||||||
from datahub.metadata.urns import ( | ||||||||
ContainerUrn, | ||||||||
DatasetUrn, | ||||||||
DataTypeUrn, | ||||||||
EntityTypeUrn, | ||||||||
SchemaFieldUrn, | ||||||||
StructuredPropertyUrn, | ||||||||
) | ||||||||
|
||||||||
logger: logging.Logger = logging.getLogger(__name__) | ||||||||
|
||||||||
|
@@ -23,11 +41,12 @@ | |||||||
config: SnowflakeV2Config, | ||||||||
data_dictionary: SnowflakeDataDictionary, | ||||||||
report: SnowflakeV2Report, | ||||||||
snowflake_identifiers: SnowflakeIdentifierBuilder, | ||||||||
) -> None: | ||||||||
self.config = config | ||||||||
self.data_dictionary = data_dictionary | ||||||||
self.report = report | ||||||||
|
||||||||
self.snowflake_identifiers = snowflake_identifiers | ||||||||
self.tag_cache: Dict[str, _SnowflakeTagCache] = {} | ||||||||
|
||||||||
def _get_tags_on_object_without_propagation( | ||||||||
|
@@ -59,6 +78,46 @@ | |||||||
raise ValueError(f"Unknown domain {domain}") | ||||||||
return tags | ||||||||
|
||||||||
def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
    """Yield structured-property definition work units for allowed tags.

    Nothing is yielded, and the tag list is not fetched, unless
    ``config.extract_tags_as_structured_properties`` is truthy. Otherwise
    every tag returned by ``data_dictionary.get_all_tags()`` whose
    ``tag_identifier()`` is allowed by ``config.tag_pattern`` is passed to
    ``gen_tag_as_structured_property_workunits`` and the resulting work
    units are yielded. ``report.num_structured_property_templates_created``
    is incremented once per allowed tag.

    Yields:
        The work units produced for each allowed tag, in tag order.
    """
    # The flag does not change per tag, so check it once up front and skip
    # the tag query entirely when the feature is disabled.
    if not self.config.extract_tags_as_structured_properties:
        return

    for tag in self.data_dictionary.get_all_tags():
        # Only the tag pattern is applied here. Database/schema patterns are
        # deliberately not used, because tags from other databases/schemas
        # can be applied to the objects being ingested.
        if not self.config.tag_pattern.allowed(tag.tag_identifier()):
            continue

        self.report.num_structured_property_templates_created += 1
        yield from self.gen_tag_as_structured_property_workunits(tag)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
|
||||||||
def gen_tag_as_structured_property_workunits(
    self, tag: SnowflakeTag
) -> Iterable[MetadataWorkUnit]:
    """Yield the work unit that defines a structured property for ``tag``.

    The property's qualified name is the tag's
    ``structured_property_identifier()`` passed through
    ``snowflake_identifiers.snowflake_identifier``; the same value is used
    to build the ``StructuredPropertyUrn``. The definition uses the tag name
    as display name, the ``datahub.string`` data type, and lists the
    container, dataset and schema-field entity types.

    Args:
        tag: The Snowflake tag to describe as a structured property.

    Yields:
        A single work unit wrapping the ``StructuredPropertyDefinition``
        aspect for the property URN.
    """
    qualified_name = self.snowflake_identifiers.snowflake_identifier(
        tag.structured_property_identifier()
    )
    property_urn = StructuredPropertyUrn(qualified_name).urn()

    string_type_urn = DataTypeUrn("datahub.string").urn()
    container_type_urn = EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn()
    dataset_type_urn = EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn()
    schema_field_type_urn = EntityTypeUrn(
        f"datahub.{SchemaFieldUrn.ENTITY_TYPE}"
    ).urn()

    # The definition is stamped with the current system time and a fixed actor.
    modified_stamp = AuditStamp(time=get_sys_time(), actor="urn:li:corpuser:datahub")

    definition = StructuredPropertyDefinition(
        qualifiedName=qualified_name,
        displayName=tag.name,
        valueType=string_type_urn,
        entityTypes=[
            container_type_urn,
            dataset_type_urn,
            schema_field_type_urn,
        ],
        lastModified=modified_stamp,
    )

    proposal = MetadataChangeProposalWrapper(
        entityUrn=property_urn,
        aspect=definition,
    )
    yield proposal.as_workunit()
|
||||||||
def _get_tags_on_object_with_propagation( | ||||||||
self, | ||||||||
domain: str, | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -219,6 +219,7 @@ def test_snowflake_tags_as_structured_properties( | |
include_column_lineage=False, | ||
include_usage_stats=False, | ||
include_operational_stats=False, | ||
structured_properties_template_cache_invalidation_interval=1, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For the test I'd probably set this to 0, unless a non-zero value is actually needed, just to avoid unnecessary sleeps. |
||
), | ||
), | ||
sink=DynamicTypedConfig( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe hide this one from docs, feels more like an implementation detail