24
24
make_dataset_urn_with_platform_instance ,
25
25
make_domain_urn ,
26
26
make_user_urn ,
27
+ make_term_urn ,
27
28
)
29
+ from datahub .emitter .mcp import MetadataChangeProposalWrapper
28
30
from datahub .emitter .mcp_builder import add_domain_to_entity_wu
31
+ from datahub .emitter .rest_emitter import DatahubRestEmitter
29
32
from datahub .ingestion .api .common import PipelineContext
30
33
from datahub .ingestion .api .decorators import (
31
34
SourceCapability ,
37
40
)
38
41
from datahub .ingestion .api .source import MetadataWorkUnitProcessor
39
42
from datahub .ingestion .api .workunit import MetadataWorkUnit
43
+ from datahub .ingestion .graph .client import DatahubClientConfig , DataHubGraph
40
44
from datahub .ingestion .source .sql .sql_types import resolve_sql_type
41
45
from datahub .ingestion .source .state .stale_entity_removal_handler import (
42
46
StaleEntityRemovalHandler ,
47
51
StatefulIngestionConfigBase ,
48
52
StatefulIngestionSourceBase ,
49
53
)
54
+ from datahub .metadata ._schema_classes import (
55
+ AuditStampClass ,
56
+ GlossaryTermAssociationClass ,
57
+ GlossaryTermInfoClass ,
58
+ GlossaryTermsClass ,
59
+ )
50
60
from datahub .metadata .com .linkedin .pegasus2avro .common import (
51
61
ChangeAuditStamps ,
52
62
Status ,
88
98
89
99
PAGE_SIZE = 25
90
100
91
-
92
101
chart_type_from_viz_type = {
93
102
"line" : ChartTypeClass .LINE ,
94
103
"big_number" : ChartTypeClass .LINE ,
105
114
"box_plot" : ChartTypeClass .BAR ,
106
115
}
107
116
108
-
109
117
platform_without_databases = ["druid" ]
110
118
111
119
@@ -259,6 +267,7 @@ def __init__(self, ctx: PipelineContext, config: SupersetConfig):
259
267
cached_domains = [domain_id for domain_id in self .config .domain ],
260
268
graph = self .ctx .graph ,
261
269
)
270
+ self .sink_config = ctx .pipeline_config .sink .config
262
271
self .session = self .login ()
263
272
self .owner_info = self .parse_owner_info ()
264
273
@@ -680,6 +689,72 @@ def gen_dataset_urn(self, datahub_dataset_name: str) -> str:
680
689
env = self .config .env ,
681
690
)
682
691
692
def check_if_term_exists(self, term_urn: str) -> bool:
    """Return True if a glossary term with ``term_urn`` already exists in DataHub.

    Queries the GMS server configured in the pipeline's sink for the
    ``glossaryTermInfo`` aspect of the term; a present (truthy) aspect
    means the term exists.

    :param term_urn: glossary term URN to look up.
    :return: True when the term's ``glossaryTermInfo`` aspect is present.
    """
    # Reuse one graph client across calls — this method runs once per
    # metric per dataset, and the original built a fresh DataHubGraph
    # (and connection config) on every invocation.
    graph = getattr(self, "_glossary_graph", None)
    if graph is None:
        graph = DataHubGraph(
            DatahubClientConfig(
                server=self.sink_config.get("server", ""),
                token=self.sink_config.get("token", ""),
            )
        )
        self._glossary_graph = graph

    # Query only the aspect we need from the entity.
    result = graph.get_entity_semityped(
        entity_urn=term_urn,
        aspects=["glossaryTermInfo"],
    )
    return bool(result.get("glossaryTermInfo"))
708
+
709
def parse_glossary_terms_from_metrics(
    self, metrics, last_modified
) -> GlossaryTermsClass:
    """Convert Superset dataset metrics into a DataHub ``GlossaryTermsClass``.

    For each metric, a glossary term URN is derived from the metric name.
    Terms that already exist in DataHub are simply referenced; missing terms
    are created via the REST emitter (definition built from the metric's
    description, SQL expression, and certification details), then referenced.

    :param metrics: iterable of Superset metric dicts (keys ``metric_name``,
        ``expression``, ``description``, ``extra``).
    :param last_modified: AuditStampClass used as the terms' audit stamp.
    :return: GlossaryTermsClass referencing every metric's term.
    """
    # Build the emitter once, up front — the original constructed a new
    # DatahubRestEmitter inside the loop for every metric that had to be
    # created, opening a fresh client each time.
    rest_emitter = DatahubRestEmitter(
        gms_server=self.sink_config.get("server", ""),
        token=self.sink_config.get("token", ""),
    )

    glossary_term_urns = []
    for metric in metrics:
        expression = metric.get("expression", "")
        certification_details = metric.get("extra", "")
        metric_name = metric.get("metric_name", "")
        description = metric.get("description", "")
        term_urn = make_term_urn(metric_name)

        if self.check_if_term_exists(term_urn):
            # Lazy %-formatting: only rendered if INFO is enabled.
            logger.info("Term %s already exists", term_urn)
            glossary_term_urns.append(GlossaryTermAssociationClass(urn=term_urn))
            continue

        term_properties_aspect = GlossaryTermInfoClass(
            name=metric_name,
            definition=f"Description: {description} \nSql Expression: {expression} \nCertification details: {certification_details}",
            termSource="",
        )

        event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
            entityUrn=term_urn,
            aspect=term_properties_aspect,
        )
        rest_emitter.emit(event)
        logger.info("Created Glossary term %s", term_urn)
        glossary_term_urns.append(GlossaryTermAssociationClass(urn=term_urn))

    return GlossaryTermsClass(terms=glossary_term_urns, auditStamp=last_modified)
746
+
747
+ def _is_certified_metric (self , response_result : dict ) -> bool :
748
+ # We only want to ingest certified metrics for physical preset dataset
749
+ metrics = response_result .get ("metrics" , {})
750
+ extra = response_result .get ("extra" , {})
751
+ # kind = response_result.get("kind")
752
+ if (metrics and extra and "This table is produced by dbt" in extra ):
753
+ # and kind == "physical"):
754
+ return True
755
+ else :
756
+ return False
757
+
683
758
def construct_dataset_from_dataset_data (
684
759
self , dataset_data : dict
685
760
) -> DatasetSnapshot :
@@ -690,6 +765,12 @@ def construct_dataset_from_dataset_data(
690
765
dataset_response , self .platform
691
766
)
692
767
dataset_url = f"{ self .config .display_uri } { dataset_response .get ('result' , {}).get ('url' , '' )} "
768
+ now = datetime .now ().strftime ("%I:%M%p on %B %d, %Y" )
769
+ modified_ts = int (
770
+ dp .parse (dataset_data .get ("changed_on" ) or now ).timestamp () * 1000
771
+ )
772
+ modified_actor = f"urn:li:corpuser:{ (dataset_data .get ('changed_by' ) or {}).get ('username' , 'unknown' )} "
773
+ last_modified = AuditStampClass (time = modified_ts , actor = modified_actor )
693
774
694
775
modified_actor = f"urn:li:corpuser:{ self .owner_info .get ((dataset_data .get ('changed_by' ) or {}).get ('id' , - 1 ), 'unknown' )} "
695
776
modified_ts = int (
@@ -747,6 +828,14 @@ def construct_dataset_from_dataset_data(
747
828
]
748
829
)
749
830
831
+ response_result = dataset_response .get ("result" , {})
832
+
833
+ if self ._is_certified_metric (response_result ):
834
+ glossary_terms = self .parse_glossary_terms_from_metrics (
835
+ response_result .get ("metrics" , {}), last_modified
836
+ )
837
+ aspects_items .append (glossary_terms )
838
+
750
839
dataset_snapshot = DatasetSnapshot (
751
840
urn = datasource_urn ,
752
841
aspects = aspects_items ,
0 commit comments