Skip to content

Commit f327981

Browse files
feat(ingest): allowdenypattern for dashboard, chart, dataset in superset (#12782)
1 parent 9e7f482 commit f327981

File tree

4 files changed: +84 -8 lines changed

metadata-ingestion/src/datahub/ingestion/source/preset.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,13 @@
1616
support_status,
1717
)
1818
from datahub.ingestion.source.state.stale_entity_removal_handler import (
19-
StaleEntityRemovalSourceReport,
2019
StatefulStaleMetadataRemovalConfig,
2120
)
22-
from datahub.ingestion.source.superset import SupersetConfig, SupersetSource
21+
from datahub.ingestion.source.superset import (
22+
SupersetConfig,
23+
SupersetSource,
24+
SupersetSourceReport,
25+
)
2326
from datahub.utilities import config_clean
2427

2528
logger = logging.getLogger(__name__)
@@ -76,15 +79,15 @@ class PresetSource(SupersetSource):
7679
"""
7780

7881
config: PresetConfig
79-
report: StaleEntityRemovalSourceReport
82+
report: SupersetSourceReport
8083
platform = "preset"
8184

8285
def __init__(self, ctx: PipelineContext, config: PresetConfig):
8386
logger.info(f"ctx is {ctx}")
8487

8588
super().__init__(ctx, config)
8689
self.config = config
87-
self.report = StaleEntityRemovalSourceReport()
90+
self.report = SupersetSourceReport()
8891
self.platform = "preset"
8992

9093
def login(self):

metadata-ingestion/src/datahub/ingestion/source/superset.py

+69-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22
import logging
3+
from dataclasses import dataclass, field
34
from datetime import datetime
45
from functools import lru_cache
56
from typing import Any, Dict, Iterable, List, Optional
@@ -80,6 +81,7 @@
8081
UpstreamLineageClass,
8182
)
8283
from datahub.utilities import config_clean
84+
from datahub.utilities.lossy_collections import LossyList
8385
from datahub.utilities.registries.domain_registry import DomainRegistry
8486

8587
logger = logging.getLogger(__name__)
@@ -107,6 +109,14 @@
107109
platform_without_databases = ["druid"]
108110

109111

112+
@dataclass
class SupersetSourceReport(StaleEntityRemovalSourceReport):
    """Source report that also keeps track of entities dropped by filtering.

    The emit methods call ``report_dropped`` with a human-readable message
    whenever a dashboard, chart, or dataset is rejected by its configured
    allow/deny pattern; those messages accumulate in ``filtered``.
    """

    # A factory is used so each report instance gets its own collection
    # rather than sharing one mutable default across instances.
    filtered: LossyList[str] = field(default_factory=LossyList)

    def report_dropped(self, name: str) -> None:
        """Record ``name`` as filtered out by appending it to ``filtered``."""
        dropped_entries = self.filtered
        dropped_entries.append(name)
118+
119+
110120
class SupersetDataset(BaseModel):
111121
id: int
112122
table_name: str
@@ -142,6 +152,18 @@ class SupersetConfig(
142152
default=dict(),
143153
description="regex patterns for tables to filter to assign domain_key. ",
144154
)
155+
dataset_pattern: AllowDenyPattern = Field(
156+
default=AllowDenyPattern.allow_all(),
157+
description="Regex patterns for dataset to filter in ingestion.",
158+
)
159+
chart_pattern: AllowDenyPattern = Field(
160+
AllowDenyPattern.allow_all(),
161+
description="Patterns for selecting chart names that are to be included",
162+
)
163+
dashboard_pattern: AllowDenyPattern = Field(
164+
AllowDenyPattern.allow_all(),
165+
description="Patterns for selecting dashboard names that are to be included",
166+
)
145167
username: Optional[str] = Field(default=None, description="Superset username.")
146168
password: Optional[str] = Field(default=None, description="Superset password.")
147169
# Configuration for stateful ingestion
@@ -222,7 +244,7 @@ class SupersetSource(StatefulIngestionSourceBase):
222244
"""
223245

224246
config: SupersetConfig
225-
report: StaleEntityRemovalSourceReport
247+
report: SupersetSourceReport
226248
platform = "superset"
227249

228250
def __hash__(self):
@@ -231,7 +253,7 @@ def __hash__(self):
231253
def __init__(self, ctx: PipelineContext, config: SupersetConfig):
232254
super().__init__(config, ctx)
233255
self.config = config
234-
self.report = StaleEntityRemovalSourceReport()
256+
self.report = SupersetSourceReport()
235257
if self.config.domain:
236258
self.domain_registry = DomainRegistry(
237259
cached_domains=[domain_id for domain_id in self.config.domain],
@@ -449,6 +471,15 @@ def construct_dashboard_from_api_data(
449471
def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
450472
for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
451473
try:
474+
dashboard_id = str(dashboard_data.get("id"))
475+
dashboard_title = dashboard_data.get("dashboard_title", "")
476+
477+
if not self.config.dashboard_pattern.allowed(dashboard_title):
478+
self.report.report_dropped(
479+
f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
480+
)
481+
continue
482+
452483
dashboard_snapshot = self.construct_dashboard_from_api_data(
453484
dashboard_data
454485
)
@@ -461,7 +492,7 @@ def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
461492
mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
462493
yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
463494
yield from self._get_domain_wu(
464-
title=dashboard_data.get("dashboard_title", ""),
495+
title=dashboard_title,
465496
entity_urn=dashboard_snapshot.urn,
466497
)
467498

@@ -569,12 +600,37 @@ def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
569600
def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
570601
for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
571602
try:
603+
chart_id = str(chart_data.get("id"))
604+
chart_name = chart_data.get("slice_name", "")
605+
606+
if not self.config.chart_pattern.allowed(chart_name):
607+
self.report.report_dropped(
608+
f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
609+
)
610+
continue
611+
612+
# Emit a warning if charts use data from a dataset that will be filtered out
613+
if self.config.dataset_pattern != AllowDenyPattern.allow_all():
614+
datasource_id = chart_data.get("datasource_id")
615+
if datasource_id:
616+
dataset_response = self.get_dataset_info(datasource_id)
617+
dataset_name = dataset_response.get("result", {}).get(
618+
"table_name", ""
619+
)
620+
621+
if dataset_name and not self.config.dataset_pattern.allowed(
622+
dataset_name
623+
):
624+
self.report.warning(
625+
f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
626+
)
627+
572628
chart_snapshot = self.construct_chart_from_chart_data(chart_data)
573629

574630
mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
575631
except Exception as e:
576632
self.report.warning(
577-
f"Failed to construct chart snapshot. Chart name: {chart_data.get('table_name')}. Error: \n{e}"
633+
f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
578634
)
579635
continue
580636
# Emit the chart
@@ -716,6 +772,15 @@ def construct_dataset_from_dataset_data(
716772
def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
717773
for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
718774
try:
775+
dataset_name = dataset_data.get("table_name", "")
776+
777+
# Check if dataset should be filtered by dataset name
778+
if not self.config.dataset_pattern.allowed(dataset_name):
779+
self.report.report_dropped(
780+
f"Dataset '{dataset_name}' filtered by dataset_pattern"
781+
)
782+
continue
783+
719784
dataset_snapshot = self.construct_dataset_from_dataset_data(
720785
dataset_data
721786
)

metadata-ingestion/tests/unit/test_preset_source.py

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datahub.configuration.common import AllowDenyPattern
12
from datahub.ingestion.source.preset import PresetConfig
23

34

@@ -10,6 +11,9 @@ def test_default_values():
1011
assert config.env == "PROD"
1112
assert config.api_key is None
1213
assert config.api_secret is None
14+
assert config.dataset_pattern == AllowDenyPattern.allow_all()
15+
assert config.chart_pattern == AllowDenyPattern.allow_all()
16+
assert config.dashboard_pattern == AllowDenyPattern.allow_all()
1317

1418

1519
def test_set_display_uri():

metadata-ingestion/tests/unit/test_superset_source.py

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datahub.configuration.common import AllowDenyPattern
12
from datahub.ingestion.api.common import PipelineContext
23
from datahub.ingestion.source.superset import SupersetConfig, SupersetSource
34

@@ -11,6 +12,9 @@ def test_default_values():
1112
assert config.env == "PROD"
1213
assert config.username is None
1314
assert config.password is None
15+
assert config.dataset_pattern == AllowDenyPattern.allow_all()
16+
assert config.chart_pattern == AllowDenyPattern.allow_all()
17+
assert config.dashboard_pattern == AllowDenyPattern.allow_all()
1418

1519

1620
def test_set_display_uri():

0 commit comments

Comments
 (0)