Skip to content

Commit 7f26249

Browse files
sid-acrylsleeperdeep
authored and committed
fix(ingest/tableau): project_path_pattern use in _is_denied_project (datahub-project#12010)
1 parent ecbcc31 commit 7f26249

File tree

2 files changed

+38
-19
lines changed

2 files changed

+38
-19
lines changed

metadata-ingestion/src/datahub/configuration/common.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -258,15 +258,15 @@ def allow_all(cls) -> "AllowDenyPattern":
258258
return AllowDenyPattern()
259259

260260
def allowed(self, string: str) -> bool:
261-
if self._denied(string):
261+
if self.denied(string):
262262
return False
263263

264264
return any(
265265
re.match(allow_pattern, string, self.regex_flags)
266266
for allow_pattern in self.allow
267267
)
268268

269-
def _denied(self, string: str) -> bool:
269+
def denied(self, string: str) -> bool:
270270
for deny_pattern in self.deny:
271271
if re.match(deny_pattern, string, self.regex_flags):
272272
return True
@@ -290,7 +290,7 @@ def get_allowed_list(self) -> List[str]:
290290
raise ValueError(
291291
"allow list must be fully specified to get list of allowed strings"
292292
)
293-
return [a for a in self.allow if not self._denied(a)]
293+
return [a for a in self.allow if not self.denied(a)]
294294

295295
def __eq__(self, other): # type: ignore
296296
return isinstance(other, self.__class__) and self.__dict__ == other.__dict__

metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py

+35-16
Original file line number | Diff line number | Diff line change
@@ -353,7 +353,7 @@ class TableauConfig(
353353

354354
project_path_separator: str = Field(
355355
default="/",
356-
description="The separator used for the project_pattern field between project names. By default, we use a slash. "
356+
description="The separator used for the project_path_pattern field between project names. By default, we use a slash. "
357357
"You can change this if your Tableau projects contain slashes in their names, and you'd like to filter by project.",
358358
)
359359

@@ -959,19 +959,36 @@ def _is_allowed_project(self, project: TableauProject) -> bool:
959959
return is_allowed
960960

961961
def _is_denied_project(self, project: TableauProject) -> bool:
962-
# Either project name or project path should exist in deny
963-
for deny_pattern in self.config.project_pattern.deny:
964-
# Either name or project path is denied
965-
if re.match(
966-
deny_pattern, project.name, self.config.project_pattern.regex_flags
967-
) or re.match(
968-
deny_pattern,
969-
self._get_project_path(project),
970-
self.config.project_pattern.regex_flags,
971-
):
972-
return True
973-
logger.info(f"project({project.name}) is not denied as per project_pattern")
974-
return False
962+
"""
963+
Why use an explicit denial check instead of the `AllowDenyPattern.allowed` method?
964+
965+
Consider a scenario where a Tableau site contains four projects: A, B, C, and D, with the following hierarchical relationship:
966+
967+
- **A**
968+
- **B** (Child of A)
969+
- **C** (Child of A)
970+
- **D**
971+
972+
In this setup:
973+
974+
- `project_pattern` is configured with `allow: ["A"]` and `deny: ["B"]`.
975+
- `extract_project_hierarchy` is set to `True`.
976+
977+
The goal is to extract assets from project A and its children while explicitly denying the child project B.
978+
979+
If we rely solely on the `project_pattern.allowed()` method, project C's assets will not be ingested.
980+
This happens because project C is not explicitly included in the `allow` list, nor is it part of the `deny` list.
981+
However, since `extract_project_hierarchy` is enabled, project C should ideally be included in the ingestion process unless explicitly denied.
982+
983+
To address this, the function explicitly checks the deny regex to ensure that project C’s assets are ingested if it is not specifically denied in the deny list. This approach ensures that the hierarchy is respected while adhering to the configured allow/deny rules.
984+
"""
985+
986+
# Either project_pattern or project_path_pattern is set in a recipe
987+
# TableauConfig.projects_backward_compatibility ensures that at least one of these properties is configured.
988+
989+
return self.config.project_pattern.denied(
990+
project.name
991+
) or self.config.project_path_pattern.denied(self._get_project_path(project))
975992

976993
def _init_tableau_project_registry(self, all_project_map: dict) -> None:
977994
list_of_skip_projects: List[TableauProject] = []
@@ -999,9 +1016,11 @@ def _init_tableau_project_registry(self, all_project_map: dict) -> None:
9991016
for project in list_of_skip_projects:
10001017
if (
10011018
project.parent_id in projects_to_ingest
1002-
and self._is_denied_project(project) is False
1019+
and not self._is_denied_project(project)
10031020
):
1004-
logger.debug(f"Project {project.name} is added in project registry")
1021+
logger.debug(
1022+
f"Project {project.name} is added in project registry as it's a child project and not explicitly denied in `deny` list"
1023+
)
10051024
projects_to_ingest[project.id] = project
10061025

10071026
# We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.

0 commit comments

Comments
 (0)