|
11 | 11 | from pydantic.fields import Field
|
12 | 12 | from wcmatch import pathlib
|
13 | 13 |
|
14 |
| -from datahub.configuration.common import ConfigModel |
| 14 | +from datahub.configuration.common import AllowDenyPattern, ConfigModel |
15 | 15 | from datahub.ingestion.source.aws.s3_util import is_s3_uri
|
16 | 16 | from datahub.ingestion.source.azure.abs_utils import is_abs_uri
|
17 | 17 | from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
|
@@ -145,6 +145,11 @@ class Config:
|
145 | 145 | description="Include hidden folders in the traversal (folders starting with . or _",
|
146 | 146 | )
|
147 | 147 |
|
| 148 | + tables_filter_pattern: AllowDenyPattern = Field( |
| 149 | + default=AllowDenyPattern.allow_all(), |
| 150 | + description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.", |
| 151 | + ) |
| 152 | + |
148 | 153 | def is_path_hidden(self, path: str) -> bool:
|
149 | 154 | # Split the path into directories and filename
|
150 | 155 | dirs, filename = os.path.split(path)
|
@@ -177,6 +182,12 @@ def allowed(self, path: str, ignore_ext: bool = False) -> bool:
|
177 | 182 | ):
|
178 | 183 | return False
|
179 | 184 | logger.debug(f"{path} is not excluded")
|
| 185 | + |
| 186 | + table_name, _ = self.extract_table_name_and_path(path) |
| 187 | + if not self.tables_filter_pattern.allowed(table_name): |
| 188 | + return False |
| 189 | + logger.debug(f"{path} is passed table name check") |
| 190 | + |
180 | 191 | ext = os.path.splitext(path)[1].strip(".")
|
181 | 192 |
|
182 | 193 | if not ignore_ext:
|
@@ -218,6 +229,15 @@ def dir_allowed(self, path: str) -> bool:
|
218 | 229 | exclude_path.rstrip("/"), flags=pathlib.GLOBSTAR
|
219 | 230 | ):
|
220 | 231 | return False
|
| 232 | + |
| 233 | + file_name_pattern = self.include.rsplit("/", 1)[1] |
| 234 | + table_name, _ = self.extract_table_name_and_path( |
| 235 | + os.path.join(path, file_name_pattern) |
| 236 | + ) |
| 237 | + if not self.tables_filter_pattern.allowed(table_name): |
| 238 | + return False |
| 239 | + logger.debug(f"{path} is passed table name check") |
| 240 | + |
221 | 241 | return True
|
222 | 242 |
|
223 | 243 | @classmethod
|
|
0 commit comments