Commit 98c056d

refactor(ingest/tableau): mark the fetch_size configuration as deprecated (#12126)

1 parent e52a4de · commit 98c056d

2 files changed: +12 −7 lines
metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py (+11 −7)
@@ -49,6 +49,7 @@
     DatasetSourceConfigMixin,
 )
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,

@@ -380,11 +381,6 @@ class TableauConfig(
         description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
     )
 
-    fetch_size: int = Field(
-        default=250,
-        description="Specifies the number of records to retrieve in each batch during a query execution.",
-    )
-
     # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
     # returns warnings like this:
     # {

@@ -499,6 +495,10 @@ class TableauConfig(
         "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.",
     )
 
+    _fetch_size = pydantic_removed_field(
+        "fetch_size",
+    )
+
     # pre = True because we want to take some decision before pydantic initialize the configuration to default values
     @root_validator(pre=True)
     def projects_backward_compatibility(cls, values: Dict) -> Dict:

@@ -1147,7 +1147,7 @@ def get_connection_object_page(
     connection_type: str,
     query_filter: str,
     current_cursor: Optional[str],
-    fetch_size: int = 250,
+    fetch_size: int,
     retry_on_auth_error: bool = True,
     retries_remaining: Optional[int] = None,
 ) -> Tuple[dict, Optional[str], int]:

@@ -1344,7 +1344,11 @@ def get_connection_objects(
             connection_type=connection_type,
             query_filter=filter_,
             current_cursor=current_cursor,
-            fetch_size=self.config.fetch_size,
+            # `filter_page` contains metadata object IDs (e.g., Project IDs, Field IDs, Sheet IDs, etc.).
+            # The number of IDs is always less than or equal to page_size.
+            # If the IDs are primary keys, the number of metadata objects to load matches the number of records to return.
+            # In our case, the IDs are mostly primary keys; therefore, fetch_size is set equal to page_size.
+            fetch_size=page_size,
         )
 
         yield from connection_objects.get(c.NODES) or []
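
For context on the second and third hunks: `pydantic_removed_field` replaces the old `Field` declaration so that recipes which still set `fetch_size` keep parsing instead of failing on an unknown key. Below is a minimal sketch of that pattern, assuming pydantic v1; the `removed_field` helper and `ExampleConfig` model are illustrative stand-ins, not DataHub's actual implementation.

import warnings

from pydantic import BaseModel, root_validator


def removed_field(field_name: str):
    """Build a pre root-validator that drops a retired config key."""

    @root_validator(pre=True, allow_reuse=True)
    def _remover(cls, values: dict) -> dict:
        if field_name in values:
            # Warn-and-drop (an assumption here): old recipes keep loading
            # after the field's removal instead of erroring out.
            warnings.warn(f"{field_name} is deprecated and now ignored")
            values.pop(field_name)
        return values

    return _remover


class ExampleConfig(BaseModel):
    page_size: int = 10

    # Assigned to a private attribute, mirroring `_fetch_size = ...` above.
    _fetch_size = removed_field("fetch_size")


# A recipe that still sets fetch_size parses cleanly; the key is dropped.
cfg = ExampleConfig.parse_obj({"page_size": 50, "fetch_size": 250})
assert cfg.page_size == 50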

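The last hunk's reasoning can be shown with a small self-contained paging loop: once fetch_size equals page_size, one query per filter page returns the whole page. `fake_get_page` below stands in for the real Tableau Metadata API call; all names here are illustrative, not the source's.

from typing import Iterator, List, Optional, Tuple

DATA = [{"id": i} for i in range(25)]  # stand-in for metadata objects


def fake_get_page(
    cursor: Optional[str], fetch_size: int
) -> Tuple[List[dict], Optional[str]]:
    """Stand-in for get_connection_object_page: one query per call."""
    start = int(cursor or 0)
    page = DATA[start : start + fetch_size]
    next_cursor = str(start + fetch_size) if start + fetch_size < len(DATA) else None
    return page, next_cursor


def iter_objects(page_size: int) -> Iterator[dict]:
    cursor: Optional[str] = None
    while True:
        # Each filter page holds at most page_size IDs, and each ID maps to
        # one record when IDs are primary keys, so fetch_size=page_size lets
        # a single query return the whole page.
        nodes, cursor = fake_get_page(cursor, fetch_size=page_size)
        yield from nodes
        if cursor is None:
            break


# 25 objects paged 10 at a time: pages of 10, 10, and 5.
assert len(list(iter_objects(page_size=10))) == 25
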
metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py (+1 −0)
@@ -1324,6 +1324,7 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph):
         query_filter=mock.MagicMock(),
         current_cursor=None,
         retries_remaining=1,
+        fetch_size=10,
     )
 
     warnings = list(reporter.warnings)
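
The test change follows directly from the signature change in tableau.py: with the `250` default removed, every caller of `get_connection_object_page` must now pass `fetch_size` explicitly. A toy illustration of that effect, using a simplified hypothetical signature rather than the real one:

def get_page(current_cursor, fetch_size):  # no default for fetch_size
    return [], None


get_page(None, fetch_size=10)  # OK: the caller supplies the batch size

try:
    get_page(None)  # the old implicit default no longer exists
except TypeError as exc:
    print(exc)  # get_page() missing 1 required positional argument: 'fetch_size'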
