Skip to content

Commit 20acb75

Browse files
committed
Update profiler.py
1 parent 910c7e3 commit 20acb75

File tree

1 file changed

+19
-17
lines changed
  • metadata-ingestion/src/datahub/ingestion/source/bigquery_v2

1 file changed

+19
-17
lines changed

metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py

+19 −17
Original file line number | Diff line number | Diff line change
@@ -1298,26 +1298,20 @@ def _get_external_table_partition_filters(
12981298
metadata = self._get_table_metadata(table, project, schema)
12991299
partition_columns = metadata["partition_columns"]
13001300

1301-
# If no partition columns, try using TABLESAMPLE for safe profiling
1301+
# If no partition columns, use TABLESAMPLE for safe profiling
13021302
if not partition_columns:
13031303
logger.info(
13041304
f"No partition columns found for external table {table.name}, using TABLESAMPLE"
13051305
)
1306-
# For external tables with no partitions, use a small sample - return empty list
1306+
# Return empty list for external tables with no partitions
1307+
# (TABLESAMPLE will be applied when generating SQL)
13071308
return []
13081309

1309-
# External table size estimation - be cautious with large tables
1310-
is_large_table = False
1311-
if metadata.get("size_bytes", 0) > 10_000_000_000: # 10 GB
1312-
is_large_table = True
1313-
logger.warning(
1314-
f"External table {table.name} is large, using aggressive filtering"
1315-
)
1316-
elif metadata.get("row_count", 0) > 10_000_000: # 10M rows
1317-
is_large_table = True
1318-
logger.warning(
1319-
f"External table {table.name} has many rows, using aggressive filtering"
1320-
)
1310+
# IMPORTANT: Always treat external tables as large tables, regardless of reported size
1311+
is_large_table = True
1312+
logger.info(
1313+
f"External table {table.name} is being treated as a large table for safe profiling"
1314+
)
13211315

13221316
# For external tables, prioritize date/time-based partitioning first
13231317
date_filters = self._try_date_based_filtering_for_external(
@@ -1903,9 +1897,15 @@ def _get_required_partition_filters(
19031897
logger.info(f"Using cached filters for {table_key}")
19041898
return self._successful_filters_cache[table_key]
19051899

1906-
# Optimization: First check if this table has minimal data or is very small
1900+
# Important change - External tables should always be considered "large" for profiling
1901+
# This ensures we use proper strategies even if their reported size is 0
19071902
is_small_table = False
1908-
if (
1903+
if table.external:
1904+
logger.info(
1905+
f"Table {table.name} is an external table, treating as large table for profiling"
1906+
)
1907+
is_small_table = False
1908+
elif (
19091909
table.size_in_bytes is not None
19101910
and table.size_in_bytes < 100_000_000 # Less than 100MB
19111911
and table.rows_count is not None
@@ -1928,6 +1928,7 @@ def _get_required_partition_filters(
19281928
logger.info(f"Found partition columns for {table.name}: {partition_columns}")
19291929

19301930
# For small tables with partitioning, we can try without filters first
1931+
# But NOT for external tables - they should be handled cautiously
19311932
if is_small_table and not table.external:
19321933
logger.info(
19331934
"Small table with partitioning, checking if full scan is viable"
@@ -2059,7 +2060,8 @@ def get_batch_kwargs(
20592060
and bq_table.size_in_bytes > 5_000_000_000
20602061
or bq_table.rows_count
20612062
and bq_table.rows_count > 50_000_000
2062-
): # > 5GB
2063+
or bq_table.external # IMPORTANT: Always add hints for external tables
2064+
): # > 5GB or external
20632065
needs_optimization_hints = True
20642066

20652067
if needs_optimization_hints:

0 commit comments

Comments (0)