@@ -1298,26 +1298,20 @@ def _get_external_table_partition_filters(
         metadata = self._get_table_metadata(table, project, schema)
         partition_columns = metadata["partition_columns"]

-        # If no partition columns, try using TABLESAMPLE for safe profiling
+        # If no partition columns, use TABLESAMPLE for safe profiling
         if not partition_columns:
             logger.info(
                 f"No partition columns found for external table {table.name}, using TABLESAMPLE"
             )
-            # For external tables with no partitions, use a small sample - return empty list
+            # Return empty list for external tables with no partitions
+            # (TABLESAMPLE will be applied when generating SQL)
             return []

-        # External table size estimation - be cautious with large tables
-        is_large_table = False
-        if metadata.get("size_bytes", 0) > 10_000_000_000:  # 10 GB
-            is_large_table = True
-            logger.warning(
-                f"External table {table.name} is large, using aggressive filtering"
-            )
-        elif metadata.get("row_count", 0) > 10_000_000:  # 10M rows
-            is_large_table = True
-            logger.warning(
-                f"External table {table.name} has many rows, using aggressive filtering"
-            )
+        # IMPORTANT: Always treat external tables as large tables, regardless of reported size
+        is_large_table = True
+        logger.info(
+            f"External table {table.name} is being treated as a large table for safe profiling"
+        )

         # For external tables, prioritize date/time-based partitioning first
         date_filters = self._try_date_based_filtering_for_external(
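
For context on the TABLESAMPLE path above, here is a minimal sketch of how an empty filter list could translate into a sampled profiling query downstream. The helper name build_profiling_query and the 1 PERCENT rate are assumptions for illustration; only BigQuery's TABLESAMPLE SYSTEM (... PERCENT) clause is standard syntax, and the real SQL generation lives elsewhere in this module.

def build_profiling_query(
    project: str, schema: str, table_name: str, partition_filters: list
) -> str:
    # Hypothetical helper, not part of this diff.
    base = f"SELECT * FROM `{project}.{schema}.{table_name}`"
    if not partition_filters:
        # External table with no partition columns: sample a fraction of
        # the data instead of scanning the whole table.
        return f"{base} TABLESAMPLE SYSTEM (1 PERCENT)"
    return f"{base} WHERE {' AND '.join(partition_filters)}"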
@@ -1903,9 +1897,15 @@ def _get_required_partition_filters(
             logger.info(f"Using cached filters for {table_key}")
             return self._successful_filters_cache[table_key]

-        # Optimization: First check if this table has minimal data or is very small
+        # Important change - External tables should always be considered "large" for profiling
+        # This ensures we use proper strategies even if their reported size is 0
         is_small_table = False
-        if (
+        if table.external:
+            logger.info(
+                f"Table {table.name} is an external table, treating as large table for profiling"
+            )
+            is_small_table = False
+        elif (
             table.size_in_bytes is not None
             and table.size_in_bytes < 100_000_000  # Less than 100MB
             and table.rows_count is not None
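
A standalone sketch of the size-classification rule after this change. The BigqueryTable dataclass below is an illustrative stand-in for the real table model; the 100 MB threshold comes from the diff, and the original condition continues with a row-count bound that falls outside this hunk.

from dataclasses import dataclass
from typing import Optional

@dataclass
class BigqueryTable:  # illustrative stand-in for the real table model
    name: str
    external: bool
    size_in_bytes: Optional[int] = None
    rows_count: Optional[int] = None

def is_small_for_profiling(table: BigqueryTable) -> bool:
    # External tables are never treated as small: their reported size is
    # often 0 because the data lives outside BigQuery-managed storage.
    if table.external:
        return False
    return (
        table.size_in_bytes is not None
        and table.size_in_bytes < 100_000_000  # Less than 100MB
        and table.rows_count is not None
        # ...the original check continues with a row-count threshold
    )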
@@ -1928,6 +1928,7 @@ def _get_required_partition_filters(
         logger.info(f"Found partition columns for {table.name}: {partition_columns}")

         # For small tables with partitioning, we can try without filters first
+        # But NOT for external tables - they should be handled cautiously
         if is_small_table and not table.external:
             logger.info(
                 "Small table with partitioning, checking if full scan is viable"
@@ -2059,7 +2060,8 @@ def get_batch_kwargs(
             and bq_table.size_in_bytes > 5_000_000_000
             or bq_table.rows_count
             and bq_table.rows_count > 50_000_000
-        ):  # > 5GB
+            or bq_table.external  # IMPORTANT: Always add hints for external tables
+        ):  # > 5GB or external
             needs_optimization_hints = True

        if needs_optimization_hints:
if needs_optimization_hints :
0 commit comments