@@ -391,6 +391,11 @@ def _get_external_table_partition_filters(
391
391
None if partition filters could not be determined
392
392
"""
393
393
try :
394
+ # Try sampling approach first - most efficient
395
+ sample_filters = self ._get_partitions_with_sampling (table , project , schema )
396
+ if sample_filters :
397
+ return sample_filters
398
+
394
399
# Step 1: Get partition columns from INFORMATION_SCHEMA
395
400
partition_cols_with_types = self ._get_partition_columns_from_info_schema (
396
401
table , project , schema
@@ -459,7 +464,7 @@ def _try_partition_combinations(
459
464
schema : str ,
460
465
partition_cols : List [str ],
461
466
partition_cols_with_types : Dict [str , str ],
462
- timeout : int = 300 , # This parameter will be ignored
467
+ timeout : int = 300 ,
463
468
) -> Optional [List [str ]]:
464
469
"""
465
470
Try to find combinations of partition values that return data.
@@ -860,6 +865,152 @@ def _try_find_most_populated_partition(
860
865
logger .warning ("All fallback approaches failed to find valid partition values" )
861
866
return None
862
867
868
def _verify_partition_has_data(
    self,
    table: BigqueryTable,
    project: str,
    schema: str,
    filters: List[str],
    timeout: int = 300,
) -> bool:
    """
    Verify that the partition filters actually return data.

    A ``COUNT(*)`` query restricted by the filters is tried first. If that
    query raises (including by exceeding ``timeout``), a cheaper
    ``SELECT 1 ... LIMIT 1`` existence probe is tried before giving up.

    Args:
        table: BigqueryTable instance
        project: BigQuery project ID
        schema: BigQuery dataset name
        filters: List of partition filter strings, combined with AND
        timeout: Seconds to wait for each verification query's result

    Returns:
        True if the count query reports at least one row, or if the count
        query failed and the fallback probe returned a row. False if
        ``filters`` is empty, the count is zero, or both queries failed.
    """
    if not filters:
        return False

    # Build WHERE clause from filters
    where_clause = " AND ".join(filters)

    # COUNT(*) without GROUP BY always yields exactly one row, so a LIMIT
    # would neither reduce the scan nor change the result; the partition
    # filters are what bound the amount of data read.
    query = f"""SELECT COUNT(*) as cnt
FROM `{project}.{schema}.{table.name}`
WHERE {where_clause}"""

    try:
        logger.debug(f"Verifying partition data with query: {query}")

        query_job = self.config.get_bigquery_client().query(query)
        # Honour the caller-supplied timeout instead of waiting unbounded.
        results = list(query_job.result(timeout=timeout))

        if results and results[0].cnt > 0:
            logger.info(
                f"Verified partition filters return {results[0].cnt} rows: {where_clause}"
            )
            return True

        logger.warning(f"Partition verification found no data: {where_clause}")
        return False
    except Exception as e:
        # Best-effort check: log and fall through to the cheaper probe.
        logger.warning(f"Error verifying partition data: {e}", exc_info=True)

    # Try with a simpler query as fallback
    try:
        simpler_query = f"""
SELECT 1
FROM `{project}.{schema}.{table.name}`
WHERE {where_clause}
LIMIT 1
"""
        query_job = self.config.get_bigquery_client().query(simpler_query)
        results = list(query_job.result(timeout=timeout))

        return len(results) > 0
    except Exception as simple_e:
        logger.warning(f"Simple verification also failed: {simple_e}")
        return False
934
+
935
def _get_partitions_with_sampling(
    self,
    table: BigqueryTable,
    project: str,
    schema: str,
) -> Optional[List[str]]:
    """
    Get partition filters using sampling to avoid full table scans.

    Partition columns are looked up via INFORMATION_SCHEMA, falling back to
    the table DDL. A ``TABLESAMPLE SYSTEM (1 PERCENT)`` query then fetches up
    to 100 rows, and one filter per partition column is built from the
    sampled values. The resulting filters are verified before being returned.

    Args:
        table: BigqueryTable instance
        project: BigQuery project ID
        schema: BigQuery dataset name

    Returns:
        List of partition filter strings, or None if partition columns could
        not be determined, the sample was empty, no filter could be built,
        verification failed, or any step raised an exception.
    """
    try:
        # First get partition columns (INFORMATION_SCHEMA, then DDL)
        partition_cols_with_types = self._get_partition_columns_from_info_schema(
            table, project, schema
        )

        if not partition_cols_with_types:
            partition_cols_with_types = self._get_partition_columns_from_ddl(
                table, project, schema
            )

        if not partition_cols_with_types:
            return None

        logger.info(
            f"Using sampling to find partition values for {len(partition_cols_with_types)} columns"
        )

        # Use TABLESAMPLE to get a small sample of data
        # NOTE(review): only the partition columns are read from the rows
        # below; projecting just those columns instead of * may reduce bytes
        # scanned - confirm that every name returned by the lookups above is
        # directly selectable before changing it.
        sample_query = f"""
SELECT *
FROM `{project}.{schema}.{table.name}` TABLESAMPLE SYSTEM (1 PERCENT)
LIMIT 100
"""

        query_job = self.config.get_bigquery_client().query(sample_query)
        results = list(query_job.result())

        if not results:
            logger.info("Sample query returned no results")
            return None

        # Prefer a single sampled row that has a value for every partition
        # column: values taken from one row are a combination that exists in
        # the table, whereas mixing values from different rows can produce a
        # combination with no data and make verification fail needlessly.
        complete_row = next(
            (
                row
                for row in results
                if all(
                    getattr(row, col_name, None) is not None
                    for col_name in partition_cols_with_types
                )
            ),
            None,
        )
        # No fully populated row: fall back to the first non-null value per
        # column across the whole sample.
        candidate_rows = results if complete_row is None else [complete_row]

        # Extract values for partition columns
        filters = []
        for col_name, data_type in partition_cols_with_types.items():
            for row in candidate_rows:
                val = getattr(row, col_name, None)
                if val is None:
                    continue
                filter_str = self._create_partition_filter_from_value(
                    col_name, val, data_type
                )
                filters.append(filter_str)
                logger.info(
                    f"Found partition value from sample: {col_name}={val}"
                )
                break

        # Verify the filters return data
        if filters and self._verify_partition_has_data(
            table, project, schema, filters
        ):
            logger.info(
                f"Successfully created partition filters from sample: {filters}"
            )
            return filters

        return None

    except Exception as e:
        # Sampling is an optimisation; callers fall back to other strategies
        # when None is returned.
        logger.warning(f"Error getting partition filters with sampling: {e}")
        return None
1013
+
863
1014
def _find_valid_partition_combination (
864
1015
self ,
865
1016
table : BigqueryTable ,
@@ -947,74 +1098,6 @@ def _find_valid_partition_combination(
947
1098
table , project , schema , partition_cols_with_types , timeout
948
1099
)
949
1100
950
- def _verify_partition_has_data (
951
- self ,
952
- table : BigqueryTable ,
953
- project : str ,
954
- schema : str ,
955
- filters : List [str ],
956
- timeout : int = 300 , # Increased from 120
957
- ) -> bool :
958
- """
959
- Verify that the partition filters actually return data.
960
-
961
- Args:
962
- table: BigqueryTable instance
963
- project: BigQuery project ID
964
- schema: BigQuery dataset name
965
- filters: List of partition filter strings
966
- timeout: Query timeout in seconds
967
-
968
- Returns:
969
- True if data exists, False otherwise
970
- """
971
- if not filters :
972
- return False
973
-
974
- # Build WHERE clause from filters
975
- where_clause = " AND " .join (filters )
976
-
977
- # Run a simple count query to check if data exists
978
- query = f"""SELECT COUNT(*) as cnt
979
- FROM `{ project } .{ schema } .{ table .name } `
980
- WHERE { where_clause }
981
- LIMIT 1000""" # Limit to avoid expensive full table scans
982
-
983
- try :
984
- logger .debug (f"Verifying partition data with query: { query } " )
985
-
986
- # Set a longer timeout for this operation
987
- query_job = self .config .get_bigquery_client ().query (query )
988
- results = list (query_job .result ())
989
-
990
- if results and results [0 ].cnt > 0 :
991
- logger .info (
992
- f"Verified partition filters return { results [0 ].cnt } rows: { where_clause } "
993
- )
994
- return True
995
- else :
996
- logger .warning (f"Partition verification found no data: { where_clause } " )
997
- return False
998
- except Exception as e :
999
- logger .warning (f"Error verifying partition data: { e } " , exc_info = True )
1000
-
1001
- # Try with a simpler query as fallback
1002
- try :
1003
- simpler_query = f"""
1004
- SELECT 1
1005
- FROM `{ project } .{ schema } .{ table .name } `
1006
- WHERE { where_clause }
1007
- LIMIT 1
1008
- """
1009
- query_job = self .config .get_bigquery_client ().query (simpler_query )
1010
- results = list (query_job .result ())
1011
-
1012
- return len (results ) > 0
1013
- except Exception as simple_e :
1014
- logger .warning (f"Simple verification also failed: { simple_e } " )
1015
- return False
1016
-
1017
- # Add this method to improve detection of partition columns from INFORMATION_SCHEMA if not found in partition_info
1018
1101
def _get_required_partition_filters (
1019
1102
self ,
1020
1103
table : BigqueryTable ,
@@ -1036,6 +1119,11 @@ def _get_required_partition_filters(
1036
1119
current_time = datetime .now (timezone .utc )
1037
1120
partition_filters = []
1038
1121
1122
+ # First try sampling approach as it's most efficient
1123
+ sample_filters = self ._get_partitions_with_sampling (table , project , schema )
1124
+ if sample_filters :
1125
+ return sample_filters
1126
+
1039
1127
# Get required partition columns from table info
1040
1128
required_partition_columns = set ()
1041
1129
@@ -1053,10 +1141,10 @@ def _get_required_partition_filters(
1053
1141
if not required_partition_columns :
1054
1142
try :
1055
1143
query = f"""SELECT column_name
1056
- FROM `{ project } .{ schema } .INFORMATION_SCHEMA.COLUMNS`
1057
- WHERE table_name = '{ table .name } ' AND is_partitioning_column = 'YES'"""
1144
+ FROM `{ project } .{ schema } .INFORMATION_SCHEMA.COLUMNS`
1145
+ WHERE table_name = '{ table .name } ' AND is_partitioning_column = 'YES'"""
1058
1146
query_job = self .config .get_bigquery_client ().query (query )
1059
- results = list (query_job )
1147
+ results = list (query_job . result () )
1060
1148
required_partition_columns = {row .column_name for row in results }
1061
1149
logger .debug (
1062
1150
f"Found partition columns from schema: { required_partition_columns } "
@@ -1076,14 +1164,14 @@ def _get_required_partition_filters(
1076
1164
1077
1165
logger .debug (f"Required partition columns: { required_partition_columns } " )
1078
1166
1079
- # Get column data types to handle casting correctly
1167
+ # Get column data types for the partition columns
1080
1168
column_data_types = {}
1081
1169
try :
1082
1170
query = f"""SELECT column_name, data_type
1083
- FROM `{ project } .{ schema } .INFORMATION_SCHEMA.COLUMNS`
1084
- WHERE table_name = '{ table .name } '"""
1171
+ FROM `{ project } .{ schema } .INFORMATION_SCHEMA.COLUMNS`
1172
+ WHERE table_name = '{ table .name } '"""
1085
1173
query_job = self .config .get_bigquery_client ().query (query )
1086
- results = list (query_job )
1174
+ results = list (query_job . result () )
1087
1175
column_data_types = {row .column_name : row .data_type for row in results }
1088
1176
except Exception as e :
1089
1177
logger .error (f"Error fetching column data types: { e } " )
@@ -1131,9 +1219,9 @@ def _get_required_partition_filters(
1131
1219
HAVING record_count > 0
1132
1220
ORDER BY { col_name } DESC
1133
1221
LIMIT 1
1134
- )
1135
- SELECT val, record_count
1136
- FROM PartitionStats"""
1222
+ )
1223
+ SELECT val, record_count
1224
+ FROM PartitionStats"""
1137
1225
logger .debug (f"Executing query for partition value: { query } " )
1138
1226
1139
1227
query_job = self .config .get_bigquery_client ().query (query )
@@ -1175,16 +1263,14 @@ def get_batch_kwargs(
1175
1263
"table_name" : bq_table .name ,
1176
1264
}
1177
1265
1178
- # Different handling path for external tables vs native tables
1266
+ # For external tables, add specific handling
1179
1267
if bq_table .external :
1180
- logger .info (f"Processing external table: { bq_table .name } " )
1181
- partition_filters = self ._get_external_table_partition_filters (
1182
- bq_table , db_name , schema_name , datetime .now (timezone .utc )
1183
- )
1184
- else :
1185
- partition_filters = self ._get_required_partition_filters (
1186
- bq_table , db_name , schema_name
1187
- )
1268
+ base_kwargs ["is_external" ] = "true"
1269
+ # Add any specific external table options needed
1270
+
1271
+ partition_filters = self ._get_required_partition_filters (
1272
+ bq_table , db_name , schema_name
1273
+ )
1188
1274
1189
1275
if partition_filters is None :
1190
1276
logger .warning (
@@ -1193,7 +1279,7 @@ def get_batch_kwargs(
1193
1279
)
1194
1280
return base_kwargs
1195
1281
1196
- # If no partition filters needed, return base kwargs
1282
+ # If no partition filters needed (e.g. some external tables) , return base kwargs
1197
1283
if not partition_filters :
1198
1284
return base_kwargs
1199
1285
0 commit comments