@@ -84,32 +84,69 @@ def _get_external_table_partition_filters(
84
84
None if partition filters could not be determined
85
85
"""
86
86
try :
87
+ # For external tables, we need to check specifically for partitioning columns
88
+ # and also look at the DDL if available to detect hive-style partitioning
89
+
90
+ # First, try to get partition columns directly from INFORMATION_SCHEMA
87
91
query = f"""SELECT column_name, data_type
88
92
FROM `{ project } .{ schema } .INFORMATION_SCHEMA.COLUMNS`
89
93
WHERE table_name = '{ table .name } ' AND is_partitioning_column = 'YES'"""
90
94
query_job = self .config .get_bigquery_client ().query (query )
91
95
results = list (query_job )
92
96
93
- if results :
94
- # For external tables, also capture data type
95
- partition_cols_with_types = {
96
- row .column_name : row .data_type for row in results
97
- }
98
- required_partition_columns = set (partition_cols_with_types .keys ())
97
+ partition_cols_with_types = {
98
+ row .column_name : row .data_type for row in results
99
+ }
100
+
101
+ # If we didn't find any partition columns through INFORMATION_SCHEMA,
102
+ # check the DDL for external table declarations that have partition info
103
+ if not partition_cols_with_types and table .ddl :
104
+ # Very simple DDL parsing to look for PARTITION BY statements
105
+ if "PARTITION BY" in table .ddl .upper ():
106
+ ddl_lines = table .ddl .upper ().split ("\n " )
107
+ for line in ddl_lines :
108
+ if "PARTITION BY" in line :
109
+ # Look for column names mentioned in the PARTITION BY clause
110
+ # This is a basic extraction and may need enhancement for complex DDLs
111
+ parts = (
112
+ line .split ("PARTITION BY" )[1 ]
113
+ .split ("OPTIONS" )[0 ]
114
+ .strip ()
115
+ )
116
+ potential_cols = [
117
+ col .strip (", `()" ) for col in parts .split ()
118
+ ]
119
+
120
+ # Get all columns to check data types for potential partition columns
121
+ all_cols_query = f"""SELECT column_name, data_type
122
+ FROM `{ project } .{ schema } .INFORMATION_SCHEMA.COLUMNS`
123
+ WHERE table_name = '{ table .name } '"""
124
+ all_cols_job = self .config .get_bigquery_client ().query (
125
+ all_cols_query
126
+ )
127
+ all_cols_results = list (all_cols_job )
128
+ all_cols_dict = {
129
+ row .column_name .upper (): row .data_type
130
+ for row in all_cols_results
131
+ }
132
+
133
+ # Add potential partition columns with their types
134
+ for col in potential_cols :
135
+ if col in all_cols_dict :
136
+ partition_cols_with_types [col ] = all_cols_dict [col ]
137
+
138
+ partition_filters = []
139
+
140
+ # Process all identified partition columns
141
+ for col_name , data_type in partition_cols_with_types .items ():
99
142
logger .debug (
100
- f"Found external partition columns : { required_partition_columns } "
143
+ f"Processing external table partition column : { col_name } with type { data_type } "
101
144
)
102
145
103
- # For tables with single DATE/TIMESTAMP partition, we can use current date/time
104
- if len (required_partition_columns ) == 1 :
105
- col_name = list (required_partition_columns )[0 ]
106
- col_type = partition_cols_with_types [col_name ].upper ()
107
-
108
- # For temporal partitions, find the latest non-empty partition
109
- if col_type in ("DATE" , "TIMESTAMP" , "DATETIME" ):
110
- # Query to find latest non-empty partition for temporal columns
111
- temporal_query = f"""WITH PartitionStats AS (
112
- SELECT { col_name } as partition_value,
146
+ # For each partition column, we need to find a valid value
147
+ query = f"""
148
+ WITH PartitionStats AS (
149
+ SELECT { col_name } as val,
113
150
COUNT(*) as record_count
114
151
FROM `{ project } .{ schema } .{ table .name } `
115
152
WHERE { col_name } IS NOT NULL
@@ -118,41 +155,57 @@ def _get_external_table_partition_filters(
118
155
ORDER BY { col_name } DESC
119
156
LIMIT 1
120
157
)
121
- SELECT partition_value , record_count
158
+ SELECT val , record_count
122
159
FROM PartitionStats"""
123
160
124
- query_job = self .config .get_bigquery_client ().query (
125
- temporal_query
126
- )
127
- temporal_results = list (query_job )
161
+ try :
162
+ query_job = self .config .get_bigquery_client ().query (query )
163
+ results = list (query_job .result (timeout = 30 ))
128
164
129
- if not temporal_results :
130
- logger .warning (
131
- f"No non-empty partitions found for { col_name } "
132
- )
133
- return None
165
+ if not results or results [ 0 ]. val is None :
166
+ logger .warning (
167
+ f"No non-empty partition values found for column { col_name } "
168
+ )
169
+ continue
134
170
135
- partition_value = temporal_results [0 ].partition_value
136
- record_count = temporal_results [0 ].record_count
171
+ val = results [0 ].val
172
+ record_count = results [0 ].record_count
173
+ logger .info (
174
+ f"Selected external partition { col_name } ={ val } with { record_count } records"
175
+ )
137
176
138
- if col_type == "DATE" :
139
- filter_value = (
140
- f"DATE '{ partition_value .strftime ('%Y-%m-%d' )} '"
177
+ # Format the filter based on the data type
178
+ data_type_upper = data_type .upper () if data_type else ""
179
+ if data_type_upper in ("STRING" , "VARCHAR" ):
180
+ partition_filters .append (f"`{ col_name } ` = '{ val } '" )
181
+ elif data_type_upper == "DATE" :
182
+ partition_filters .append (f"`{ col_name } ` = DATE '{ val } '" )
183
+ elif data_type_upper in ("TIMESTAMP" , "DATETIME" ):
184
+ if isinstance (val , datetime ):
185
+ partition_filters .append (
186
+ f"`{ col_name } ` = TIMESTAMP '{ val .strftime ('%Y-%m-%d %H:%M:%S' )} '"
187
+ )
188
+ else :
189
+ partition_filters .append (
190
+ f"`{ col_name } ` = TIMESTAMP '{ val } '"
141
191
)
142
- else : # TIMESTAMP or DATETIME
143
- filter_value = f"TIMESTAMP '{ partition_value .strftime ('%Y-%m-%d %H:%M:%S' )} '"
192
+ else :
193
+ # Default to numeric or other type
194
+ partition_filters .append (f"`{ col_name } ` = { val } " )
144
195
145
- logger .info (
146
- f"Selected temporal partition { col_name } ={ partition_value } "
147
- f"with { record_count } records"
148
- )
149
- return [f"`{ col_name } ` = { filter_value } " ]
196
+ except Exception as e :
197
+ logger .warning (
198
+ f"Error determining value for partition column { col_name } : { e } "
199
+ )
200
+ continue
201
+
202
+ return partition_filters
150
203
151
- return [] # No partitions found (valid for external tables)
152
204
except Exception as e :
153
205
logger .error (f"Error checking external table partitioning: { e } " )
154
206
return None
155
207
208
+ # Falls back to INFORMATION_SCHEMA.COLUMNS to detect partition columns when partition_info yields none
156
209
def _get_required_partition_filters (
157
210
self ,
158
211
table : BigqueryTable ,
@@ -187,15 +240,30 @@ def _get_required_partition_filters(
187
240
col .name for col in table .partition_info .columns if col
188
241
)
189
242
190
- # If no partition columns found, check for external table partitioning
243
+ # If no partition columns found from partition_info, query INFORMATION_SCHEMA
244
+ if not required_partition_columns :
245
+ try :
246
+ query = f"""SELECT column_name
247
+ FROM `{ project } .{ schema } .INFORMATION_SCHEMA.COLUMNS`
248
+ WHERE table_name = '{ table .name } ' AND is_partitioning_column = 'YES'"""
249
+ query_job = self .config .get_bigquery_client ().query (query )
250
+ results = list (query_job )
251
+ required_partition_columns = {row .column_name for row in results }
252
+ logger .debug (
253
+ f"Found partition columns from schema: { required_partition_columns } "
254
+ )
255
+ except Exception as e :
256
+ logger .error (f"Error querying partition columns: { e } " )
257
+
258
+ # If still no partition columns found, check for external table partitioning
191
259
if not required_partition_columns :
192
260
logger .debug (f"No partition columns found for table { table .name } " )
193
261
if table .external :
194
262
return self ._get_external_table_partition_filters (
195
263
table , project , schema , current_time
196
264
)
197
265
else :
198
- return None # Internal table without partitions (unexpected)
266
+ return None
199
267
200
268
logger .debug (f"Required partition columns: { required_partition_columns } " )
201
269
@@ -245,8 +313,7 @@ def _get_required_partition_filters(
245
313
for col_name in other_columns :
246
314
try :
247
315
# Query to get latest non-empty partition
248
- query = f"""
249
- WITH PartitionStats AS (
316
+ query = f"""WITH PartitionStats AS (
250
317
SELECT { col_name } as val,
251
318
COUNT(*) as record_count
252
319
FROM `{ project } .{ schema } .{ table .name } `
@@ -299,14 +366,16 @@ def get_batch_kwargs(
299
366
"table_name" : bq_table .name ,
300
367
}
301
368
302
- # For external tables, add specific handling
369
+ # Different handling path for external tables vs native tables
303
370
if bq_table .external :
304
- base_kwargs ["is_external" ] = "true"
305
- # Add any specific external table options needed
306
-
307
- partition_filters = self ._get_required_partition_filters (
308
- bq_table , db_name , schema_name
309
- )
371
+ logger .info (f"Processing external table: { bq_table .name } " )
372
+ partition_filters = self ._get_external_table_partition_filters (
373
+ bq_table , db_name , schema_name , datetime .now (timezone .utc )
374
+ )
375
+ else :
376
+ partition_filters = self ._get_required_partition_filters (
377
+ bq_table , db_name , schema_name
378
+ )
310
379
311
380
if partition_filters is None :
312
381
logger .warning (
@@ -315,7 +384,7 @@ def get_batch_kwargs(
315
384
)
316
385
return base_kwargs
317
386
318
- # If no partition filters needed (e.g. some external tables) , return base kwargs
387
+ # If no partition filters needed, return base kwargs
319
388
if not partition_filters :
320
389
return base_kwargs
321
390
0 commit comments