@@ -67,6 +67,18 @@ def __init__(
67
67
self .report = report
68
68
self .client = client
69
69
70
+ if self .config .profiling .use_sampling :
71
+ self .sample_size = self .config .profiling .sample_size
72
+ else :
73
+ self .sample_size = 0
74
+
75
+ self .field_sample_count = self .config .profiling .field_sample_values_limit
76
+
77
+ if self .config .profiling .max_number_of_fields_to_profile :
78
+ self .sample_fields = self .config .profiling .max_number_of_fields_to_profile
79
+ else :
80
+ self .sample_fields = 0
81
+
70
82
try :
71
83
self .loop = asyncio .get_running_loop ()
72
84
except RuntimeError :
@@ -95,7 +107,10 @@ def generate_profile(self, keyspace: str) -> Iterable[MetadataWorkUnit]:
95
107
platform_instance = self .config .cluster_name ,
96
108
)
97
109
98
- if not self .config .profile_pattern .allowed (keyspace ):
110
+ if (
111
+ not self .config .profile_pattern .allowed (keyspace )
112
+ and self .config .profiling .report_dropped_profiles
113
+ ):
99
114
self .report .profiling_skipped_table_profile_pattern [keyspace ] += 1
100
115
logger .info (f"Profiling not allowed for Keyspace { keyspace } " )
101
116
return
@@ -193,8 +208,12 @@ async def _collect_column_data(
193
208
self , keyspace : str , profile_data : ProfileData
194
209
) -> ProfileData :
195
210
document_total_count : int = 0
211
+ dropped_fields = set ()
212
+ dropped_nested_fields = set ()
196
213
197
- aggregator = CouchbaseAggregate (self .client , keyspace )
214
+ aggregator = CouchbaseAggregate (
215
+ self .client , keyspace , max_sample_size = self .sample_size
216
+ )
198
217
199
218
async for chunk in aggregator .get_documents ():
200
219
for document in chunk :
@@ -204,7 +223,18 @@ async def _collect_column_data(
204
223
for _field , data in flatten ([], document ):
205
224
column_values [_field ].append (data )
206
225
207
- for field_name , values in column_values .items ():
226
+ for n , (field_name , values ) in enumerate (column_values .items ()):
227
+ if 0 < self .sample_fields <= n :
228
+ dropped_fields .add (field_name )
229
+ continue
230
+
231
+ if (
232
+ not self .config .profiling .profile_nested_fields
233
+ and len (field_name .split ("." )) > 1
234
+ ):
235
+ dropped_nested_fields .add (field_name )
236
+ continue
237
+
208
238
if field_name not in profile_data .column_metrics :
209
239
profile_data .column_metrics [field_name ] = ColumnMetric ()
210
240
if not profile_data .column_count :
@@ -229,8 +259,23 @@ async def _collect_column_data(
229
259
else :
230
260
profile_data .column_metrics [field_name ].values .append (value )
231
261
262
+ if len (dropped_fields ) > 0 :
263
+ if self .config .profiling .report_dropped_profiles :
264
+ self .report .report_dropped (
265
+ f"The max_number_of_fields_to_profile={ self .sample_fields } reached. Dropped fields for { keyspace } ({ ', ' .join (sorted (dropped_fields ))} )"
266
+ )
267
+
268
+ if len (dropped_nested_fields ) > 0 :
269
+ if self .config .profiling .report_dropped_profiles :
270
+ self .report .report_dropped (
271
+ f"Dropped nested fields for { keyspace } ({ ', ' .join (sorted (dropped_nested_fields ))} )"
272
+ )
273
+
232
274
profile_data .row_count = document_total_count
233
275
276
+ return self ._add_field_statistics (profile_data )
277
+
278
+ def _add_field_statistics (self , profile_data : ProfileData ) -> ProfileData :
234
279
for field_name , column_metrics in profile_data .column_metrics .items ():
235
280
if column_metrics .values :
236
281
try :
@@ -277,7 +322,9 @@ def _compute_field_statistics(self, column_metrics: ColumnMetric) -> None:
277
322
]
278
323
279
324
if values and self .config .profiling .include_field_sample_values :
280
- column_metrics .sample_values = [str (v ) for v in values [:5 ]]
325
+ column_metrics .sample_values = [
326
+ str (v ) for v in values [: self .field_sample_count ]
327
+ ]
281
328
282
329
@staticmethod
283
330
def _is_numeric_type (data_type : Union [str , None ]) -> bool :
0 commit comments