Skip to content

Commit bb14433

Browse files
committed
feat(ingest): Couchbase source profiling updates
1 parent 8591db8 commit bb14433

File tree

4 files changed

+63
-8
lines changed

4 files changed

+63
-8
lines changed

metadata-ingestion/docs/sources/couchbase/couchbase_recipe.yml

+1
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ source:
1111
- type: datahub
1212
profiling:
1313
enabled: true
14+
profile_nested_fields: true
1415

1516
sink:
1617
# sink configs

metadata-ingestion/src/datahub/ingestion/source/couchbase/couchbase_common.py

+7-3
Original file line number | Diff line number | Diff line change
@@ -13,11 +13,14 @@
1313
ClassificationReportMixin,
1414
ClassificationSourceConfigMixin,
1515
)
16-
from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
16+
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
1717
from datahub.ingestion.source.state.stale_entity_removal_handler import (
1818
StaleEntityRemovalSourceReport,
1919
StatefulIngestionConfigBase,
2020
)
21+
from datahub.ingestion.source.state.stateful_ingestion_base import (
22+
StatefulProfilingConfigMixin,
23+
)
2124
from datahub.ingestion.source_config.operation_config import is_profiling_enabled
2225
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
2326
from datahub.utilities.perf_timer import PerfTimer
@@ -47,6 +50,7 @@ class CouchbaseDBConfig(
4750
EnvConfigMixin,
4851
StatefulIngestionConfigBase,
4952
ClassificationSourceConfigMixin,
53+
StatefulProfilingConfigMixin,
5054
):
5155
connect_string: str = Field(
5256
default="couchbases://127.0.0.1", description="Couchbase connect string."
@@ -82,8 +86,8 @@ class CouchbaseDBConfig(
8286
description="Regex patterns for tables to profile",
8387
)
8488

85-
profiling: GEProfilingBaseConfig = Field(
86-
default=GEProfilingBaseConfig(),
89+
profiling: GEProfilingConfig = Field(
90+
default=GEProfilingConfig(),
8791
description="Configuration for profiling",
8892
)
8993

metadata-ingestion/src/datahub/ingestion/source/couchbase/couchbase_profiling.py

+51-4
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,18 @@ def __init__(
6767
self.report = report
6868
self.client = client
6969

70+
if self.config.profiling.use_sampling:
71+
self.sample_size = self.config.profiling.sample_size
72+
else:
73+
self.sample_size = 0
74+
75+
self.field_sample_count = self.config.profiling.field_sample_values_limit
76+
77+
if self.config.profiling.max_number_of_fields_to_profile:
78+
self.sample_fields = self.config.profiling.max_number_of_fields_to_profile
79+
else:
80+
self.sample_fields = 0
81+
7082
try:
7183
self.loop = asyncio.get_running_loop()
7284
except RuntimeError:
@@ -95,7 +107,10 @@ def generate_profile(self, keyspace: str) -> Iterable[MetadataWorkUnit]:
95107
platform_instance=self.config.cluster_name,
96108
)
97109

98-
if not self.config.profile_pattern.allowed(keyspace):
110+
if (
111+
not self.config.profile_pattern.allowed(keyspace)
112+
and self.config.profiling.report_dropped_profiles
113+
):
99114
self.report.profiling_skipped_table_profile_pattern[keyspace] += 1
100115
logger.info(f"Profiling not allowed for Keyspace {keyspace}")
101116
return
@@ -193,8 +208,12 @@ async def _collect_column_data(
193208
self, keyspace: str, profile_data: ProfileData
194209
) -> ProfileData:
195210
document_total_count: int = 0
211+
dropped_fields = set()
212+
dropped_nested_fields = set()
196213

197-
aggregator = CouchbaseAggregate(self.client, keyspace)
214+
aggregator = CouchbaseAggregate(
215+
self.client, keyspace, max_sample_size=self.sample_size
216+
)
198217

199218
async for chunk in aggregator.get_documents():
200219
for document in chunk:
@@ -204,7 +223,18 @@ async def _collect_column_data(
204223
for _field, data in flatten([], document):
205224
column_values[_field].append(data)
206225

207-
for field_name, values in column_values.items():
226+
for n, (field_name, values) in enumerate(column_values.items()):
227+
if 0 < self.sample_fields <= n:
228+
dropped_fields.add(field_name)
229+
continue
230+
231+
if (
232+
not self.config.profiling.profile_nested_fields
233+
and len(field_name.split(".")) > 1
234+
):
235+
dropped_nested_fields.add(field_name)
236+
continue
237+
208238
if field_name not in profile_data.column_metrics:
209239
profile_data.column_metrics[field_name] = ColumnMetric()
210240
if not profile_data.column_count:
@@ -229,8 +259,23 @@ async def _collect_column_data(
229259
else:
230260
profile_data.column_metrics[field_name].values.append(value)
231261

262+
if len(dropped_fields) > 0:
263+
if self.config.profiling.report_dropped_profiles:
264+
self.report.report_dropped(
265+
f"The max_number_of_fields_to_profile={self.sample_fields} reached. Dropped fields for {keyspace} ({', '.join(sorted(dropped_fields))})"
266+
)
267+
268+
if len(dropped_nested_fields) > 0:
269+
if self.config.profiling.report_dropped_profiles:
270+
self.report.report_dropped(
271+
f"Dropped nested fields for {keyspace} ({', '.join(sorted(dropped_nested_fields))})"
272+
)
273+
232274
profile_data.row_count = document_total_count
233275

276+
return self._add_field_statistics(profile_data)
277+
278+
def _add_field_statistics(self, profile_data: ProfileData) -> ProfileData:
234279
for field_name, column_metrics in profile_data.column_metrics.items():
235280
if column_metrics.values:
236281
try:
@@ -277,7 +322,9 @@ def _compute_field_statistics(self, column_metrics: ColumnMetric) -> None:
277322
]
278323

279324
if values and self.config.profiling.include_field_sample_values:
280-
column_metrics.sample_values = [str(v) for v in values[:5]]
325+
column_metrics.sample_values = [
326+
str(v) for v in values[: self.field_sample_count]
327+
]
281328

282329
@staticmethod
283330
def _is_numeric_type(data_type: Union[str, None]) -> bool:

metadata-ingestion/tests/integration/couchbase/test_couchbase.py

+4-1
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,10 @@ def collection_wait():
110110
"username": "Administrator",
111111
"password": "password",
112112
"cluster_name": "testdb",
113-
"profiling": {"enabled": True},
113+
"profiling": {
114+
"enabled": True,
115+
"profile_nested_fields": True,
116+
},
114117
"classification": ClassificationConfig(
115118
enabled=True,
116119
classifiers=[

0 commit comments

Comments
 (0)