From 64e26066b9bc4aedf26412bdc619d1293db3f020 Mon Sep 17 00:00:00 2001 From: Andrew Jones <andrewjones@grubhub.com> Date: Fri, 16 Feb 2024 11:58:06 -0500 Subject: [PATCH] fix(ingestion/redash): Limit size of RedashSourceReport Addresses the problem described in #9575. For large Redash deployments, the `filtered` and `timing` fields can grow to be very large. By using the LossyList and LossyDict data structures in a similar way to other SourceReports, we will limit the number of lines printed during an ingestion run, which makes the logs more usable and improves performance for ingestion against Redash deployments with a large number of queries filtered. --- metadata-ingestion/src/datahub/ingestion/source/redash.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py index 5b196782cbad22..f7b8bb09724a16 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redash.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py @@ -39,6 +39,7 @@ ChartTypeClass, DashboardInfoClass, ) +from datahub.utilities.lossy_collections import LossyDict, LossyList from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.sql_parser import SQLParser @@ -282,7 +283,7 @@ class RedashConfig(ConfigModel): @dataclass class RedashSourceReport(SourceReport): items_scanned: int = 0 - filtered: List[str] = field(default_factory=list) + filtered: LossyList[str] = field(default_factory=LossyList) queries_problem_parsing: Set[str] = field(default_factory=set) queries_no_dataset: Set[str] = field(default_factory=set) charts_no_input: Set[str] = field(default_factory=set) @@ -295,7 +296,7 @@ class RedashSourceReport(SourceReport): ) max_page_dashboards: Optional[int] = field(default=None) api_page_limit: Optional[float] = field(default=None) - timing: Dict[str, int] = field(default_factory=dict) + timing: LossyDict[str, int] = 
field(default_factory=LossyDict) def report_item_scanned(self) -> None: self.items_scanned += 1