From 64e26066b9bc4aedf26412bdc619d1293db3f020 Mon Sep 17 00:00:00 2001
From: Andrew Jones <andrewjones@grubhub.com>
Date: Fri, 16 Feb 2024 11:58:06 -0500
Subject: [PATCH] fix(ingestion/redash): Limit size of RedashSourceReport

Addressing the problem described in #9575

For large Redash deployments, the `filtered` and `timing` fields can grow very
large. By using the LossyList and LossyDict data structures, as other SourceReports
already do, we limit the number of lines printed during an ingestion run. This makes
the logs more usable and improves performance when ingesting from Redash deployments
where a large number of queries are filtered.
---
 metadata-ingestion/src/datahub/ingestion/source/redash.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py
index 5b196782cbad22..f7b8bb09724a16 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redash.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py
@@ -39,6 +39,7 @@
     ChartTypeClass,
     DashboardInfoClass,
 )
+from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.sql_parser import SQLParser
 
@@ -282,7 +283,7 @@ class RedashConfig(ConfigModel):
 @dataclass
 class RedashSourceReport(SourceReport):
     items_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
     queries_problem_parsing: Set[str] = field(default_factory=set)
     queries_no_dataset: Set[str] = field(default_factory=set)
     charts_no_input: Set[str] = field(default_factory=set)
@@ -295,7 +296,7 @@ class RedashSourceReport(SourceReport):
     )
     max_page_dashboards: Optional[int] = field(default=None)
     api_page_limit: Optional[float] = field(default=None)
-    timing: Dict[str, int] = field(default_factory=dict)
+    timing: LossyDict[str, int] = field(default_factory=LossyDict)
 
     def report_item_scanned(self) -> None:
         self.items_scanned += 1