JSON report generation from scan results (#1929)

bmalezieux · Benoît Malézieux · Benoît Malézieux · web-flow · commit 5987d270d7e6 · 2024-06-10T15:54:44.000+02:00
* ENH: adding json report generation from scan results and test suite results

* TEST: add test for json report

* UPD: add issue level to json report

* FIX: detector name now part of Issue properties

* FIX: minor fixes after review

* FIX: bug in test for json report

* FIX: pre-commit

* FIX: clarify loop in json generation + add test on detector in report generation + change output format

* FIX: pre-commit

---------

Co-authored-by: Benoît Malézieux <benoit@mbp-benoit.home>
Co-authored-by: Benoît Malézieux <benoit@giskard.ai>
Co-authored-by: Kevin Messiaen <114553769+kevinmessiaen@users.noreply.github.com>
Co-authored-by: Rabah Khalek <rabah.khalek@gmail.com>
diff --git a/giskard/core/suite.py b/giskard/core/suite.py
@@ -3,6 +3,7 @@
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 import inspect
+import json
 import logging
 import traceback
 import warnings
@@ -219,6 +220,19 @@ def _to_dto(self, label: Optional[str], client: GiskardClient, project_key: str)
             completionDate=self.completion_date.isoformat(),
         )
 
+    def to_json(self, filename=None):
+        """Serialize the suite results as a JSON document.
+
+        Each entry of ``self.results`` is stored under its test name with a
+        "Passed"/"Failed" status and the metric value of its result.
+
+        Parameters
+        ----------
+        filename : Optional[str]
+            If provided, the JSON is written to this file and nothing is returned.
+
+        Returns
+        -------
+        Optional[str]
+            The JSON string when ``filename`` is None, otherwise None.
+        """
+        summary = {
+            entry.test_name: {
+                "result": "Passed" if entry.result.passed else "Failed",
+                "metric_value": entry.result.metric,
+            }
+            for entry in self.results
+        }
+        if filename is None:
+            return json.dumps(summary, indent=4)
+        with open(filename, "w") as json_file:
+            json.dump(summary, json_file, indent=4)
+
     def to_mlflow(self, mlflow_client: MlflowClient = None, mlflow_run_id: str = None):
         import mlflow
 
diff --git a/giskard/scanner/calibration/overconfidence_detector.py b/giskard/scanner/calibration/overconfidence_detector.py
@@ -105,6 +105,7 @@ def _find_issues(
                     tests=_generate_overconfidence_tests,
                     importance=relative_delta,
                     taxonomy=["avid-effect:performance:P0204"],
+                    detector_name=self.__class__.__name__,
                 )
 
                 # Add examples
diff --git a/giskard/scanner/calibration/underconfidence_detector.py b/giskard/scanner/calibration/underconfidence_detector.py
@@ -105,6 +105,7 @@ def _find_issues(
                     importance=relative_delta,
                     tests=_generate_underconfidence_tests,
                     taxonomy=["avid-effect:performance:P0204"],
+                    detector_name=self.__class__.__name__,
                 )
 
                 # Add examples
diff --git a/giskard/scanner/correlation/spurious_correlation_detector.py b/giskard/scanner/correlation/spurious_correlation_detector.py
@@ -96,6 +96,7 @@ def run(self, model: BaseModel, dataset: Dataset, features: Sequence[str]):
                         importance=metric_value,
                         tests=_generate_spurious_corr_tests,
                         taxonomy=["avid-effect:performance:P0103"],
+                        detector_name=self.__class__.__name__,
                     )
 
                     extractor = ExampleExtractor(issue)
diff --git a/giskard/scanner/data_leakage/data_leakage_detector.py b/giskard/scanner/data_leakage/data_leakage_detector.py
@@ -54,6 +54,7 @@ def slice_fn(df):
                     examples=fail_samples,
                     meta={"domain": "Whole dataset"},
                     taxonomy=["avid-effect:performance:P0103"],
+                    detector_name=self.__class__.__name__,
                 )
             ]
 
diff --git a/giskard/scanner/issues.py b/giskard/scanner/issues.py
@@ -123,6 +123,7 @@ def __init__(
         taxonomy: List[str] = None,
         scan_examples: Optional[ScanExamples] = None,
         display_footer_info: bool = True,
+        detector_name: str = None,
     ):
         """Issue represents a single model vulnerability detected by Giskard.
 
@@ -179,6 +180,7 @@ def __init__(
         self.scan_examples = DataFrameScanExamples() if scan_examples is None else scan_examples
         if examples is not None:
             self.scan_examples.extend(examples)
+        self._detector_name = detector_name
 
     def __repr__(self):
         return f"<{self.__class__.__name__} group='{self.group.name}' level='{self.level}'>"
@@ -218,6 +220,13 @@ def description(self):
             **self.meta,
         )
 
+    @property
+    def detector_name(self):
+        """Name of the detector that produced this issue (None if it was never set)."""
+        return self._detector_name
+
+    @detector_name.setter
+    def detector_name(self, detector_name):
+        self._detector_name = detector_name
+
+    def set_detector_name(self, detector_name):
+        """Record the name of the detector that produced this issue.
+
+        Kept for backward compatibility; equivalent to assigning ``detector_name``.
+        """
+        self.detector_name = detector_name
+
     def examples(self, n=3) -> Any:
         return self.scan_examples.head(n)
 
diff --git a/giskard/scanner/llm/base.py b/giskard/scanner/llm/base.py
@@ -117,6 +117,7 @@ def make_issue(self, model: BaseModel, dataset: Dataset, requirement: str, eval_
             },
             tests=_generate_output_requirement_tests,
             taxonomy=self._taxonomy,
+            detector_name=self.__class__.__name__,
         )
 
 
diff --git a/giskard/scanner/llm/llm_basic_sycophancy_detector.py b/giskard/scanner/llm/llm_basic_sycophancy_detector.py
@@ -126,6 +126,7 @@ def run(self, model: BaseModel, dataset: Dataset, features=None) -> Sequence[Iss
                     },
                     tests=_generate_sycophancy_tests,
                     taxonomy=["avid-effect:ethics:E0402"],
+                    detector_name=self.__class__.__name__,
                 )
             ]
 
diff --git a/giskard/scanner/llm/llm_chars_injection_detector.py b/giskard/scanner/llm/llm_chars_injection_detector.py
@@ -137,6 +137,7 @@ def run(self, model: BaseModel, dataset: Dataset, features: Sequence[str]) -> Se
                 examples=examples,
                 tests=_generate_char_injection_tests,
                 taxonomy=["avid-effect:performance:P0201", "avid-effect:security:S0403"],
+                detector_name=self.__class__.__name__,
             )
 
             issues.append(issue)
diff --git a/giskard/scanner/llm/llm_implausible_output_detector.py b/giskard/scanner/llm/llm_implausible_output_detector.py
@@ -100,6 +100,7 @@ def run(self, model: BaseModel, dataset: Dataset, features=None) -> Sequence[Iss
                     examples=examples,
                     tests=_generate_implausible_output_tests,
                     taxonomy=["avid-effect:performance:P0204"],
+                    detector_name=self.__class__.__name__,
                 )
             ]
 
diff --git a/giskard/scanner/llm/llm_prompt_injection_detector.py b/giskard/scanner/llm/llm_prompt_injection_detector.py
@@ -106,6 +106,7 @@ def run(self, model: BaseModel, dataset: Dataset, features: Sequence[str]) -> Se
                     examples=examples,
                     tests=_generate_prompt_injection_tests,
                     taxonomy=["avid-effect:security:S0403"],
+                    detector_name=self.__class__.__name__,
                 )
             )
         return issues
diff --git a/giskard/scanner/performance/performance_bias_detector.py b/giskard/scanner/performance/performance_bias_detector.py
@@ -241,6 +241,7 @@ def _detect_for_metric(
                     importance=-relative_delta if metric.greater_is_better else relative_delta,
                     tests=_generate_performance_tests,
                     taxonomy=["avid-effect:performance:P0204"],
+                    detector_name="PerformanceBiasDetector",
                 )
 
                 # Add failure examples
diff --git a/giskard/scanner/report.py b/giskard/scanner/report.py
@@ -2,6 +2,7 @@
 
 from typing import TYPE_CHECKING, Optional
 
+import json
 import random
 import string
 import tempfile
@@ -18,7 +19,7 @@
 
 
 class ScanReport:
-    def __init__(self, issues, model=None, dataset=None, as_html: bool = True):
+    def __init__(self, issues, model=None, dataset=None, detectors_names=None, as_html: bool = True):
         """The scan report contains the results of the scan.
 
         Note that this object is not meant to be instantiated directly. Instead, it is returned by the
@@ -32,13 +33,16 @@ def __init__(self, issues, model=None, dataset=None, as_html: bool = True):
             A Giskard model object.
         dataset : Dataset
             A Giskard dataset object.
+        detectors_names : list
+            A list of names corresponding to the detectors used
         as_html : bool
             Whether to render the report widget as HTML.
         """
         self.issues = issues
         self.as_html = as_html
         self.model = model
         self.dataset = dataset
+        self.detectors_names = detectors_names
 
     def has_issues(self):
         return len(self.issues) > 0
@@ -67,6 +71,30 @@ def _repr_html_(self):
     def _repr_markdown_(self):
         return self.to_markdown()
 
+    def to_json(self, filename=None):
+        """Renders the scan report as json
+
+        Issue descriptions are grouped by detector name, then by issue level.
+        Every detector listed in ``self.detectors_names`` gets an entry, even
+        when it raised no issue.
+
+        Parameters
+        ----------
+        filename : Optional[str]
+            If provided, the json will be written to the file.
+
+        Returns
+        -------
+        Optional[str]
+            The json string when ``filename`` is None, otherwise None.
+        """
+        # A report built without detector names is rendered as an empty JSON object instead of
+        # returning a raw dict, so the return type and the file output stay consistent.
+        results = {detector_name: {} for detector_name in self.detectors_names or []}
+        for issue in self.issues:
+            # Issues whose detector is not listed in the report are left out.
+            if issue.detector_name in results:
+                results[issue.detector_name].setdefault(issue.level, []).append(issue.description)
+        if filename is not None:
+            with open(filename, "w") as json_file:
+                json.dump(results, json_file, indent=4)
+        else:
+            return json.dumps(results, indent=4)
+
     def to_html(self, filename=None, embed=False):
         """Renders the scan report as HTML.
 
diff --git a/giskard/scanner/robustness/base_detector.py b/giskard/scanner/robustness/base_detector.py
@@ -192,6 +192,7 @@ def _detect_issues(
                     importance=fail_rate,
                     tests=_generate_robustness_tests,
                     taxonomy=self._taxonomy,
+                    detector_name=self.__class__.__name__,
                 )
 
                 # Add examples
diff --git a/giskard/scanner/scanner.py b/giskard/scanner/scanner.py
@@ -113,6 +113,7 @@ def analyze(
 
             # Collect the detectors
             detectors = self.get_detectors(tags=[model.meta.model_type.value])
+            detectors_names = [detector.__class__.__name__ for detector in detectors]
 
             # Print cost estimate
             if verbose:
@@ -135,7 +136,7 @@ def analyze(
 
             self._collect_analytics(model, dataset, issues, elapsed, model_validation_time, detectors)
 
-        return ScanReport(issues, model=model, dataset=dataset)
+        return ScanReport(issues, model=model, dataset=dataset, detectors_names=detectors_names)
 
     def _run_detectors(self, detectors, model, dataset, features, verbose=True, raise_exceptions=False):
         if not detectors:
diff --git a/giskard/scanner/stochasticity/stochasticity_detector.py b/giskard/scanner/stochasticity/stochasticity_detector.py
@@ -55,5 +55,6 @@ def run(self, model: BaseModel, dataset: Dataset, features=None):
                 },
                 examples=fail_samples,
                 taxonomy=["avid-effect:performance:P0201"],
+                detector_name=self.__class__.__name__,
             )
         ]
diff --git a/tests/scan/test_scan_report.py b/tests/scan/test_scan_report.py
@@ -51,3 +51,29 @@ def test_scan_report_exports_to_markdown():
         assert dest.exists()
         assert dest.is_file()
         assert dest.read_text() == markdown
+
+
+def test_scan_report_to_json():
+    """The JSON export names the detector and writes the same content to a file."""
+    detector = "RobustnessDetector"
+    issue = Issue(Mock(), Mock(), Robustness, IssueLevel.MAJOR, detector_name=detector)
+    report = ScanReport(issues=[issue], detectors_names=[detector])
+
+    # In-memory export
+    json_report = report.to_json()
+
+    assert json_report is not None
+    assert isinstance(json_report, str)
+    assert detector in json_report
+
+    # The export written to disk must match the in-memory rendering
+    with tempfile.TemporaryDirectory() as tmpdir:
+        dest = Path(tmpdir) / "report.json"
+        report.to_json(dest)
+
+        assert dest.exists()
+        assert dest.is_file()
+        assert dest.read_text() == json_report

Original file line number	Diff line number	Diff line change
`@@ -105,6 +105,7 @@ def _find_issues(`
`105`	`105`	`tests=_generate_overconfidence_tests,`
`106`	`106`	`importance=relative_delta,`
`107`	`107`	`taxonomy=["avid-effect:performance:P0204"],`
	`108`	`+ detector_name=self.__class__.__name__,`
`108`	`109`	`)`
`109`	`110`
`110`	`111`	`# Add examples`
Original file line number	Diff line number	Diff line change
`@@ -96,6 +96,7 @@ def run(self, model: BaseModel, dataset: Dataset, features: Sequence[str]):`
`96`	`96`	`importance=metric_value,`
`97`	`97`	`tests=_generate_spurious_corr_tests,`
`98`	`98`	`taxonomy=["avid-effect:performance:P0103"],`
	`99`	`+ detector_name=self.__class__.__name__,`
`99`	`100`	`)`
`100`	`101`
`101`	`102`	`extractor = ExampleExtractor(issue)`
Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,7 @@ def slice_fn(df):`
`54`	`54`	`examples=fail_samples,`
`55`	`55`	`meta={"domain": "Whole dataset"},`
`56`	`56`	`taxonomy=["avid-effect:performance:P0103"],`
	`57`	`+ detector_name=self.__class__.__name__,`
`57`	`58`	`)`
`58`	`59`	`]`
`59`	`60`
Original file line number	Diff line number	Diff line change
`@@ -117,6 +117,7 @@ def make_issue(self, model: BaseModel, dataset: Dataset, requirement: str, eval_`
`117`	`117`	`},`
`118`	`118`	`tests=_generate_output_requirement_tests,`
`119`	`119`	`taxonomy=self._taxonomy,`
	`120`	`+ detector_name=self.__class__.__name__,`
`120`	`121`	`)`
`121`	`122`
`122`	`123`
Original file line number	Diff line number	Diff line change
`@@ -126,6 +126,7 @@ def run(self, model: BaseModel, dataset: Dataset, features=None) -> Sequence[Iss`
`126`	`126`	`},`
`127`	`127`	`tests=_generate_sycophancy_tests,`
`128`	`128`	`taxonomy=["avid-effect:ethics:E0402"],`
	`129`	`+ detector_name=self.__class__.__name__,`
`129`	`130`	`)`
`130`	`131`	`]`
`131`	`132`
Original file line number	Diff line number	Diff line change
`@@ -137,6 +137,7 @@ def run(self, model: BaseModel, dataset: Dataset, features: Sequence[str]) -> Se`
`137`	`137`	`examples=examples,`
`138`	`138`	`tests=_generate_char_injection_tests,`
`139`	`139`	`taxonomy=["avid-effect:performance:P0201", "avid-effect:security:S0403"],`
	`140`	`+ detector_name=self.__class__.__name__,`
`140`	`141`	`)`
`141`	`142`
`142`	`143`	`issues.append(issue)`
Original file line number	Diff line number	Diff line change
`@@ -100,6 +100,7 @@ def run(self, model: BaseModel, dataset: Dataset, features=None) -> Sequence[Iss`
`100`	`100`	`examples=examples,`
`101`	`101`	`tests=_generate_implausible_output_tests,`
`102`	`102`	`taxonomy=["avid-effect:performance:P0204"],`
	`103`	`+ detector_name=self.__class__.__name__,`
`103`	`104`	`)`
`104`	`105`	`]`
`105`	`106`
Original file line number	Diff line number	Diff line change
`@@ -106,6 +106,7 @@ def run(self, model: BaseModel, dataset: Dataset, features: Sequence[str]) -> Se`
`106`	`106`	`examples=examples,`
`107`	`107`	`tests=_generate_prompt_injection_tests,`
`108`	`108`	`taxonomy=["avid-effect:security:S0403"],`
	`109`	`+ detector_name=self.__class__.__name__,`
`109`	`110`	`)`
`110`	`111`	`)`
`111`	`112`	`return issues`
Original file line number	Diff line number	Diff line change
`@@ -241,6 +241,7 @@ def _detect_for_metric(`
`241`	`241`	`importance=-relative_delta if metric.greater_is_better else relative_delta,`
`242`	`242`	`tests=_generate_performance_tests,`
`243`	`243`	`taxonomy=["avid-effect:performance:P0204"],`
	`244`	`+ detector_name="PerformanceBiasDetector",`
`244`	`245`	`)`
`245`	`246`
`246`	`247`	`# Add failure examples`
Original file line number	Diff line number	Diff line change
`@@ -192,6 +192,7 @@ def _detect_issues(`
`192`	`192`	`importance=fail_rate,`
`193`	`193`	`tests=_generate_robustness_tests,`
`194`	`194`	`taxonomy=self._taxonomy,`
	`195`	`+ detector_name=self.__class__.__name__,`
`195`	`196`	`)`
`196`	`197`
`197`	`198`	`# Add examples`