Skip to content

Commit 91222db

Browse files
Merge pull request #2088 from Giskard-AI/feature/gsk-4014-set-encoding-as-utf-8-for-all-open-statements
[GSK-4014] Set encoding as utf-8 for all open statements
2 parents c7c1afa + 79dada0 commit 91222db

18 files changed

+95
-40
lines changed

giskard/core/model_validation.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def validate_model_loading_and_saving(model: BaseModel):
147147
with tempfile.TemporaryDirectory(prefix="giskard-model-") as f:
148148
model.save(f)
149149

150-
with open(f + "/giskard-model-meta.yaml") as yaml_f:
150+
with open(f + "/giskard-model-meta.yaml", encoding="utf-8") as yaml_f:
151151
saved_meta = yaml.load(yaml_f, Loader=yaml.Loader)
152152

153153
meta = ModelMeta(

giskard/core/savable.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def _get_meta_endpoint(cls, uuid: str, project_key: str) -> str:
5454
return posixpath.join("project", project_key, cls._get_name(), uuid)
5555

5656
def _save_meta_locally(self, local_dir):
57-
with open(Path(local_dir) / "meta.yaml", "w") as f:
57+
with open(Path(local_dir) / "meta.yaml", "w", encoding="utf-8") as f:
5858
yaml.dump(self.meta, f)
5959

6060

@@ -70,7 +70,7 @@ def _load_meta_locally(cls, local_dir, uuid: str) -> Optional[SMT]:
7070
if meta is not None:
7171
return meta
7272

73-
with open(local_dir / "meta.yaml", "r") as f:
73+
with open(local_dir / "meta.yaml", "r", encoding="utf-8") as f:
7474
return yaml.load(f, Loader=yaml.Loader)
7575

7676
@classmethod

giskard/core/suite.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,8 @@ def to_json(self, filename=None):
162162
"metric_value": suite_result.result.metric,
163163
}
164164
if filename is not None:
165-
with open(filename, "w") as json_file:
166-
json.dump(results, json_file, indent=4)
165+
with open(filename, "w", encoding="utf-8") as json_file:
166+
json.dump(results, json_file, indent=4, ensure_ascii=False)
167167
else:
168168
return json.dumps(results, indent=4)
169169

@@ -628,8 +628,8 @@ def save(self, folder: str):
628628

629629
json_content = self._to_json(folder_path, saved_uuid_status)
630630

631-
with open(folder_path / "suite.json", "w") as f:
632-
json.dump(json_content, f)
631+
with open(folder_path / "suite.json", "w", encoding="utf-8") as f:
632+
json.dump(json_content, f, ensure_ascii=False)
633633

634634
analytics.track("lib:test_suite:saved")
635635

@@ -843,7 +843,7 @@ def _contains_test(self, test: TestFunctionMeta):
843843
def load(cls, folder: str) -> "Suite":
844844
folder_path = Path(folder)
845845

846-
with open(folder_path / "suite.json", "r") as f:
846+
with open(folder_path / "suite.json", "r", encoding="utf-8") as f:
847847
suite_json = json.load(f)
848848

849849
suite = Suite(name=suite_json.get("name", "Unnamed test suite"))

giskard/datasets/base/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -525,7 +525,7 @@ def cast_column_to_dtypes(df, column_dtypes):
525525
@classmethod
526526
def load(cls, local_path: str):
527527
# load metadata
528-
with open(Path(local_path) / "giskard-dataset-meta.yaml", "r") as meta_f:
528+
with open(Path(local_path) / "giskard-dataset-meta.yaml", "r", encoding="utf-8") as meta_f:
529529
meta = yaml.safe_load(meta_f)
530530

531531
# load data
@@ -560,7 +560,7 @@ def save(self, local_path: str):
560560
f.write(compressed_bytes)
561561
original_size_bytes, compressed_size_bytes = len(uncompressed_bytes), len(compressed_bytes)
562562

563-
with open(Path(local_path) / "giskard-dataset-meta.yaml", "w") as meta_f:
563+
with open(Path(local_path) / "giskard-dataset-meta.yaml", "w", encoding="utf-8") as meta_f:
564564
yaml.dump(
565565
{
566566
"id": str(self.id),

giskard/models/base/wrapper.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ def save_model_postprocessing_function(self, local_path: Union[str, Path], *_arg
230230
cloudpickle.dump(self.model_postprocessing_function, f, protocol=pickle.DEFAULT_PROTOCOL)
231231

232232
def save_wrapper_meta(self, local_path, *_args, **_kwargs):
233-
with open(Path(local_path) / "giskard-model-wrapper-meta.yaml", "w") as f:
233+
with open(Path(local_path) / "giskard-model-wrapper-meta.yaml", "w", encoding="utf-8") as f:
234234
yaml.dump(
235235
{
236236
"batch_size": self.batch_size,
@@ -313,7 +313,7 @@ def load_model_postprocessing_function(cls, local_path: Union[str, Path], *_args
313313
def load_wrapper_meta(cls, local_dir, *args, **kwargs):
314314
wrapper_meta_file = Path(local_dir) / "giskard-model-wrapper-meta.yaml"
315315
if wrapper_meta_file.exists():
316-
with open(wrapper_meta_file) as f:
316+
with open(wrapper_meta_file, encoding="utf-8") as f:
317317
wrapper_meta = yaml.load(f, Loader=yaml.Loader)
318318
wrapper_meta["batch_size"] = int(wrapper_meta["batch_size"]) if wrapper_meta["batch_size"] else None
319319
return wrapper_meta

giskard/models/huggingface.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ class explicitly using :class:`giskard.models.huggingface.HuggingFaceModel`.
9292
the `model_postprocessing_function` argument. This function should take the
9393
raw output of your model and return a numpy array of probabilities.
9494
"""
95+
9596
from typing import Any, Callable, Iterable, Optional, Tuple, Union
9697

9798
import logging
@@ -199,7 +200,7 @@ def __init__(
199200
def load_model(cls, local_path, model_py_ver: Optional[Tuple[str, str, str]] = None, *args, **kwargs):
200201
huggingface_meta_file = Path(local_path) / "giskard-model-huggingface-meta.yaml"
201202
if huggingface_meta_file.exists():
202-
with open(huggingface_meta_file) as f:
203+
with open(huggingface_meta_file, encoding="utf-8") as f:
203204
huggingface_meta = yaml.load(f, Loader=yaml.Loader)
204205

205206
if huggingface_meta["pipeline_task"]:
@@ -208,7 +209,7 @@ def load_model(cls, local_path, model_py_ver: Optional[Tuple[str, str, str]] = N
208209
return huggingface_meta["huggingface_module"].from_pretrained(local_path)
209210

210211
def save_huggingface_meta(self, local_path, *args, **kwargs):
211-
with open(Path(local_path) / "giskard-model-huggingface-meta.yaml", "w") as f:
212+
with open(Path(local_path) / "giskard-model-huggingface-meta.yaml", "w", encoding="utf-8") as f:
212213
yaml.dump(
213214
{
214215
"huggingface_module": self.huggingface_module,

giskard/models/pytorch.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ def _convert_to_numpy(self, raw_predictions):
200200
return super()._convert_to_numpy(raw_predictions)
201201

202202
def save_pytorch_meta(self, local_path, *_args, **_kwargs):
203-
with open(Path(local_path) / "giskard-model-pytorch-meta.yaml", "w") as f:
203+
with open(Path(local_path) / "giskard-model-pytorch-meta.yaml", "w", encoding="utf-8") as f:
204204
yaml.dump(
205205
{
206206
"device": self.device,
@@ -224,7 +224,7 @@ def load(cls, local_dir, model_py_ver: Optional[Tuple[str, str, str]] = None, *a
224224
def load_pytorch_meta(cls, local_dir):
225225
pytorch_meta_file = Path(local_dir) / "giskard-model-pytorch-meta.yaml"
226226
if pytorch_meta_file.exists():
227-
with open(pytorch_meta_file) as f:
227+
with open(pytorch_meta_file, encoding="utf-8") as f:
228228
pytorch_meta = yaml.load(f, Loader=yaml.Loader)
229229
pytorch_meta["device"] = pytorch_meta.get("device")
230230
pytorch_meta["torch_dtype"] = pytorch_meta.get("torch_dtype")

giskard/rag/report.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -193,11 +193,11 @@ def load(
193193
The embedding model to use inside the knowledge base. If not provided, the default model will be used.
194194
"""
195195
path = Path(folder_path)
196-
knowledge_base_meta = json.load(open(path / "knowledge_base_meta.json", "r"))
196+
knowledge_base_meta = json.load(open(path / "knowledge_base_meta.json", "r", encoding="utf-8"))
197197
knowledge_base_data = pd.read_json(path / "knowledge_base.jsonl", orient="records", lines=True)
198198
testset = QATestset.load(path / "testset.jsonl")
199199

200-
answers = json.load(open(path / "agent_answer.json", "r"))
200+
answers = json.load(open(path / "agent_answer.json", "r", encoding="utf-8"))
201201
model_outputs = [AgentAnswer(**answer) for answer in answers]
202202

203203
topics = {int(k): topic for k, topic in knowledge_base_meta.pop("topics", None).items()}
@@ -219,9 +219,9 @@ def load(
219219

220220
metrics_results = {}
221221
if (path / "metrics_results.json").exists():
222-
metrics_results = json.load(open(path / "metrics_results.json", "r"))
222+
metrics_results = json.load(open(path / "metrics_results.json", "r", encoding="utf-8"))
223223

224-
report_details = json.load(open(path / "report_details.json", "r"))
224+
report_details = json.load(open(path / "report_details.json", "r", encoding="utf-8"))
225225
testset._dataframe.index = testset._dataframe.index.astype(str)
226226

227227
report = cls(testset, model_outputs, metrics_results, knowledge_base)

giskard/registry/giskard_test.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def _load_meta_locally(cls, local_dir, uuid: str) -> Optional[TestFunctionMeta]:
7373
if meta is not None:
7474
return meta
7575

76-
with open(local_dir / "meta.yaml", "r") as f:
76+
with open(local_dir / "meta.yaml", "r", encoding="utf-8") as f:
7777
return yaml.load(f, Loader=yaml.Loader)
7878

7979
@classmethod

giskard/scanner/report.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ def to_json(self, filename=None):
9090
results[issue.detector_name][issue.level] = []
9191
results[issue.detector_name][issue.level].append(issue.description)
9292
if filename is not None:
93-
with open(filename, "w") as json_file:
94-
json.dump(results, json_file, indent=4)
93+
with open(filename, "w", encoding="utf-8") as json_file:
94+
json.dump(results, json_file, indent=4, ensure_ascii=False)
9595
else:
9696
return json.dumps(results, indent=4)
9797

@@ -115,7 +115,7 @@ def to_html(self, filename=None, embed=False):
115115
html = widget.render_html(embed=embed)
116116

117117
if filename is not None:
118-
with open(filename, "w") as f:
118+
with open(filename, "w", encoding="utf-8") as f:
119119
f.write(html)
120120
return
121121

@@ -139,7 +139,7 @@ def to_markdown(self, filename=None, template="summary"):
139139
markdown = widget.render_markdown(template=template)
140140

141141
if filename is not None:
142-
with open(filename, "w") as f:
142+
with open(filename, "w", encoding="utf-8") as f:
143143
f.write(markdown)
144144
return
145145

@@ -349,7 +349,7 @@ def to_avid(self, filename=None):
349349
]
350350

351351
if filename is not None:
352-
with open(filename, "w") as f, warnings.catch_warnings():
352+
with open(filename, "w", encoding="utf-8") as f, warnings.catch_warnings():
353353
warnings.filterwarnings("ignore", category=DeprecationWarning) # we need to support both pydantic 1 & 2
354354
f.writelines(r.json() + "\n" for r in reports)
355355
return
@@ -373,7 +373,7 @@ def generate_rails(self, filename=None, colang_version="1.0"):
373373
_rails = generate_rails_from_scan_report(self, colang_version=colang_version)
374374

375375
if filename:
376-
with open(filename, "a") as f:
376+
with open(filename, "a", encoding="utf-8") as f:
377377
f.write(_rails)
378378
return
379379

giskard/scanner/robustness/text_transformations.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ class TextNationalityTransformation(TextLanguageBasedTransformation):
341341
name = "Switch countries from high- to low-income and vice versa"
342342

343343
def _load_dictionaries(self):
344-
with Path(__file__).parent.joinpath("nationalities.json").open("r") as f:
344+
with Path(__file__).parent.joinpath("nationalities.json").open("r", encoding="utf-8") as f:
345345
nationalities_dict = json.load(f)
346346
self._lang_dictionary = {"en": nationalities_dict["en"], "fr": nationalities_dict["fr"]}
347347

giskard/visualization/widget.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,11 @@ def render_html(self, template="full", embed=False) -> str:
9898
escaped = escape(html)
9999
uid = id(self)
100100

101-
with Path(__file__).parent.joinpath("templates", "scan_report", "html", "static", "external.js").open(
102-
"r"
103-
) as f:
101+
with (
102+
Path(__file__)
103+
.parent.joinpath("templates", "scan_report", "html", "static", "external.js")
104+
.open("r", encoding="utf-8") as f
105+
):
104106
js_lib = f.read()
105107

106108
html = f"""<iframe id="scan-{uid}" srcdoc="{escaped}" style="width: 100%; border: none;" class="gsk-scan"></iframe>

tests/fixtures/enron_multilabel_classification.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656

5757
# get_labels returns a dictionary representation of these labels.
5858
def get_labels(filename):
59-
with open(filename + ".cats") as f:
59+
with open(filename + ".cats", encoding="utf-8") as f:
6060
labels = defaultdict(dict)
6161
line = f.readline()
6262
while line:
@@ -99,7 +99,7 @@ def enron_raw_data_full() -> pd.DataFrame:
9999

100100
# Features are metadata from the email object
101101
filename = email_file + ".txt"
102-
with open(filename) as f:
102+
with open(filename, encoding="utf-8") as f:
103103
message = email.message_from_string(f.read())
104104

105105
values_to_add["Subject"] = str(message["Subject"])

tests/integrations/test_avid.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def test_scan_report_can_be_exported_to_avid():
7676
dest_path = Path(tmpdir).joinpath("test_report.avid")
7777
report.to_avid(dest_path)
7878

79-
with dest_path.open("r") as f:
79+
with dest_path.open("r", encoding="utf-8") as f:
8080
avid_reports_read = [json.loads(line) for line in f.readlines()]
8181

8282
assert len(avid_reports_read) == len(avid_reports)

tests/integrations/test_nemoguardrails.py

+22-4
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,39 @@
11
import json
2+
import tempfile
3+
from pathlib import Path
24
from unittest.mock import Mock, patch
35

46
import pandas as pd
7+
import pytest
58
from nemoguardrails.colang import parse_colang_file
69

710
from giskard.llm.client.base import ChatMessage
811
from giskard.scanner.issues import Issue, Robustness
912
from giskard.scanner.report import ScanReport
1013

1114

15+
def _generate_rails(report: ScanReport, filename=None, colang_version="1.0"):
16+
if filename:
17+
with tempfile.TemporaryDirectory() as tmpdir:
18+
dest = Path(tmpdir).joinpath("rails.co")
19+
report.generate_rails(filename=dest, colang_version=colang_version)
20+
assert dest.exists()
21+
assert dest.is_file()
22+
rails = dest.read_text(encoding="utf-8")
23+
else:
24+
rails = report.generate_rails(colang_version=colang_version)
25+
return rails
26+
27+
28+
@pytest.mark.parametrize("filename", [(None), ("rails.co")])
1229
@patch("giskard.integrations.nemoguardrails.get_default_client")
13-
def test_generate_colang_v1_rails_from_scan(get_default_client_mock):
30+
def test_generate_colang_v1_rails_from_scan(get_default_client_mock, filename):
1431
report = make_test_report()
1532

1633
llm_client = get_default_client_mock()
1734
llm_client.complete.side_effect = make_llm_answers()
1835

19-
rails = report.generate_rails()
36+
rails = _generate_rails(report, filename=filename, colang_version="1.0")
2037

2138
# Check that the file is correctly formatted
2239
parsed = parse_colang_file("rails.co", rails, version="1.0")
@@ -27,14 +44,15 @@ def test_generate_colang_v1_rails_from_scan(get_default_client_mock):
2744
assert parsed["flows"][1]["id"] == "ask help on illegal activities"
2845

2946

47+
@pytest.mark.parametrize("filename", [(None), ("rails.co")])
3048
@patch("giskard.integrations.nemoguardrails.get_default_client")
31-
def test_generate_colang_v2_rails_from_scan(get_default_client_mock):
49+
def test_generate_colang_v2_rails_from_scan(get_default_client_mock, filename):
3250
report = make_test_report()
3351

3452
llm_client = get_default_client_mock()
3553
llm_client.complete.side_effect = make_llm_answers()
3654

37-
rails = report.generate_rails(colang_version="2.x")
55+
rails = _generate_rails(report, filename=filename, colang_version="2.x")
3856

3957
# Check that the file is correctly formatted
4058
parsed = parse_colang_file("rails.co", rails, version="2.x")

tests/registry/module_utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class PythonModule:
2323

2424
def _write_file(dir: Path, file: Union[str, Path], content: str):
2525
os.makedirs(os.path.dirname(dir / file), exist_ok=True)
26-
with open(dir / file, "w") as f:
26+
with open(dir / file, "w", encoding="utf-8") as f:
2727
f.write(content)
2828

2929

tests/scan/test_scan_report.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def test_scan_report_exports_to_markdown():
5050

5151
assert dest.exists()
5252
assert dest.is_file()
53-
assert dest.read_text() == markdown
53+
assert dest.read_text(encoding="utf-8") == markdown
5454

5555

5656
def test_scan_report_to_json():

tests/scan/test_scanner.py

+34
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import re
22
import sys
3+
import tempfile
34
import warnings
5+
from pathlib import Path
46
from unittest import mock
57

68
import numpy as np
@@ -279,3 +281,35 @@ def test_min_slice_size(titanic_model, titanic_dataset):
279281
detector = SpuriousCorrelationDetector(min_slice_size=2000)
280282
issues = detector.run(titanic_model, titanic_dataset, features=titanic_model.feature_names)
281283
assert len(issues) == 0
284+
285+
286+
@pytest.mark.parametrize(
287+
"filename",
288+
[(None), ("scan_test_suite_results.json")],
289+
)
290+
@pytest.mark.slow
291+
def test_export_scan_test_suite_results_to_json(filename, request):
292+
DATASET_NAME = "diabetes_dataset_with_target"
293+
MODEL_NAME = "linear_regression_diabetes"
294+
295+
dataset = request.getfixturevalue(DATASET_NAME)
296+
model = request.getfixturevalue(MODEL_NAME)
297+
298+
scanner = Scanner()
299+
scan_results = scanner.analyze(model, dataset)
300+
test_suite_results = scan_results.generate_test_suite().run()
301+
302+
if filename:
303+
with tempfile.TemporaryDirectory() as tmpdir:
304+
dest = Path(tmpdir).joinpath(filename)
305+
test_suite_results.to_json(dest)
306+
assert dest.exists()
307+
assert dest.is_file()
308+
test_results_json = dest.read_text(encoding="utf-8")
309+
310+
else:
311+
test_results_json = test_suite_results.to_json()
312+
assert test_results_json is not None
313+
314+
assert test_results_json.startswith("{")
315+
assert test_results_json.strip().endswith("}")

0 commit comments

Comments
 (0)