Skip to content

Commit 2ae7287

Browse files
authored
feat: Add lossy decoding to read_csv for non-utf8 encodings (#21433)
1 parent 7332717 commit 2ae7287

File tree

3 files changed

+56
-7
lines changed

3 files changed

+56
-7
lines changed

py-polars/polars/io/_utils.py

+25-6
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,9 @@ def prepare_file_arg(
131131
132132
When `encoding` is not `utf8` or `utf8-lossy`, the whole file is
133133
first read in Python and decoded using the specified encoding and
134-
returned as a `BytesIO` (for usage with `read_csv`).
134+
returned as a `BytesIO` (for usage with `read_csv`). If encoding
135+
ends with "-lossy", characters that can't be decoded are replaced
136+
with `�`.
135137
136138
A `bytes` file is returned as a `BytesIO` if `use_pyarrow=True`.
137139
@@ -157,14 +159,19 @@ def managed_file(file: Any) -> Iterator[Any]:
157159
encoding in {"utf8", "utf8-lossy"} if encoding else True
158160
)
159161
encoding_str = encoding if encoding else "utf8"
162+
encoding_str, encoding_errors = (
163+
(encoding_str[:-6], "replace")
164+
if encoding_str.endswith("-lossy")
165+
else (encoding_str, "strict")
166+
)
160167

161168
# PyArrow allows directories, so we only check that something is not
162169
# a dir if we are not using PyArrow
163170
check_not_dir = not use_pyarrow
164171

165172
if isinstance(file, bytes):
166173
if not has_utf8_utf8_lossy_encoding:
167-
file = file.decode(encoding_str).encode("utf8")
174+
file = file.decode(encoding_str, errors=encoding_errors).encode("utf8")
168175
return _check_empty(
169176
BytesIO(file), context="bytes", raise_if_empty=raise_if_empty
170177
)
@@ -180,7 +187,11 @@ def managed_file(file: Any) -> Iterator[Any]:
180187
if isinstance(file, BytesIO):
181188
if not has_utf8_utf8_lossy_encoding:
182189
return _check_empty(
183-
BytesIO(file.read().decode(encoding_str).encode("utf8")),
190+
BytesIO(
191+
file.read()
192+
.decode(encoding_str, errors=encoding_errors)
193+
.encode("utf8")
194+
),
184195
context="BytesIO",
185196
read_position=file.tell(),
186197
raise_if_empty=raise_if_empty,
@@ -197,7 +208,11 @@ def managed_file(file: Any) -> Iterator[Any]:
197208
if isinstance(file, Path):
198209
if not has_utf8_utf8_lossy_encoding:
199210
return _check_empty(
200-
BytesIO(file.read_bytes().decode(encoding_str).encode("utf8")),
211+
BytesIO(
212+
file.read_bytes()
213+
.decode(encoding_str, errors=encoding_errors)
214+
.encode("utf8")
215+
),
201216
context=f"Path ({file!r})",
202217
raise_if_empty=raise_if_empty,
203218
)
@@ -220,13 +235,16 @@ def managed_file(file: Any) -> Iterator[Any]:
220235
normalize_filepath(file, check_not_directory=check_not_dir)
221236
)
222237
# decode first
223-
with Path(file).open(encoding=encoding_str) as f:
238+
with Path(file).open(
239+
encoding=encoding_str, errors=encoding_errors
240+
) as f:
224241
return _check_empty(
225242
BytesIO(f.read().encode("utf8")),
226243
context=f"{file!r}",
227244
raise_if_empty=raise_if_empty,
228245
)
229246
storage_options["encoding"] = encoding
247+
storage_options["errors"] = encoding_errors
230248
return fsspec.open(file, **storage_options)
231249

232250
if isinstance(file, list) and bool(file) and all(isinstance(f, str) for f in file):
@@ -242,12 +260,13 @@ def managed_file(file: Any) -> Iterator[Any]:
242260
]
243261
)
244262
storage_options["encoding"] = encoding
263+
storage_options["errors"] = encoding_errors
245264
return fsspec.open_files(file, **storage_options)
246265

247266
if isinstance(file, str):
248267
file = normalize_filepath(file, check_not_directory=check_not_dir)
249268
if not has_utf8_utf8_lossy_encoding:
250-
with Path(file).open(encoding=encoding_str) as f:
269+
with Path(file).open(encoding=encoding_str, errors=encoding_errors) as f:
251270
return _check_empty(
252271
BytesIO(f.read().encode("utf8")),
253272
context=f"{file!r}",

py-polars/polars/io/csv/functions.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def read_csv(
171171
Stop reading from CSV file after reading `n_rows`.
172172
During multi-threaded parsing, an upper bound of `n_rows`
173173
rows cannot be guaranteed.
174-
encoding : {'utf8', 'utf8-lossy', ...}
174+
encoding : {'utf8', 'utf8-lossy', 'windows-1252', 'windows-1252-lossy', ...}
175175
Lossy means that invalid utf8 values are replaced with `�`
176176
characters. When using other encodings than `utf8` or
177177
`utf8-lossy`, the input is first decoded in memory with

py-polars/tests/unit/io/test_csv.py

+30
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,36 @@ def test_read_csv_encoding(tmp_path: Path) -> None:
503503
)
504504

505505

506+
@pytest.mark.may_fail_auto_streaming # read->scan_csv dispatch
507+
@pytest.mark.write_disk
508+
def test_read_csv_encoding_lossy(tmp_path: Path) -> None:
509+
tmp_path.mkdir(exist_ok=True)
510+
511+
bts = (
512+
b"\xc8\xec\xff,\xc2\xee\xe7\xf0\xe0\xf1\xf2,\xc3\xee\xf0\xee\xe4\n"
513+
b"\xc8\xe2\xe0\xed,25,\xcc\xee\xf1\xea\xe2\xe0\n"
514+
# \x98 is not supported in "windows-1251".
515+
b"\xce\xeb\xfc\xe3\xe0,30,\xd1\xe0\xed\xea\xf2-\x98\xcf\xe5\xf2\xe5\xf0\xe1\xf3\xf0\xe3\n"
516+
)
517+
518+
file_path = tmp_path / "encoding_lossy.csv"
519+
file_path.write_bytes(bts)
520+
521+
file_str = str(file_path)
522+
bytesio = io.BytesIO(bts)
523+
bytesio.seek(0)
524+
525+
for file in [file_path, file_str, bts, bytesio]:
526+
assert_series_equal(
527+
pl.read_csv(
528+
file, # type: ignore[arg-type]
529+
encoding="windows-1251-lossy",
530+
use_pyarrow=False,
531+
).get_column("Город"),
532+
pl.Series("Город", ["Москва", "Санкт-�Петербург"]),
533+
)
534+
535+
506536
@pytest.mark.may_fail_auto_streaming # read->scan_csv dispatch
507537
def test_column_rename_and_schema_overrides() -> None:
508538
csv = textwrap.dedent(

0 commit comments

Comments (0)