@@ -131,7 +131,9 @@ def prepare_file_arg(
131
131
132
132
When `encoding` is not `utf8` or `utf8-lossy`, the whole file is
133
133
first read in Python and decoded using the specified encoding and
134
- returned as a `BytesIO` (for usage with `read_csv`).
134
+ returned as a `BytesIO` (for usage with `read_csv`). If `encoding`
135
+ ends with "-lossy", characters that can't be decoded are replaced
136
+ with `�` (U+FFFD, the Unicode replacement character).
135
137
136
138
A `bytes` file is returned as a `BytesIO` if `use_pyarrow=True`.
137
139
@@ -157,14 +159,19 @@ def managed_file(file: Any) -> Iterator[Any]:
157
159
encoding in {"utf8" , "utf8-lossy" } if encoding else True
158
160
)
159
161
encoding_str = encoding if encoding else "utf8"
162
+ encoding_str , encoding_errors = (
163
+ (encoding_str [:- 6 ], "replace" )
164
+ if encoding_str .endswith ("-lossy" )
165
+ else (encoding_str , "strict" )
166
+ )
160
167
161
168
# PyArrow allows directories, so we only check that something is not
162
169
# a dir if we are not using PyArrow
163
170
check_not_dir = not use_pyarrow
164
171
165
172
if isinstance (file , bytes ):
166
173
if not has_utf8_utf8_lossy_encoding :
167
- file = file .decode (encoding_str ).encode ("utf8" )
174
+ file = file .decode (encoding_str , errors = encoding_errors ).encode ("utf8" )
168
175
return _check_empty (
169
176
BytesIO (file ), context = "bytes" , raise_if_empty = raise_if_empty
170
177
)
@@ -180,7 +187,11 @@ def managed_file(file: Any) -> Iterator[Any]:
180
187
if isinstance (file , BytesIO ):
181
188
if not has_utf8_utf8_lossy_encoding :
182
189
return _check_empty (
183
- BytesIO (file .read ().decode (encoding_str ).encode ("utf8" )),
190
+ BytesIO (
191
+ file .read ()
192
+ .decode (encoding_str , errors = encoding_errors )
193
+ .encode ("utf8" )
194
+ ),
184
195
context = "BytesIO" ,
185
196
read_position = file .tell (),
186
197
raise_if_empty = raise_if_empty ,
@@ -197,7 +208,11 @@ def managed_file(file: Any) -> Iterator[Any]:
197
208
if isinstance (file , Path ):
198
209
if not has_utf8_utf8_lossy_encoding :
199
210
return _check_empty (
200
- BytesIO (file .read_bytes ().decode (encoding_str ).encode ("utf8" )),
211
+ BytesIO (
212
+ file .read_bytes ()
213
+ .decode (encoding_str , errors = encoding_errors )
214
+ .encode ("utf8" )
215
+ ),
201
216
context = f"Path ({ file !r} )" ,
202
217
raise_if_empty = raise_if_empty ,
203
218
)
@@ -220,13 +235,16 @@ def managed_file(file: Any) -> Iterator[Any]:
220
235
normalize_filepath (file , check_not_directory = check_not_dir )
221
236
)
222
237
# decode first
223
- with Path (file ).open (encoding = encoding_str ) as f :
238
+ with Path (file ).open (
239
+ encoding = encoding_str , errors = encoding_errors
240
+ ) as f :
224
241
return _check_empty (
225
242
BytesIO (f .read ().encode ("utf8" )),
226
243
context = f"{ file !r} " ,
227
244
raise_if_empty = raise_if_empty ,
228
245
)
229
246
storage_options ["encoding" ] = encoding
247
+ storage_options ["errors" ] = encoding_errors
230
248
return fsspec .open (file , ** storage_options )
231
249
232
250
if isinstance (file , list ) and bool (file ) and all (isinstance (f , str ) for f in file ):
@@ -242,12 +260,13 @@ def managed_file(file: Any) -> Iterator[Any]:
242
260
]
243
261
)
244
262
storage_options ["encoding" ] = encoding
263
+ storage_options ["errors" ] = encoding_errors
245
264
return fsspec .open_files (file , ** storage_options )
246
265
247
266
if isinstance (file , str ):
248
267
file = normalize_filepath (file , check_not_directory = check_not_dir )
249
268
if not has_utf8_utf8_lossy_encoding :
250
- with Path (file ).open (encoding = encoding_str ) as f :
269
+ with Path (file ).open (encoding = encoding_str , errors = encoding_errors ) as f :
251
270
return _check_empty (
252
271
BytesIO (f .read ().encode ("utf8" )),
253
272
context = f"{ file !r} " ,
0 commit comments