Skip to content

Commit 0e198c3

Browse files
alexander-beedie authored and ritchie46 committed Dec 24, 2024
feat(python): Add "drop_empty_cols" parameter for read_excel and read_ods (#20430)
1 parent 33580b8 commit 0e198c3

File tree

4 files changed

+140
-38
lines changed

4 files changed

+140
-38
lines changed
 

‎py-polars/polars/_utils/various.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import re
66
import sys
77
import warnings
8+
from collections import Counter
89
from collections.abc import (
910
Collection,
1011
Generator,
@@ -42,7 +43,7 @@
4243
from polars.dependencies import numpy as np
4344

4445
if TYPE_CHECKING:
45-
from collections.abc import Iterator, Reversible
46+
from collections.abc import Iterator, MutableMapping, Reversible
4647

4748
from polars import DataFrame, Expr
4849
from polars._typing import PolarsDataType, SizeUnit
@@ -247,6 +248,16 @@ def ordered_unique(values: Sequence[Any]) -> list[Any]:
247248
return [v for v in values if not (v in seen or add_(v))]
248249

249250

251+
def deduplicate_names(names: Iterable[str]) -> list[str]:
    """
    Ensure name uniqueness by appending a counter to subsequent duplicates.

    The first occurrence of a name is kept as-is; later occurrences get a
    zero-based numeric suffix (eg: "a", "a", "a" -> "a", "a0", "a1"). If a
    suffixed candidate clashes with a name that has already been emitted (eg: a
    literal "a0" in the input), the counter is advanced until a free name is found,
    so the returned list never contains duplicates.

    Parameters
    ----------
    names
        Names to deduplicate; consumed once, in order (generators are accepted).

    Returns
    -------
    list of str
        Names in the original order, with duplicates made unique.
    """
    # next suffix to try for each original name (missing keys read as zero)
    next_suffix: Counter[str] = Counter()
    emitted: set[str] = set()
    deduped: list[str] = []
    for nm in names:
        candidate = nm
        # keep advancing the counter until the candidate does not collide with
        # anything already produced (either an original or a generated name)
        while candidate in emitted:
            candidate = f"{nm}{next_suffix[nm]}"
            next_suffix[nm] += 1
        emitted.add(candidate)
        deduped.append(candidate)
    return deduped
259+
260+
250261
@overload
251262
def scale_bytes(sz: int, unit: SizeUnit) -> int | float: ...
252263

‎py-polars/polars/io/spreadsheet/functions.py

+85-37
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
from __future__ import annotations
22

3+
import os
34
import re
45
import warnings
56
from collections.abc import Sequence
67
from datetime import time
78
from glob import glob
89
from io import BufferedReader, BytesIO, StringIO, TextIOWrapper
9-
from os import PathLike
1010
from pathlib import Path
1111
from typing import IO, TYPE_CHECKING, Any, Callable, NoReturn, overload
1212

@@ -17,7 +17,7 @@
1717
deprecate_renamed_parameter,
1818
issue_deprecation_warning,
1919
)
20-
from polars._utils.various import normalize_filepath, parse_version
20+
from polars._utils.various import deduplicate_names, normalize_filepath, parse_version
2121
from polars.datatypes import (
2222
N_INFER_DEFAULT,
2323
Boolean,
@@ -57,14 +57,20 @@ def _sources(
5757
source = [source] # type: ignore[assignment]
5858

5959
for src in source: # type: ignore[union-attr]
60-
if isinstance(src, (str, PathLike)) and not Path(src).exists():
61-
sources.extend(glob(str(src), recursive=True)) # noqa: PTH207
60+
if isinstance(src, (str, os.PathLike)) and not Path(src).exists():
61+
src = os.path.expanduser(str(src)) # noqa: PTH111
62+
sources.extend(glob(src, recursive=True)) # noqa: PTH207
6263
else:
6364
sources.append(src)
6465

6566
return sources, read_multiple_workbooks
6667

6768

69+
def _standardize_duplicates(s: str) -> str:
70+
"""Standardize columns with '_duplicated_n' names."""
71+
return re.sub(r"_duplicated_(\d+)", repl=r"\1", string=s)
72+
73+
6874
@overload
6975
def read_excel(
7076
source: FileSource,
@@ -79,6 +85,7 @@ def read_excel(
7985
schema_overrides: SchemaDict | None = ...,
8086
infer_schema_length: int | None = ...,
8187
drop_empty_rows: bool = ...,
88+
drop_empty_cols: bool = ...,
8289
raise_if_empty: bool = ...,
8390
) -> pl.DataFrame: ...
8491

@@ -97,6 +104,7 @@ def read_excel(
97104
schema_overrides: SchemaDict | None = ...,
98105
infer_schema_length: int | None = ...,
99106
drop_empty_rows: bool = ...,
107+
drop_empty_cols: bool = ...,
100108
raise_if_empty: bool = ...,
101109
) -> pl.DataFrame: ...
102110

@@ -115,6 +123,7 @@ def read_excel(
115123
schema_overrides: SchemaDict | None = ...,
116124
infer_schema_length: int | None = ...,
117125
drop_empty_rows: bool = ...,
126+
drop_empty_cols: bool = ...,
118127
raise_if_empty: bool = ...,
119128
) -> NoReturn: ...
120129

@@ -135,6 +144,7 @@ def read_excel(
135144
schema_overrides: SchemaDict | None = ...,
136145
infer_schema_length: int | None = ...,
137146
drop_empty_rows: bool = ...,
147+
drop_empty_cols: bool = ...,
138148
raise_if_empty: bool = ...,
139149
) -> dict[str, pl.DataFrame]: ...
140150

@@ -153,6 +163,7 @@ def read_excel(
153163
schema_overrides: SchemaDict | None = ...,
154164
infer_schema_length: int | None = ...,
155165
drop_empty_rows: bool = ...,
166+
drop_empty_cols: bool = ...,
156167
raise_if_empty: bool = ...,
157168
) -> pl.DataFrame: ...
158169

@@ -171,6 +182,7 @@ def read_excel(
171182
schema_overrides: SchemaDict | None = ...,
172183
infer_schema_length: int | None = ...,
173184
drop_empty_rows: bool = ...,
185+
drop_empty_cols: bool = ...,
174186
raise_if_empty: bool = ...,
175187
) -> dict[str, pl.DataFrame]: ...
176188

@@ -190,6 +202,7 @@ def read_excel(
190202
schema_overrides: SchemaDict | None = None,
191203
infer_schema_length: int | None = N_INFER_DEFAULT,
192204
drop_empty_rows: bool = True,
205+
drop_empty_cols: bool = True,
193206
raise_if_empty: bool = True,
194207
) -> pl.DataFrame | dict[str, pl.DataFrame]:
195208
"""
@@ -262,6 +275,10 @@ def read_excel(
262275
this parameter.
263276
drop_empty_rows
264277
Indicate whether to omit empty rows when reading data into the DataFrame.
278+
drop_empty_cols
279+
Indicate whether to omit empty columns (with no headers) when reading data into
280+
the DataFrame (note that empty column identification may vary depending on the
281+
underlying engine being used).
265282
raise_if_empty
266283
When there is no data in the sheet,`NoDataError` is raised. If this parameter
267284
is set to False, an empty DataFrame (with no columns) is returned instead.
@@ -335,6 +352,7 @@ def read_excel(
335352
has_header=has_header,
336353
columns=columns,
337354
drop_empty_rows=drop_empty_rows,
355+
drop_empty_cols=drop_empty_cols,
338356
read_multiple_workbooks=read_multiple_workbooks,
339357
)
340358
for src in sources
@@ -355,6 +373,7 @@ def read_ods(
355373
schema_overrides: SchemaDict | None = ...,
356374
infer_schema_length: int | None = ...,
357375
drop_empty_rows: bool = ...,
376+
drop_empty_cols: bool = ...,
358377
raise_if_empty: bool = ...,
359378
) -> pl.DataFrame: ...
360379

@@ -370,6 +389,7 @@ def read_ods(
370389
schema_overrides: SchemaDict | None = ...,
371390
infer_schema_length: int | None = ...,
372391
drop_empty_rows: bool = ...,
392+
drop_empty_cols: bool = ...,
373393
raise_if_empty: bool = ...,
374394
) -> pl.DataFrame: ...
375395

@@ -385,6 +405,7 @@ def read_ods(
385405
schema_overrides: SchemaDict | None = ...,
386406
infer_schema_length: int | None = ...,
387407
drop_empty_rows: bool = ...,
408+
drop_empty_cols: bool = ...,
388409
raise_if_empty: bool = ...,
389410
) -> NoReturn: ...
390411

@@ -400,6 +421,7 @@ def read_ods(
400421
schema_overrides: SchemaDict | None = ...,
401422
infer_schema_length: int | None = ...,
402423
drop_empty_rows: bool = ...,
424+
drop_empty_cols: bool = ...,
403425
raise_if_empty: bool = ...,
404426
) -> dict[str, pl.DataFrame]: ...
405427

@@ -415,6 +437,7 @@ def read_ods(
415437
schema_overrides: SchemaDict | None = ...,
416438
infer_schema_length: int | None = ...,
417439
drop_empty_rows: bool = ...,
440+
drop_empty_cols: bool = ...,
418441
raise_if_empty: bool = ...,
419442
) -> pl.DataFrame: ...
420443

@@ -430,6 +453,7 @@ def read_ods(
430453
schema_overrides: SchemaDict | None = ...,
431454
infer_schema_length: int | None = ...,
432455
drop_empty_rows: bool = ...,
456+
drop_empty_cols: bool = ...,
433457
raise_if_empty: bool = ...,
434458
) -> dict[str, pl.DataFrame]: ...
435459

@@ -444,6 +468,7 @@ def read_ods(
444468
schema_overrides: SchemaDict | None = None,
445469
infer_schema_length: int | None = N_INFER_DEFAULT,
446470
drop_empty_rows: bool = True,
471+
drop_empty_cols: bool = True,
447472
raise_if_empty: bool = True,
448473
) -> pl.DataFrame | dict[str, pl.DataFrame]:
449474
"""
@@ -479,6 +504,10 @@ def read_ods(
479504
large workbooks.
480505
drop_empty_rows
481506
Indicate whether to omit empty rows when reading data into the DataFrame.
507+
drop_empty_cols
508+
Indicate whether to omit empty columns (with no headers) when reading data into
509+
the DataFrame (note that empty column identification may vary depending on the
510+
underlying engine being used).
482511
raise_if_empty
483512
When there is no data in the sheet,`NoDataError` is raised. If this parameter
484513
is set to False, an empty DataFrame (with no columns) is returned instead.
@@ -523,6 +552,7 @@ def read_ods(
523552
infer_schema_length=infer_schema_length,
524553
raise_if_empty=raise_if_empty,
525554
drop_empty_rows=drop_empty_rows,
555+
drop_empty_cols=drop_empty_cols,
526556
has_header=has_header,
527557
columns=columns,
528558
read_multiple_workbooks=read_multiple_workbooks,
@@ -548,6 +578,7 @@ def _read_spreadsheet(
548578
has_header: bool = True,
549579
raise_if_empty: bool = True,
550580
drop_empty_rows: bool = True,
581+
drop_empty_cols: bool = True,
551582
read_multiple_workbooks: bool = False,
552583
) -> pl.DataFrame | dict[str, pl.DataFrame]:
553584
if isinstance(source, (str, Path)):
@@ -587,6 +618,7 @@ def _read_spreadsheet(
587618
raise_if_empty=raise_if_empty,
588619
columns=columns,
589620
drop_empty_rows=drop_empty_rows,
621+
drop_empty_cols=drop_empty_cols,
590622
)
591623
for name in sheet_names
592624
}
@@ -774,8 +806,9 @@ def _csv_buffer_to_frame(
774806
separator: str,
775807
read_options: dict[str, Any],
776808
schema_overrides: SchemaDict | None,
777-
raise_if_empty: bool,
778809
drop_empty_rows: bool,
810+
drop_empty_cols: bool,
811+
raise_if_empty: bool,
779812
) -> pl.DataFrame:
780813
"""Translate StringIO buffer containing delimited data as a DataFrame."""
781814
# handle (completely) empty sheet data
@@ -810,35 +843,39 @@ def _csv_buffer_to_frame(
810843
**read_options,
811844
)
812845
return _drop_null_data(
813-
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
846+
df,
847+
raise_if_empty=raise_if_empty,
848+
drop_empty_rows=drop_empty_rows,
849+
drop_empty_cols=drop_empty_cols,
814850
)
815851

816852

817853
def _drop_null_data(
818-
df: pl.DataFrame, *, raise_if_empty: bool, drop_empty_rows: bool = True
854+
df: pl.DataFrame,
855+
*,
856+
raise_if_empty: bool,
857+
drop_empty_rows: bool = True,
858+
drop_empty_cols: bool = True,
819859
) -> pl.DataFrame:
820-
"""
821-
If DataFrame contains columns/rows that contain only nulls, drop them.
822-
823-
If `drop_empty_rows` is set to `False`, empty rows are not dropped.
824-
"""
860+
"""If DataFrame contains columns/rows that contain only nulls, drop them."""
825861
null_cols: list[str] = []
826-
for col_name in df.columns:
827-
# note that if multiple unnamed columns are found then all but the first one
828-
# will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine)
829-
if col_name == "" or re.match(r"(_duplicated_|__UNNAMED__)\d+$", col_name):
830-
col = df[col_name]
831-
if (
832-
col.dtype == Null
833-
or col.null_count() == len(df)
834-
or (
835-
col.dtype in NUMERIC_DTYPES
836-
and col.replace(0, None).null_count() == len(df)
837-
)
838-
):
839-
null_cols.append(col_name)
840-
if null_cols:
841-
df = df.drop(*null_cols)
862+
if drop_empty_cols:
863+
for col_name in df.columns:
864+
# note that if multiple unnamed columns are found then all but the first one
865+
# will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine)
866+
if col_name == "" or re.match(r"(_duplicated_|__UNNAMED__)\d+$", col_name):
867+
col = df[col_name]
868+
if (
869+
col.dtype == Null
870+
or col.null_count() == len(df)
871+
or (
872+
col.dtype in NUMERIC_DTYPES
873+
and col.replace(0, None).null_count() == len(df)
874+
)
875+
):
876+
null_cols.append(col_name)
877+
if null_cols:
878+
df = df.drop(*null_cols)
842879

843880
if len(df) == 0 and len(df.columns) == 0:
844881
return _empty_frame(raise_if_empty)
@@ -875,8 +912,9 @@ def _read_spreadsheet_openpyxl(
875912
read_options: dict[str, Any],
876913
schema_overrides: SchemaDict | None,
877914
columns: Sequence[int] | Sequence[str] | None,
878-
raise_if_empty: bool,
879915
drop_empty_rows: bool,
916+
drop_empty_cols: bool,
917+
raise_if_empty: bool,
880918
) -> pl.DataFrame:
881919
"""Use the 'openpyxl' library to read data from the given worksheet."""
882920
infer_schema_length = read_options.pop("infer_schema_length", None)
@@ -916,25 +954,28 @@ def _read_spreadsheet_openpyxl(
916954
dtype = String if no_inference else None
917955
series_data = []
918956
for name, column_data in zip(header, zip(*rows_iter)):
919-
if name:
957+
if name or not drop_empty_cols:
920958
values = [cell.value for cell in column_data]
921-
if no_inference or (dtype := (schema_overrides or {}).get(name)) == String: # type: ignore[assignment]
959+
if no_inference or (dtype := (schema_overrides or {}).get(name)) == String: # type: ignore[assignment,arg-type]
922960
# note: if we initialise the series with mixed-type data (eg: str/int)
923961
# then the non-strings will become null, so we handle the cast here
924962
values = [str(v) if (v is not None) else v for v in values]
925963

926964
s = pl.Series(name, values, dtype=dtype, strict=False)
927965
series_data.append(s)
928966

967+
names = deduplicate_names(s.name for s in series_data)
929968
df = pl.DataFrame(
930-
{s.name: s for s in series_data},
969+
dict(zip(names, series_data)),
931970
schema_overrides=schema_overrides,
932971
infer_schema_length=infer_schema_length,
933972
strict=False,
934973
)
935-
936974
df = _drop_null_data(
937-
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
975+
df,
976+
raise_if_empty=raise_if_empty,
977+
drop_empty_rows=drop_empty_rows,
978+
drop_empty_cols=drop_empty_cols,
938979
)
939980
df = _reorder_columns(df, columns)
940981
return df
@@ -947,8 +988,9 @@ def _read_spreadsheet_calamine(
947988
read_options: dict[str, Any],
948989
schema_overrides: SchemaDict | None,
949990
columns: Sequence[int] | Sequence[str] | None,
950-
raise_if_empty: bool,
951991
drop_empty_rows: bool,
992+
drop_empty_cols: bool,
993+
raise_if_empty: bool,
952994
) -> pl.DataFrame:
953995
# if we have 'schema_overrides' and a more recent version of `fastexcel`
954996
# we can pass translated dtypes to the engine to refine the initial parse
@@ -1002,7 +1044,10 @@ def _read_spreadsheet_calamine(
10021044
df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)]
10031045

10041046
df = _drop_null_data(
1005-
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
1047+
df,
1048+
raise_if_empty=raise_if_empty,
1049+
drop_empty_rows=drop_empty_rows,
1050+
drop_empty_cols=drop_empty_cols,
10061051
)
10071052

10081053
# note: even if we applied parser dtypes we still re-apply schema_overrides
@@ -1050,8 +1095,9 @@ def _read_spreadsheet_xlsx2csv(
10501095
read_options: dict[str, Any],
10511096
schema_overrides: SchemaDict | None,
10521097
columns: Sequence[int] | Sequence[str] | None,
1053-
raise_if_empty: bool,
10541098
drop_empty_rows: bool,
1099+
drop_empty_cols: bool,
1100+
raise_if_empty: bool,
10551101
) -> pl.DataFrame:
10561102
"""Use the 'xlsx2csv' library to read data from the given worksheet."""
10571103
csv_buffer = StringIO()
@@ -1080,8 +1126,10 @@ def _read_spreadsheet_xlsx2csv(
10801126
schema_overrides=schema_overrides,
10811127
raise_if_empty=raise_if_empty,
10821128
drop_empty_rows=drop_empty_rows,
1129+
drop_empty_cols=drop_empty_cols,
10831130
)
10841131
if cast_to_boolean:
10851132
df = df.with_columns(*cast_to_boolean)
10861133

1134+
df = df.rename(_standardize_duplicates)
10871135
return _reorder_columns(df, columns)
12 Bytes
Binary file not shown.

‎py-polars/tests/unit/io/test_spreadsheet.py

+43
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,49 @@ def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None:
309309
)
310310

311311

312+
@pytest.mark.parametrize(
    ("read_spreadsheet", "source", "params"),
    [
        # TODO: uncomment once fastexcel offers a suitable param
        # (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
        (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
        (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
    ],
)
def test_read_dropped_cols(
    read_spreadsheet: Callable[..., pl.DataFrame],
    source: str,
    params: dict[str, str],
    request: pytest.FixtureRequest,
) -> None:
    """Check that `drop_empty_cols` controls whether empty columns are retained."""
    spreadsheet_path = request.getfixturevalue(source)

    # default behaviour: empty/unnamed columns are dropped
    df1 = read_spreadsheet(
        spreadsheet_path,
        sheet_name="test4",
        **params,
    )
    # explicit opt-out: empty/unnamed columns are kept
    df2 = read_spreadsheet(
        spreadsheet_path,
        sheet_name="test4",
        drop_empty_cols=False,
        **params,
    )
    assert df1.to_dict(as_series=False) == {
        "cardinality": [1, 3, 15, 30, 150, 300],
        "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
        "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
    }
    # the retained empty columns are named "", "0", "1" (deduplicated blank headers)
    assert df2.to_dict(as_series=False) == {
        "": [None, None, None, None, None, None],
        "cardinality": [1, 3, 15, 30, 150, 300],
        "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677],
        "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945],
        "0": [None, None, None, None, None, None],
        "1": [None, None, None, None, None, None],
    }
353+
354+
312355
@pytest.mark.parametrize(
313356
("read_spreadsheet", "source", "params"),
314357
[

0 commit comments

Comments
 (0)
Please sign in to comment.