1
1
from __future__ import annotations
2
2
3
+ import os
3
4
import re
4
5
import warnings
5
6
from collections .abc import Sequence
6
7
from datetime import time
7
8
from glob import glob
8
9
from io import BufferedReader , BytesIO , StringIO , TextIOWrapper
9
- from os import PathLike
10
10
from pathlib import Path
11
11
from typing import IO , TYPE_CHECKING , Any , Callable , NoReturn , overload
12
12
17
17
deprecate_renamed_parameter ,
18
18
issue_deprecation_warning ,
19
19
)
20
- from polars ._utils .various import normalize_filepath , parse_version
20
+ from polars ._utils .various import deduplicate_names , normalize_filepath , parse_version
21
21
from polars .datatypes import (
22
22
N_INFER_DEFAULT ,
23
23
Boolean ,
@@ -57,14 +57,20 @@ def _sources(
57
57
source = [source ] # type: ignore[assignment]
58
58
59
59
for src in source : # type: ignore[union-attr]
60
- if isinstance (src , (str , PathLike )) and not Path (src ).exists ():
61
- sources .extend (glob (str (src ), recursive = True )) # noqa: PTH207
60
+ if isinstance (src , (str , os .PathLike )) and not Path (src ).exists ():
61
+ src = os .path .expanduser (str (src )) # noqa: PTH111
62
+ sources .extend (glob (src , recursive = True )) # noqa: PTH207
62
63
else :
63
64
sources .append (src )
64
65
65
66
return sources , read_multiple_workbooks
66
67
67
68
69
+ def _standardize_duplicates (s : str ) -> str :
70
+ """Standardize columns with '_duplicated_n' names."""
71
+ return re .sub (r"_duplicated_(\d+)" , repl = r"\1" , string = s )
72
+
73
+
68
74
@overload
69
75
def read_excel (
70
76
source : FileSource ,
@@ -79,6 +85,7 @@ def read_excel(
79
85
schema_overrides : SchemaDict | None = ...,
80
86
infer_schema_length : int | None = ...,
81
87
drop_empty_rows : bool = ...,
88
+ drop_empty_cols : bool = ...,
82
89
raise_if_empty : bool = ...,
83
90
) -> pl .DataFrame : ...
84
91
@@ -97,6 +104,7 @@ def read_excel(
97
104
schema_overrides : SchemaDict | None = ...,
98
105
infer_schema_length : int | None = ...,
99
106
drop_empty_rows : bool = ...,
107
+ drop_empty_cols : bool = ...,
100
108
raise_if_empty : bool = ...,
101
109
) -> pl .DataFrame : ...
102
110
@@ -115,6 +123,7 @@ def read_excel(
115
123
schema_overrides : SchemaDict | None = ...,
116
124
infer_schema_length : int | None = ...,
117
125
drop_empty_rows : bool = ...,
126
+ drop_empty_cols : bool = ...,
118
127
raise_if_empty : bool = ...,
119
128
) -> NoReturn : ...
120
129
@@ -135,6 +144,7 @@ def read_excel(
135
144
schema_overrides : SchemaDict | None = ...,
136
145
infer_schema_length : int | None = ...,
137
146
drop_empty_rows : bool = ...,
147
+ drop_empty_cols : bool = ...,
138
148
raise_if_empty : bool = ...,
139
149
) -> dict [str , pl .DataFrame ]: ...
140
150
@@ -153,6 +163,7 @@ def read_excel(
153
163
schema_overrides : SchemaDict | None = ...,
154
164
infer_schema_length : int | None = ...,
155
165
drop_empty_rows : bool = ...,
166
+ drop_empty_cols : bool = ...,
156
167
raise_if_empty : bool = ...,
157
168
) -> pl .DataFrame : ...
158
169
@@ -171,6 +182,7 @@ def read_excel(
171
182
schema_overrides : SchemaDict | None = ...,
172
183
infer_schema_length : int | None = ...,
173
184
drop_empty_rows : bool = ...,
185
+ drop_empty_cols : bool = ...,
174
186
raise_if_empty : bool = ...,
175
187
) -> dict [str , pl .DataFrame ]: ...
176
188
@@ -190,6 +202,7 @@ def read_excel(
190
202
schema_overrides : SchemaDict | None = None ,
191
203
infer_schema_length : int | None = N_INFER_DEFAULT ,
192
204
drop_empty_rows : bool = True ,
205
+ drop_empty_cols : bool = True ,
193
206
raise_if_empty : bool = True ,
194
207
) -> pl .DataFrame | dict [str , pl .DataFrame ]:
195
208
"""
@@ -262,6 +275,10 @@ def read_excel(
262
275
this parameter.
263
276
drop_empty_rows
264
277
Indicate whether to omit empty rows when reading data into the DataFrame.
278
+ drop_empty_cols
279
+ Indicate whether to omit empty columns (with no headers) when reading data into
280
+ the DataFrame (note that empty column identification may vary depending on the
281
+ underlying engine being used).
265
282
raise_if_empty
266
283
When there is no data in the sheet, `NoDataError` is raised. If this parameter
267
284
is set to False, an empty DataFrame (with no columns) is returned instead.
@@ -335,6 +352,7 @@ def read_excel(
335
352
has_header = has_header ,
336
353
columns = columns ,
337
354
drop_empty_rows = drop_empty_rows ,
355
+ drop_empty_cols = drop_empty_cols ,
338
356
read_multiple_workbooks = read_multiple_workbooks ,
339
357
)
340
358
for src in sources
@@ -355,6 +373,7 @@ def read_ods(
355
373
schema_overrides : SchemaDict | None = ...,
356
374
infer_schema_length : int | None = ...,
357
375
drop_empty_rows : bool = ...,
376
+ drop_empty_cols : bool = ...,
358
377
raise_if_empty : bool = ...,
359
378
) -> pl .DataFrame : ...
360
379
@@ -370,6 +389,7 @@ def read_ods(
370
389
schema_overrides : SchemaDict | None = ...,
371
390
infer_schema_length : int | None = ...,
372
391
drop_empty_rows : bool = ...,
392
+ drop_empty_cols : bool = ...,
373
393
raise_if_empty : bool = ...,
374
394
) -> pl .DataFrame : ...
375
395
@@ -385,6 +405,7 @@ def read_ods(
385
405
schema_overrides : SchemaDict | None = ...,
386
406
infer_schema_length : int | None = ...,
387
407
drop_empty_rows : bool = ...,
408
+ drop_empty_cols : bool = ...,
388
409
raise_if_empty : bool = ...,
389
410
) -> NoReturn : ...
390
411
@@ -400,6 +421,7 @@ def read_ods(
400
421
schema_overrides : SchemaDict | None = ...,
401
422
infer_schema_length : int | None = ...,
402
423
drop_empty_rows : bool = ...,
424
+ drop_empty_cols : bool = ...,
403
425
raise_if_empty : bool = ...,
404
426
) -> dict [str , pl .DataFrame ]: ...
405
427
@@ -415,6 +437,7 @@ def read_ods(
415
437
schema_overrides : SchemaDict | None = ...,
416
438
infer_schema_length : int | None = ...,
417
439
drop_empty_rows : bool = ...,
440
+ drop_empty_cols : bool = ...,
418
441
raise_if_empty : bool = ...,
419
442
) -> pl .DataFrame : ...
420
443
@@ -430,6 +453,7 @@ def read_ods(
430
453
schema_overrides : SchemaDict | None = ...,
431
454
infer_schema_length : int | None = ...,
432
455
drop_empty_rows : bool = ...,
456
+ drop_empty_cols : bool = ...,
433
457
raise_if_empty : bool = ...,
434
458
) -> dict [str , pl .DataFrame ]: ...
435
459
@@ -444,6 +468,7 @@ def read_ods(
444
468
schema_overrides : SchemaDict | None = None ,
445
469
infer_schema_length : int | None = N_INFER_DEFAULT ,
446
470
drop_empty_rows : bool = True ,
471
+ drop_empty_cols : bool = True ,
447
472
raise_if_empty : bool = True ,
448
473
) -> pl .DataFrame | dict [str , pl .DataFrame ]:
449
474
"""
@@ -479,6 +504,10 @@ def read_ods(
479
504
large workbooks.
480
505
drop_empty_rows
481
506
Indicate whether to omit empty rows when reading data into the DataFrame.
507
+ drop_empty_cols
508
+ Indicate whether to omit empty columns (with no headers) when reading data into
509
+ the DataFrame (note that empty column identification may vary depending on the
510
+ underlying engine being used).
482
511
raise_if_empty
483
512
When there is no data in the sheet, `NoDataError` is raised. If this parameter
484
513
is set to False, an empty DataFrame (with no columns) is returned instead.
@@ -523,6 +552,7 @@ def read_ods(
523
552
infer_schema_length = infer_schema_length ,
524
553
raise_if_empty = raise_if_empty ,
525
554
drop_empty_rows = drop_empty_rows ,
555
+ drop_empty_cols = drop_empty_cols ,
526
556
has_header = has_header ,
527
557
columns = columns ,
528
558
read_multiple_workbooks = read_multiple_workbooks ,
@@ -548,6 +578,7 @@ def _read_spreadsheet(
548
578
has_header : bool = True ,
549
579
raise_if_empty : bool = True ,
550
580
drop_empty_rows : bool = True ,
581
+ drop_empty_cols : bool = True ,
551
582
read_multiple_workbooks : bool = False ,
552
583
) -> pl .DataFrame | dict [str , pl .DataFrame ]:
553
584
if isinstance (source , (str , Path )):
@@ -587,6 +618,7 @@ def _read_spreadsheet(
587
618
raise_if_empty = raise_if_empty ,
588
619
columns = columns ,
589
620
drop_empty_rows = drop_empty_rows ,
621
+ drop_empty_cols = drop_empty_cols ,
590
622
)
591
623
for name in sheet_names
592
624
}
@@ -774,8 +806,9 @@ def _csv_buffer_to_frame(
774
806
separator : str ,
775
807
read_options : dict [str , Any ],
776
808
schema_overrides : SchemaDict | None ,
777
- raise_if_empty : bool ,
778
809
drop_empty_rows : bool ,
810
+ drop_empty_cols : bool ,
811
+ raise_if_empty : bool ,
779
812
) -> pl .DataFrame :
780
813
"""Translate StringIO buffer containing delimited data as a DataFrame."""
781
814
# handle (completely) empty sheet data
@@ -810,35 +843,39 @@ def _csv_buffer_to_frame(
810
843
** read_options ,
811
844
)
812
845
return _drop_null_data (
813
- df , raise_if_empty = raise_if_empty , drop_empty_rows = drop_empty_rows
846
+ df ,
847
+ raise_if_empty = raise_if_empty ,
848
+ drop_empty_rows = drop_empty_rows ,
849
+ drop_empty_cols = drop_empty_cols ,
814
850
)
815
851
816
852
817
853
def _drop_null_data (
818
- df : pl .DataFrame , * , raise_if_empty : bool , drop_empty_rows : bool = True
854
+ df : pl .DataFrame ,
855
+ * ,
856
+ raise_if_empty : bool ,
857
+ drop_empty_rows : bool = True ,
858
+ drop_empty_cols : bool = True ,
819
859
) -> pl .DataFrame :
820
- """
821
- If DataFrame contains columns/rows that contain only nulls, drop them.
822
-
823
- If `drop_empty_rows` is set to `False`, empty rows are not dropped.
824
- """
860
+ """If DataFrame contains columns/rows that contain only nulls, drop them."""
825
861
null_cols : list [str ] = []
826
- for col_name in df .columns :
827
- # note that if multiple unnamed columns are found then all but the first one
828
- # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine)
829
- if col_name == "" or re .match (r"(_duplicated_|__UNNAMED__)\d+$" , col_name ):
830
- col = df [col_name ]
831
- if (
832
- col .dtype == Null
833
- or col .null_count () == len (df )
834
- or (
835
- col .dtype in NUMERIC_DTYPES
836
- and col .replace (0 , None ).null_count () == len (df )
837
- )
838
- ):
839
- null_cols .append (col_name )
840
- if null_cols :
841
- df = df .drop (* null_cols )
862
+ if drop_empty_cols :
863
+ for col_name in df .columns :
864
+ # note that if multiple unnamed columns are found then all but the first one
865
+ # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine)
866
+ if col_name == "" or re .match (r"(_duplicated_|__UNNAMED__)\d+$" , col_name ):
867
+ col = df [col_name ]
868
+ if (
869
+ col .dtype == Null
870
+ or col .null_count () == len (df )
871
+ or (
872
+ col .dtype in NUMERIC_DTYPES
873
+ and col .replace (0 , None ).null_count () == len (df )
874
+ )
875
+ ):
876
+ null_cols .append (col_name )
877
+ if null_cols :
878
+ df = df .drop (* null_cols )
842
879
843
880
if len (df ) == 0 and len (df .columns ) == 0 :
844
881
return _empty_frame (raise_if_empty )
@@ -875,8 +912,9 @@ def _read_spreadsheet_openpyxl(
875
912
read_options : dict [str , Any ],
876
913
schema_overrides : SchemaDict | None ,
877
914
columns : Sequence [int ] | Sequence [str ] | None ,
878
- raise_if_empty : bool ,
879
915
drop_empty_rows : bool ,
916
+ drop_empty_cols : bool ,
917
+ raise_if_empty : bool ,
880
918
) -> pl .DataFrame :
881
919
"""Use the 'openpyxl' library to read data from the given worksheet."""
882
920
infer_schema_length = read_options .pop ("infer_schema_length" , None )
@@ -916,25 +954,28 @@ def _read_spreadsheet_openpyxl(
916
954
dtype = String if no_inference else None
917
955
series_data = []
918
956
for name , column_data in zip (header , zip (* rows_iter )):
919
- if name :
957
+ if name or not drop_empty_cols :
920
958
values = [cell .value for cell in column_data ]
921
- if no_inference or (dtype := (schema_overrides or {}).get (name )) == String : # type: ignore[assignment]
959
+ if no_inference or (dtype := (schema_overrides or {}).get (name )) == String : # type: ignore[assignment,arg-type ]
922
960
# note: if we initialise the series with mixed-type data (eg: str/int)
923
961
# then the non-strings will become null, so we handle the cast here
924
962
values = [str (v ) if (v is not None ) else v for v in values ]
925
963
926
964
s = pl .Series (name , values , dtype = dtype , strict = False )
927
965
series_data .append (s )
928
966
967
+ names = deduplicate_names (s .name for s in series_data )
929
968
df = pl .DataFrame (
930
- { s . name : s for s in series_data } ,
969
+ dict ( zip ( names , series_data )) ,
931
970
schema_overrides = schema_overrides ,
932
971
infer_schema_length = infer_schema_length ,
933
972
strict = False ,
934
973
)
935
-
936
974
df = _drop_null_data (
937
- df , raise_if_empty = raise_if_empty , drop_empty_rows = drop_empty_rows
975
+ df ,
976
+ raise_if_empty = raise_if_empty ,
977
+ drop_empty_rows = drop_empty_rows ,
978
+ drop_empty_cols = drop_empty_cols ,
938
979
)
939
980
df = _reorder_columns (df , columns )
940
981
return df
@@ -947,8 +988,9 @@ def _read_spreadsheet_calamine(
947
988
read_options : dict [str , Any ],
948
989
schema_overrides : SchemaDict | None ,
949
990
columns : Sequence [int ] | Sequence [str ] | None ,
950
- raise_if_empty : bool ,
951
991
drop_empty_rows : bool ,
992
+ drop_empty_cols : bool ,
993
+ raise_if_empty : bool ,
952
994
) -> pl .DataFrame :
953
995
# if we have 'schema_overrides' and a more recent version of `fastexcel`
954
996
# we can pass translated dtypes to the engine to refine the initial parse
@@ -1002,7 +1044,10 @@ def _read_spreadsheet_calamine(
1002
1044
df .columns = [f"column_{ i } " for i in range (1 , len (df .columns ) + 1 )]
1003
1045
1004
1046
df = _drop_null_data (
1005
- df , raise_if_empty = raise_if_empty , drop_empty_rows = drop_empty_rows
1047
+ df ,
1048
+ raise_if_empty = raise_if_empty ,
1049
+ drop_empty_rows = drop_empty_rows ,
1050
+ drop_empty_cols = drop_empty_cols ,
1006
1051
)
1007
1052
1008
1053
# note: even if we applied parser dtypes we still re-apply schema_overrides
@@ -1050,8 +1095,9 @@ def _read_spreadsheet_xlsx2csv(
1050
1095
read_options : dict [str , Any ],
1051
1096
schema_overrides : SchemaDict | None ,
1052
1097
columns : Sequence [int ] | Sequence [str ] | None ,
1053
- raise_if_empty : bool ,
1054
1098
drop_empty_rows : bool ,
1099
+ drop_empty_cols : bool ,
1100
+ raise_if_empty : bool ,
1055
1101
) -> pl .DataFrame :
1056
1102
"""Use the 'xlsx2csv' library to read data from the given worksheet."""
1057
1103
csv_buffer = StringIO ()
@@ -1080,8 +1126,10 @@ def _read_spreadsheet_xlsx2csv(
1080
1126
schema_overrides = schema_overrides ,
1081
1127
raise_if_empty = raise_if_empty ,
1082
1128
drop_empty_rows = drop_empty_rows ,
1129
+ drop_empty_cols = drop_empty_cols ,
1083
1130
)
1084
1131
if cast_to_boolean :
1085
1132
df = df .with_columns (* cast_to_boolean )
1086
1133
1134
+ df = df .rename (_standardize_duplicates )
1087
1135
return _reorder_columns (df , columns )
0 commit comments