Skip to content

Commit d079ecb

Browse files
committed
BUG: Delegate more of Excel parsing to CSV
The idea is that we read the Excel file, get the data, and then let the TextParser handle the reading and parsing. We shouldn't be doing a lot of work that is already defined in parsers.py. In doing so, we identified several bugs: * index_col=None was not being respected * usecols behavior was inconsistent with that of read_csv for list of strings and callable inputs * usecols was not being validated as proper Excel column names when passed as a string. Closes pandas-devgh-18273. Closes pandas-devgh-20480.
1 parent 5938ce1 commit d079ecb

File tree

4 files changed

+652
-508
lines changed

4 files changed

+652
-508
lines changed

doc/source/io.rst

+17
Original file line numberDiff line numberDiff line change
@@ -2867,6 +2867,23 @@ indices to be parsed.
28672867
28682868
Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
28692869

2870+
If ``usecols`` is a list of strings, it is assumed that each string corresponds
2871+
to a column name provided either by the user in ``names`` or inferred from the
2872+
document header row(s). Those strings define which columns will be parsed:
2873+
2874+
.. code-block:: python
2875+
2876+
read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar'])
2877+
2878+
Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``.
2879+
2880+
If ``usecols`` is callable, the callable function will be evaluated against
2881+
the column names, returning names where the callable function evaluates to ``True``.
2882+
2883+
.. code-block:: python
2884+
2885+
read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha())
2886+
28702887
Parsing Dates
28712888
+++++++++++++
28722889

doc/source/whatsnew/v0.24.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -1289,6 +1289,9 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
12891289
- Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`).
12901290
- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
12911291
- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
1292+
- Bug in :meth:`read_excel()` in which passing a list of column names to the ``usecols`` parameter returned an empty :class:`DataFrame` (:issue:`18273`)
1293+
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and index columns were being parsed anyway (:issue:`20480`)
1294+
- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)
12921295

12931296
Plotting
12941297
^^^^^^^^

pandas/io/excel.py

+120-65
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,16 @@
9393
.. deprecated:: 0.21.0
9494
Pass in `usecols` instead.
9595
96-
usecols : int or list, default None
97-
* If None then parse all columns,
98-
* If int then indicates last column to be parsed
99-
* If list of ints then indicates list of column numbers to be parsed
100-
* If string then indicates comma separated list of Excel column letters and
101-
column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
96+
usecols : int, str, list-like, or callable, default None
97+
* If None, then parse all columns,
98+
* If int, then indicates last column to be parsed
99+
* If string, then indicates comma separated list of Excel column letters
100+
and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
102101
both sides.
102+
* If list of ints, then indicates list of column numbers to be parsed.
103+
* If list of strings, then indicates list of column names to be parsed.
104+
* If callable, then evaluate each column name against it and parse the
105+
column if the callable returns ``True``.
103106
squeeze : boolean, default False
104107
If the parsed data only contains one column then return a Series
105108
dtype : Type name or dict of column -> type, default None
@@ -466,39 +469,6 @@ def parse(self,
466469
convert_float=convert_float,
467470
**kwds)
468471

469-
def _should_parse(self, i, usecols):
470-
471-
def _range2cols(areas):
472-
"""
473-
Convert comma separated list of column names and column ranges to a
474-
list of 0-based column indexes.
475-
476-
>>> _range2cols('A:E')
477-
[0, 1, 2, 3, 4]
478-
>>> _range2cols('A,C,Z:AB')
479-
[0, 2, 25, 26, 27]
480-
"""
481-
def _excel2num(x):
482-
"Convert Excel column name like 'AB' to 0-based column index"
483-
return reduce(lambda s, a: s * 26 + ord(a) - ord('A') + 1,
484-
x.upper().strip(), 0) - 1
485-
486-
cols = []
487-
for rng in areas.split(','):
488-
if ':' in rng:
489-
rng = rng.split(':')
490-
cols += lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1)
491-
else:
492-
cols.append(_excel2num(rng))
493-
return cols
494-
495-
if isinstance(usecols, int):
496-
return i <= usecols
497-
elif isinstance(usecols, compat.string_types):
498-
return i in _range2cols(usecols)
499-
else:
500-
return i in usecols
501-
502472
def _parse_excel(self,
503473
sheet_name=0,
504474
header=0,
@@ -527,10 +497,6 @@ def _parse_excel(self,
527497
raise NotImplementedError("chunksize keyword of read_excel "
528498
"is not implemented")
529499

530-
if parse_dates is True and index_col is None:
531-
warnings.warn("The 'parse_dates=True' keyword of read_excel was "
532-
"provided without an 'index_col' keyword value.")
533-
534500
import xlrd
535501
from xlrd import (xldate, XL_CELL_DATE,
536502
XL_CELL_ERROR, XL_CELL_BOOLEAN,
@@ -620,17 +586,13 @@ def _parse_cell(cell_contents, cell_typ):
620586
sheet = self.book.sheet_by_index(asheetname)
621587

622588
data = []
623-
should_parse = {}
589+
usecols = _maybe_convert_usecols(usecols)
624590

625591
for i in range(sheet.nrows):
626592
row = []
627593
for j, (value, typ) in enumerate(zip(sheet.row_values(i),
628594
sheet.row_types(i))):
629-
if usecols is not None and j not in should_parse:
630-
should_parse[j] = self._should_parse(j, usecols)
631-
632-
if usecols is None or should_parse[j]:
633-
row.append(_parse_cell(value, typ))
595+
row.append(_parse_cell(value, typ))
634596
data.append(row)
635597

636598
if sheet.nrows == 0:
@@ -642,31 +604,30 @@ def _parse_cell(cell_contents, cell_typ):
642604

643605
# forward fill and pull out names for MultiIndex column
644606
header_names = None
645-
if header is not None:
646-
if is_list_like(header):
647-
header_names = []
648-
control_row = [True] * len(data[0])
649-
for row in header:
650-
if is_integer(skiprows):
651-
row += skiprows
652-
653-
data[row], control_row = _fill_mi_header(
654-
data[row], control_row)
655-
header_name, data[row] = _pop_header_name(
656-
data[row], index_col)
657-
header_names.append(header_name)
658-
else:
659-
data[header] = _trim_excel_header(data[header])
607+
if header is not None and is_list_like(header):
608+
header_names = []
609+
control_row = [True] * len(data[0])
610+
611+
for row in header:
612+
if is_integer(skiprows):
613+
row += skiprows
614+
615+
data[row], control_row = _fill_mi_header(
616+
data[row], control_row)
617+
header_name, _ = _pop_header_name(
618+
data[row], index_col)
619+
header_names.append(header_name)
660620

661621
if is_list_like(index_col):
662-
# forward fill values for MultiIndex index
622+
# Forward fill values for MultiIndex index.
663623
if not is_list_like(header):
664624
offset = 1 + header
665625
else:
666626
offset = 1 + max(header)
667627

668628
for col in index_col:
669629
last = data[offset][col]
630+
670631
for row in range(offset + 1, len(data)):
671632
if data[row][col] == '' or data[row][col] is None:
672633
data[row][col] = last
@@ -693,11 +654,14 @@ def _parse_cell(cell_contents, cell_typ):
693654
thousands=thousands,
694655
comment=comment,
695656
skipfooter=skipfooter,
657+
usecols=usecols,
696658
**kwds)
697659

698660
output[asheetname] = parser.read(nrows=nrows)
661+
699662
if names is not None:
700663
output[asheetname].columns = names
664+
701665
if not squeeze or isinstance(output[asheetname], DataFrame):
702666
output[asheetname].columns = output[
703667
asheetname].columns.set_names(header_names)
@@ -726,6 +690,97 @@ def __exit__(self, exc_type, exc_value, traceback):
726690
self.close()
727691

728692

693+
def _excel2num(x):
694+
"""
695+
Convert Excel column name like 'AB' to 0-based column index.
696+
697+
Parameters
698+
----------
699+
x : str
700+
The Excel column name to convert to a 0-based column index.
701+
702+
Returns
703+
-------
704+
num : int
705+
The column index corresponding to the name.
706+
707+
Raises
708+
------
709+
ValueError
710+
Part of the Excel column name was invalid.
711+
"""
712+
index = 0
713+
714+
for c in x.upper().strip():
715+
cp = ord(c)
716+
717+
if cp < ord("A") or cp > ord("Z"):
718+
raise ValueError("Invalid column name: {x}".format(x=x))
719+
720+
index = index * 26 + cp - ord("A") + 1
721+
722+
return index - 1
723+
724+
725+
def _range2cols(areas):
    """
    Expand a comma separated string of Excel columns and ranges to indices.

    Each comma separated piece is either a single column name ('C') or an
    inclusive range of column names ('Z:AB'). Pieces are expanded in the
    order they appear.

    Parameters
    ----------
    areas : str
        A string containing a sequence of column ranges (or areas).

    Returns
    -------
    cols : list
        A list of 0-based column indices.

    Examples
    --------
    >>> _range2cols('A:E')
    [0, 1, 2, 3, 4]
    >>> _range2cols('A,C,Z:AB')
    [0, 2, 25, 26, 27]
    """
    indices = []

    for area in areas.split(","):
        if ":" not in area:
            # A lone column name contributes exactly one index.
            indices.append(_excel2num(area))
            continue

        bounds = area.split(":")
        first = _excel2num(bounds[0])
        last = _excel2num(bounds[1])

        # Ranges are inclusive on both sides, hence the + 1.
        indices.extend(lrange(first, last + 1))

    return indices
756+
757+
758+
def _maybe_convert_usecols(usecols):
    """
    Convert `usecols` into a compatible format for parsing in `parsers.py`.

    Parameters
    ----------
    usecols : object
        The use-columns object to potentially convert.

        * None is returned unchanged.
        * An integer ``n`` is expanded to the column indices ``0..n``
          (``n`` is the last column to parse).
        * A string is interpreted as comma separated Excel column letters
          and ranges and expanded to 0-based column indices.
        * Anything else is returned unchanged.

    Returns
    -------
    converted : object
        The compatible format of `usecols`.

    Raises
    ------
    ValueError
        A string `usecols` contained an invalid Excel column name.
    """
    if usecols is None:
        return usecols

    # Use `is_integer` (already relied on by `_parse_excel` in this file)
    # rather than `isinstance(usecols, int)`: a bare isinstance check treats
    # booleans as integers and misses NumPy integer scalars.
    if is_integer(usecols):
        return lrange(usecols + 1)

    if isinstance(usecols, compat.string_types):
        return _range2cols(usecols)

    return usecols
782+
783+
729784
def _validate_freeze_panes(freeze_panes):
730785
if freeze_panes is not None:
731786
if (

0 commit comments

Comments
 (0)