BUG: Delegate more of Excel parsing to CSV

gfyoung · gfyoung · commit 487a33614989 · 2018-11-08T10:40:51.000-08:00
The idea is that we read the Excel file, get the data, and then let the TextParser handle the reading and parsing. We shouldn't be doing a lot of work that is already defined in parsers.py In doing so, we identified several bugs: * index_col=None was not being respected * usecols behavior was inconsistent with that of read_csv for list of strings and callable inputs * usecols was not being validated as proper Excel column names when passed as a string. Closes pandas-devgh-18273. Closes pandas-devgh-20480.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -2867,6 +2867,27 @@ indices to be parsed.
 
 Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
 
+.. versionadded:: 0.24
+
+If ``usecols`` is a list of strings, it is assumed that each string corresponds
+to a column name provided either by the user in ``names`` or inferred from the
+document header row(s). Those strings define which columns will be parsed:
+
+.. code-block:: python
+
+    read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar'])
+
+Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``.
+
+.. versionadded:: 0.24
+
+If ``usecols`` is callable, the callable function will be evaluated against
+the column names, returning names where the callable function evaluates to ``True``.
+
+.. code-block:: python
+
+    read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha())
+
 Parsing Dates
 +++++++++++++
 
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -237,6 +237,7 @@ Other Enhancements
 - Compatibility with Matplotlib 3.0 (:issue:`22790`).
 - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
 - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`)
+- :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`)
 
 .. _whatsnew_0240.api_breaking:
 
@@ -1298,6 +1299,8 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
 - Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
 - Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
+- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)
+- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -17,8 +17,7 @@
 import pandas._libs.json as json
 import pandas.compat as compat
 from pandas.compat import (
-    OrderedDict, add_metaclass, lrange, map, range, reduce, string_types, u,
-    zip)
+    OrderedDict, add_metaclass, lrange, map, range, string_types, u, zip)
 from pandas.errors import EmptyDataError
 from pandas.util._decorators import Appender, deprecate_kwarg
 
@@ -93,13 +92,22 @@
     .. deprecated:: 0.21.0
        Pass in `usecols` instead.
 
-usecols : int or list, default None
-    * If None then parse all columns,
-    * If int then indicates last column to be parsed
-    * If list of ints then indicates list of column numbers to be parsed
-    * If string then indicates comma separated list of Excel column letters and
-      column ranges (e.g. "A:E" or "A,C,E:F").  Ranges are inclusive of
+usecols : int, str, list-like, or callable default None
+    * If None, then parse all columns,
+    * If int, then indicates last column to be parsed
+    * If string, then indicates comma separated list of Excel column letters
+      and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
       both sides.
+    * If list of ints, then indicates list of column numbers to be parsed.
+    * If list of strings, then indicates list of column names to be parsed.
+
+    .. versionadded:: 0.24.0
+
+    * If callable, then evaluate each column name against it and parse the
+      column if the callable returns ``True``.
+
+    .. versionadded:: 0.24.0
+
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
 dtype : Type name or dict of column -> type, default None
@@ -466,39 +474,6 @@ def parse(self,
                                  convert_float=convert_float,
                                  **kwds)
 
-    def _should_parse(self, i, usecols):
-
-        def _range2cols(areas):
-            """
-            Convert comma separated list of column names and column ranges to a
-            list of 0-based column indexes.
-
-            >>> _range2cols('A:E')
-            [0, 1, 2, 3, 4]
-            >>> _range2cols('A,C,Z:AB')
-            [0, 2, 25, 26, 27]
-            """
-            def _excel2num(x):
-                "Convert Excel column name like 'AB' to 0-based column index"
-                return reduce(lambda s, a: s * 26 + ord(a) - ord('A') + 1,
-                              x.upper().strip(), 0) - 1
-
-            cols = []
-            for rng in areas.split(','):
-                if ':' in rng:
-                    rng = rng.split(':')
-                    cols += lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1)
-                else:
-                    cols.append(_excel2num(rng))
-            return cols
-
-        if isinstance(usecols, int):
-            return i <= usecols
-        elif isinstance(usecols, compat.string_types):
-            return i in _range2cols(usecols)
-        else:
-            return i in usecols
-
     def _parse_excel(self,
                      sheet_name=0,
                      header=0,
@@ -527,10 +502,6 @@ def _parse_excel(self,
             raise NotImplementedError("chunksize keyword of read_excel "
                                       "is not implemented")
 
-        if parse_dates is True and index_col is None:
-            warnings.warn("The 'parse_dates=True' keyword of read_excel was "
-                          "provided without an 'index_col' keyword value.")
-
         import xlrd
         from xlrd import (xldate, XL_CELL_DATE,
                           XL_CELL_ERROR, XL_CELL_BOOLEAN,
@@ -620,17 +591,13 @@ def _parse_cell(cell_contents, cell_typ):
                 sheet = self.book.sheet_by_index(asheetname)
 
             data = []
-            should_parse = {}
+            usecols = _maybe_convert_usecols(usecols)
 
             for i in range(sheet.nrows):
                 row = []
                 for j, (value, typ) in enumerate(zip(sheet.row_values(i),
                                                      sheet.row_types(i))):
-                    if usecols is not None and j not in should_parse:
-                        should_parse[j] = self._should_parse(j, usecols)
-
-                    if usecols is None or should_parse[j]:
-                        row.append(_parse_cell(value, typ))
+                    row.append(_parse_cell(value, typ))
                 data.append(row)
 
             if sheet.nrows == 0:
@@ -642,31 +609,30 @@ def _parse_cell(cell_contents, cell_typ):
 
             # forward fill and pull out names for MultiIndex column
             header_names = None
-            if header is not None:
-                if is_list_like(header):
-                    header_names = []
-                    control_row = [True] * len(data[0])
-                    for row in header:
-                        if is_integer(skiprows):
-                            row += skiprows
-
-                        data[row], control_row = _fill_mi_header(
-                            data[row], control_row)
-                        header_name, data[row] = _pop_header_name(
-                            data[row], index_col)
-                        header_names.append(header_name)
-                else:
-                    data[header] = _trim_excel_header(data[header])
+            if header is not None and is_list_like(header):
+                header_names = []
+                control_row = [True] * len(data[0])
+
+                for row in header:
+                    if is_integer(skiprows):
+                        row += skiprows
+
+                    data[row], control_row = _fill_mi_header(
+                        data[row], control_row)
+                    header_name, _ = _pop_header_name(
+                        data[row], index_col)
+                    header_names.append(header_name)
 
             if is_list_like(index_col):
-                # forward fill values for MultiIndex index
+                # Forward fill values for MultiIndex index.
                 if not is_list_like(header):
                     offset = 1 + header
                 else:
                     offset = 1 + max(header)
 
                 for col in index_col:
                     last = data[offset][col]
+
                     for row in range(offset + 1, len(data)):
                         if data[row][col] == '' or data[row][col] is None:
                             data[row][col] = last
@@ -693,11 +659,14 @@ def _parse_cell(cell_contents, cell_typ):
                                     thousands=thousands,
                                     comment=comment,
                                     skipfooter=skipfooter,
+                                    usecols=usecols,
                                     **kwds)
 
                 output[asheetname] = parser.read(nrows=nrows)
+
                 if names is not None:
                     output[asheetname].columns = names
+
                 if not squeeze or isinstance(output[asheetname], DataFrame):
                     output[asheetname].columns = output[
                         asheetname].columns.set_names(header_names)
@@ -726,6 +695,97 @@ def __exit__(self, exc_type, exc_value, traceback):
         self.close()
 
 
+def _excel2num(x):
+    """
+    Convert Excel column name like 'AB' to 0-based column index.
+
+    Parameters
+    ----------
+    x : str
+        The Excel column name to convert to a 0-based column index.
+
+    Returns
+    -------
+    num : int
+        The column index corresponding to the name.
+
+    Raises
+    ------
+    ValueError
+        Part of the Excel column name was invalid.
+    """
+    index = 0
+
+    for c in x.upper().strip():
+        cp = ord(c)
+
+        if cp < ord("A") or cp > ord("Z"):
+            raise ValueError("Invalid column name: {x}".format(x=x))
+
+        index = index * 26 + cp - ord("A") + 1
+
+    return index - 1
+
+
+def _range2cols(areas):
+    """
+    Convert comma separated list of column names and ranges to indices.
+
+    Parameters
+    ----------
+    areas : str
+        A string containing a sequence of column ranges (or areas).
+
+    Returns
+    -------
+    cols : list
+        A list of 0-based column indices.
+
+    Examples
+    --------
+    >>> _range2cols('A:E')
+    [0, 1, 2, 3, 4]
+    >>> _range2cols('A,C,Z:AB')
+    [0, 2, 25, 26, 27]
+    """
+    cols = []
+
+    for rng in areas.split(","):
+        if ":" in rng:
+            rng = rng.split(":")
+            cols.extend(lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1))
+        else:
+            cols.append(_excel2num(rng))
+
+    return cols
+
+
+def _maybe_convert_usecols(usecols):
+    """
+    Convert `usecols` into a compatible format for parsing in `parsers.py`.
+
+    Parameters
+    ----------
+    usecols : object
+        The use-columns object to potentially convert.
+
+    Returns
+    -------
+    converted : object
+        The compatible format of `usecols`.
+    """
+    if usecols is None:
+        return usecols
+
+    if is_integer(usecols):
+        return lrange(usecols + 1)
+
+    if isinstance(usecols, compat.string_types):
+        return _range2cols(usecols)
+
+    return usecols
+
+
 def _validate_freeze_panes(freeze_panes):
     if freeze_panes is not None:
         if (
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py