BUG: read_excel returns an empty DataFrame when using usecols

jacksonjos · jacksonjos · commit 445d94aa8dc1 · 2018-03-25T20:24:05.000Z
- [x] closes #18273 - [x] tests added / passed - [x] passes git diff master --name-only -- "*.py" | grep "pandas/" | xargs -r flake8 - [x] whatsnew entry As mentioned, read_excel returned an empty DataFrame when the usecols argument was a list of strings. Lists of strings are now correctly interpreted by the read_excel function.
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -979,6 +979,7 @@ I/O
 - Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
 - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`)
 - Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`)
+- Bug in :func:`read_excel` where passing a list of strings as the ``usecols`` argument returned an empty ``DataFrame`` (:issue:`18273`)
 - :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`)
 - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`, :issue:`9155`, :issue:`19900`)
 - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`)
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -96,8 +96,11 @@
     * If int then indicates last column to be parsed
     * If list of ints then indicates list of column numbers to be parsed
     * If string then indicates comma separated list of Excel column letters and
-      column ranges (e.g. "A:E" or "A,C,E:F").  Ranges are inclusive of
-      both sides.
+      column ranges (e.g. "A:E" or "A,C,E:F") to be parsed. Ranges are inclusive
+      of both sides.
+    * If list of strings then each string shall be an Excel column letter or
+      column range (e.g. "A:E" or "A,C,E:F") to be parsed. Ranges are
+      inclusive of both sides.
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
 dtype : Type name or dict of column -> type, default None
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -179,6 +179,32 @@ def test_usecols_str(self, ext):
         tm.assert_frame_equal(df2, df1, check_names=False)
         tm.assert_frame_equal(df3, df1, check_names=False)
 
+    @pytest.mark.parametrize("columns,usecols,parse_cols", [
+        (['A', 'B', 'C'], ['A:D'], ['A:D']),
+        (['B', 'C'], ['A', 'C', 'D'], ['A', 'C', 'D']),
+        (['B', 'C'], ['A', 'C:D'], ['A', 'C:D'])
+    ])
+    # GH18273 - read_excel returned an empty DataFrame when usecols was a list
+    # of strings
+    def test_usecols_str_list(self, ext, columns, usecols, parse_cols):
+
+        dfref = self.get_csv_refdf('test1')
+
+        df1 = dfref.reindex(columns=columns)
+        df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+                               usecols=usecols)
+        df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+                               index_col=0, usecols=usecols)
+
+        with tm.assert_produces_warning(FutureWarning):
+            df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+                                   index_col=0, parse_cols=parse_cols)
+
+        # TODO add index to xls, read xls ignores index name ?
+        tm.assert_frame_equal(df2, df1, check_names=False)
+        tm.assert_frame_equal(df3, df1, check_names=False)
+        tm.assert_frame_equal(df4, df1, check_names=False)
+
     def test_excel_stop_iterator(self, ext):
 
         parsed = self.get_exceldf('test2', ext, 'Sheet1')