BUG: read_excel returns an empty DataFrame when usecols is a list of strings

jacksonjos · jacksonjos · commit 171c9b676895 · 2018-03-25T04:35:37.000Z
Closes pandas-dev#18273. Checklist: tests added/passed; passes `git diff master --name-only -- "*.py" | grep "pandas/" | xargs -r flake8`; whatsnew entry added. As mentioned in the issue, read_excel returns an empty DataFrame when the usecols argument is a list of strings. Now lists of strings are correctly interpreted by the read_excel function.
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -983,6 +983,7 @@ I/O
 - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`)
 - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`)
 - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`)
+- Bug in :func:`read_excel` where an empty ``DataFrame`` was returned when the ``usecols`` argument was a list of strings (:issue:`18273`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -96,8 +96,11 @@
     * If int then indicates last column to be parsed
     * If list of ints then indicates list of column numbers to be parsed
     * If string then indicates comma separated list of Excel column letters and
-      column ranges (e.g. "A:E" or "A,C,E:F").  Ranges are inclusive of
-      both sides.
+      column ranges (e.g. "A:E" or "A,C,E:F") to be parsed.  Ranges are
+      inclusive of both sides.
+    * If list of strings then indicates list of Excel column letters and
+      column ranges (e.g. ["A:E"] or ["A", "C", "E:F"]) to be parsed.  Ranges
+      are inclusive of both sides.
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
 dtype : Type name or dict of column -> type, default None
@@ -479,6 +482,9 @@ def _excel2num(x):
             return i <= usecols
         elif isinstance(usecols, compat.string_types):
             return i in _range2cols(usecols)
+        elif all(isinstance(x, compat.string_types) for x in usecols) is True:
+            usecols_str = ",".join(usecols)
+            return i in _range2cols(usecols_str)
         else:
             return i in usecols
 
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -179,6 +179,42 @@ def test_usecols_str(self, ext):
         tm.assert_frame_equal(df2, df1, check_names=False)
         tm.assert_frame_equal(df3, df1, check_names=False)
 
+    def test_usecols_str_list(self, ext):
+
+        dfref = self.get_csv_refdf('test1')
+
+        df1 = dfref.reindex(columns=['A', 'B', 'C'])
+        df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+                               usecols=['A:D'])
+        df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+                               index_col=0, usecols=['A:D'])
+
+        with tm.assert_produces_warning(FutureWarning):
+            df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+                                   index_col=0, parse_cols=['A:D'])
+
+        # TODO add index to xls, read xls ignores index name ?
+        tm.assert_frame_equal(df2, df1, check_names=False)
+        tm.assert_frame_equal(df3, df1, check_names=False)
+        tm.assert_frame_equal(df4, df1, check_names=False)
+
+        df1 = dfref.reindex(columns=['B', 'C'])
+        df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+                               usecols=['A', 'C', 'D'])
+        df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+                               index_col=0, usecols=['A', 'C', 'D'])
+        # TODO add index to xls file
+        tm.assert_frame_equal(df2, df1, check_names=False)
+        tm.assert_frame_equal(df3, df1, check_names=False)
+
+        df1 = dfref.reindex(columns=['B', 'C'])
+        df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
+                               usecols=['A', 'C:D'])
+        df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
+                               index_col=0, usecols=['A', 'C:D'])
+        tm.assert_frame_equal(df2, df1, check_names=False)
+        tm.assert_frame_equal(df3, df1, check_names=False)
+
     def test_excel_stop_iterator(self, ext):
 
         parsed = self.get_exceldf('test2', ext, 'Sheet1')