Skip to content

Commit 827745d

Browse files
gfyoung authored and jreback committed
BUG: Respect usecols even with empty data
Closes #12493. Closes #12506. BUG: Better handling of empty data reads with Python engine. In Python, when reading an empty file, it used to throw a StopIteration error with no error message. This PR helps to differentiate the case when no columns are inferable, which now leads to an EmptyDataError for both the C and Python engines.
1 parent 5a53f03 commit 827745d

File tree

8 files changed

+171
-43
lines changed

8 files changed

+171
-43
lines changed

doc/source/whatsnew/v0.18.1.txt

+39
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,45 @@ New Behavior:
179179
# Output is a DataFrame
180180
df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum())
181181

182+
.. _whatsnew_0181.read_csv_exceptions:
183+
184+
Change in ``read_csv`` exceptions
185+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
186+
187+
In order to standardize the ``read_csv`` API for both the C and Python engines, both will now raise an
188+
``EmptyDataError``, a subclass of ``ValueError``, in response to empty columns or header (:issue:`12493`, :issue:`12506`)
189+
190+
Previous behaviour:
191+
192+
.. code-block:: python
193+
194+
In [1]: df = pd.read_csv(StringIO(''), engine='c')
195+
...
196+
ValueError: No columns to parse from file
197+
198+
In [2]: df = pd.read_csv(StringIO(''), engine='python')
199+
...
200+
StopIteration
201+
202+
New behaviour:
203+
204+
.. code-block:: python
205+
206+
In [1]: df = pd.read_csv(StringIO(''), engine='c')
207+
...
208+
pandas.io.common.EmptyDataError: No columns to parse from file
209+
210+
In [2]: df = pd.read_csv(StringIO(''), engine='python')
211+
...
212+
pandas.io.common.EmptyDataError: No columns to parse from file
213+
214+
In addition to this error change, several others have been made as well:
215+
216+
- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)
217+
- A ``CParserError`` is now raised instead of a generic ``Exception`` in ``read_csv`` when the C engine cannot parse a column
218+
- A ``ValueError`` is now raised instead of a generic ``Exception`` in ``read_csv`` when the C engine encounters a ``NaN`` value in an integer column
219+
- A ``ValueError`` is now raised instead of a generic ``Exception`` in ``read_csv`` when ``true_values`` is specified, and the C engine encounters an element in a column containing unencodable bytes
220+
- ``pandas.parser.OverflowError`` exception has been removed and has been replaced with Python's built-in ``OverflowError`` exception
182221

183222
.. _whatsnew_0181.deprecations:
184223

pandas/io/common.py

+30
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,37 @@ def urlopen(*args, **kwargs):
5656
_VALID_URLS.discard('')
5757

5858

59+
class CParserError(ValueError):
60+
"""
61+
Exception that is thrown by the C engine when it encounters
62+
a parsing error in `pd.read_csv`
63+
"""
64+
pass
65+
66+
5967
class DtypeWarning(Warning):
68+
"""
69+
Warning that is raised whenever `pd.read_csv` encounters non-
70+
uniform dtypes in a column(s) of a given CSV file
71+
"""
72+
pass
73+
74+
75+
class EmptyDataError(ValueError):
76+
"""
77+
Exception that is thrown in `pd.read_csv` (by both the C and
78+
Python engines) when empty data or header is encountered
79+
"""
80+
pass
81+
82+
83+
class ParserWarning(Warning):
84+
"""
85+
Warning that is raised in `pd.read_csv` whenever it is necessary
86+
to change parsers (generally from 'c' to 'python') contrary to the
87+
one specified by the user due to lack of support or functionality for
88+
parsing particular attributes of a CSV file with the requested engine
89+
"""
6090
pass
6191

6292

pandas/io/excel.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from pandas.core.frame import DataFrame
1414
from pandas.io.parsers import TextParser
1515
from pandas.io.common import (_is_url, _urlopen, _validate_header_arg,
16-
get_filepath_or_buffer)
16+
EmptyDataError, get_filepath_or_buffer)
1717
from pandas.tseries.period import Period
1818
from pandas import json
1919
from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass,
@@ -468,7 +468,7 @@ def _parse_cell(cell_contents, cell_typ):
468468
if not squeeze or isinstance(output[asheetname], DataFrame):
469469
output[asheetname].columns = output[
470470
asheetname].columns.set_names(header_names)
471-
except StopIteration:
471+
except EmptyDataError:
472472
# No Data, return an empty DataFrame
473473
output[asheetname] = DataFrame()
474474

pandas/io/html.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212

1313
import numpy as np
1414

15-
from pandas.io.common import _is_url, urlopen, parse_url, _validate_header_arg
15+
from pandas.io.common import (EmptyDataError, _is_url, urlopen,
16+
parse_url, _validate_header_arg)
1617
from pandas.io.parsers import TextParser
1718
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
1819
raise_with_traceback, binary_type)
@@ -742,7 +743,7 @@ def _parse(flavor, io, match, header, index_col, skiprows,
742743
parse_dates=parse_dates,
743744
tupleize_cols=tupleize_cols,
744745
thousands=thousands))
745-
except StopIteration: # empty table
746+
except EmptyDataError: # empty table
746747
continue
747748
return ret
748749

pandas/io/parsers.py

+46-15
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
from pandas.io.date_converters import generic_parser
2121
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
2222
_get_handle, UnicodeReader, UTF8Recoder,
23-
BaseIterator)
23+
BaseIterator, CParserError, EmptyDataError,
24+
ParserWarning)
2425
from pandas.tseries import tools
2526

2627
from pandas.util.decorators import Appender
@@ -36,10 +37,6 @@
3637
'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''
3738
])
3839

39-
40-
class ParserWarning(Warning):
41-
pass
42-
4340
_parser_params = """Also supports optionally iterating or breaking of the file
4441
into chunks.
4542
@@ -936,7 +933,7 @@ def tostr(x):
936933
# long
937934
for n in range(len(columns[0])):
938935
if all(['Unnamed' in tostr(c[n]) for c in columns]):
939-
raise _parser.CParserError(
936+
raise CParserError(
940937
"Passed header=[%s] are too many rows for this "
941938
"multi_index of columns"
942939
% ','.join([str(x) for x in self.header])
@@ -1255,10 +1252,19 @@ def read(self, nrows=None):
12551252
except StopIteration:
12561253
if self._first_chunk:
12571254
self._first_chunk = False
1258-
return _get_empty_meta(self.orig_names,
1259-
self.index_col,
1260-
self.index_names,
1261-
dtype=self.kwds.get('dtype'))
1255+
1256+
index, columns, col_dict = _get_empty_meta(
1257+
self.orig_names, self.index_col,
1258+
self.index_names, dtype=self.kwds.get('dtype'))
1259+
1260+
if self.usecols is not None:
1261+
columns = self._filter_usecols(columns)
1262+
1263+
col_dict = dict(filter(lambda item: item[0] in columns,
1264+
col_dict.items()))
1265+
1266+
return index, columns, col_dict
1267+
12621268
else:
12631269
raise
12641270

@@ -1750,10 +1756,26 @@ def _infer_columns(self):
17501756

17511757
columns = []
17521758
for level, hr in enumerate(header):
1753-
line = self._buffered_line()
1759+
try:
1760+
line = self._buffered_line()
1761+
1762+
while self.line_pos <= hr:
1763+
line = self._next_line()
17541764

1755-
while self.line_pos <= hr:
1756-
line = self._next_line()
1765+
except StopIteration:
1766+
if self.line_pos < hr:
1767+
raise ValueError(
1768+
'Passed header=%s but only %d lines in file'
1769+
% (hr, self.line_pos + 1))
1770+
1771+
# We have an empty file, so check
1772+
# if columns are provided. That will
1773+
# serve as the 'line' for parsing
1774+
if not self.names:
1775+
raise EmptyDataError(
1776+
"No columns to parse from file")
1777+
1778+
line = self.names[:]
17571779

17581780
unnamed_count = 0
17591781
this_columns = []
@@ -1818,10 +1840,19 @@ def _infer_columns(self):
18181840
else:
18191841
columns = self._handle_usecols(columns, columns[0])
18201842
else:
1821-
# header is None
1822-
line = self._buffered_line()
1843+
try:
1844+
line = self._buffered_line()
1845+
1846+
except StopIteration:
1847+
if not names:
1848+
raise EmptyDataError(
1849+
"No columns to parse from file")
1850+
1851+
line = names[:]
1852+
18231853
ncols = len(line)
18241854
num_original_columns = ncols
1855+
18251856
if not names:
18261857
if self.prefix:
18271858
columns = [['%s%d' % (self.prefix, i)

pandas/io/tests/test_html.py

+4
Original file line numberDiff line numberDiff line change
@@ -804,3 +804,7 @@ def test_same_ordering():
804804
dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
805805
dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
806806
assert_framelist_equal(dfs_lxml, dfs_bs4)
807+
808+
if __name__ == '__main__':
809+
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
810+
exit=False)

pandas/io/tests/test_parsers.py

+42-12
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import nose
1717
import numpy as np
1818
import pandas.lib as lib
19-
import pandas.parser
2019
from numpy import nan
2120
from numpy.testing.decorators import slow
2221
from pandas.lib import Timestamp
@@ -32,7 +31,8 @@
3231
)
3332
from pandas.compat import parse_date
3433
from pandas.core.common import AbstractMethodError
35-
from pandas.io.common import DtypeWarning, URLError
34+
from pandas.io.common import (CParserError, DtypeWarning,
35+
EmptyDataError, URLError)
3636
from pandas.io.parsers import (read_csv, read_table, read_fwf,
3737
TextFileReader, TextParser)
3838
from pandas.tseries.index import date_range
@@ -1209,7 +1209,7 @@ def test_read_table_wrong_num_columns(self):
12091209
6,7,8,9,10,11,12
12101210
11,12,13,14,15,16
12111211
"""
1212-
self.assertRaises(Exception, self.read_csv, StringIO(data))
1212+
self.assertRaises(ValueError, self.read_csv, StringIO(data))
12131213

12141214
def test_read_table_duplicate_index(self):
12151215
data = """index,A,B,C,D
@@ -1740,7 +1740,7 @@ def test_read_table_buglet_4x_multiindex(self):
17401740
# Temporarily copied to TestPythonParser.
17411741
# Here test that CParserError is raised:
17421742

1743-
with tm.assertRaises(pandas.parser.CParserError):
1743+
with tm.assertRaises(CParserError):
17441744
text = """ A B C D E
17451745
one two three four
17461746
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
@@ -1840,7 +1840,7 @@ def test_parse_dates_custom_euroformat(self):
18401840
tm.assert_frame_equal(df, expected)
18411841

18421842
parser = lambda d: parse_date(d, day_first=True)
1843-
self.assertRaises(Exception, self.read_csv,
1843+
self.assertRaises(TypeError, self.read_csv,
18441844
StringIO(text), skiprows=[0],
18451845
names=['time', 'Q', 'NTU'], index_col=0,
18461846
parse_dates=True, date_parser=parser,
@@ -2014,7 +2014,7 @@ def test_bool_na_values(self):
20142014
def test_nonexistent_path(self):
20152015
# don't segfault pls #2428
20162016
path = '%s.csv' % tm.rands(10)
2017-
self.assertRaises(Exception, self.read_csv, path)
2017+
self.assertRaises(IOError, self.read_csv, path)
20182018

20192019
def test_missing_trailing_delimiters(self):
20202020
data = """A,B,C,D
@@ -2358,7 +2358,7 @@ def test_catch_too_many_names(self):
23582358
4,,6
23592359
7,8,9
23602360
10,11,12\n"""
2361-
tm.assertRaises(Exception, read_csv, StringIO(data),
2361+
tm.assertRaises(ValueError, read_csv, StringIO(data),
23622362
header=0, names=['a', 'b', 'c', 'd'])
23632363

23642364
def test_ignore_leading_whitespace(self):
@@ -2525,9 +2525,8 @@ def test_int64_overflow(self):
25252525
result = self.read_csv(StringIO(data))
25262526
self.assertTrue(result['ID'].dtype == object)
25272527

2528-
self.assertRaises((OverflowError, pandas.parser.OverflowError),
2529-
self.read_csv, StringIO(data),
2530-
converters={'ID': np.int64})
2528+
self.assertRaises(OverflowError, self.read_csv,
2529+
StringIO(data), converters={'ID': np.int64})
25312530

25322531
# Just inside int64 range: parse as integer
25332532
i_max = np.iinfo(np.int64).max
@@ -2774,7 +2773,7 @@ def test_mixed_dtype_usecols(self):
27742773
usecols = [0, 'b', 2]
27752774

27762775
with tm.assertRaisesRegexp(ValueError, msg):
2777-
df = self.read_csv(StringIO(data), usecols=usecols)
2776+
self.read_csv(StringIO(data), usecols=usecols)
27782777

27792778
def test_usecols_with_integer_like_header(self):
27802779
data = """2,0,1
@@ -2796,6 +2795,37 @@ def test_usecols_with_integer_like_header(self):
27962795
df = self.read_csv(StringIO(data), usecols=usecols)
27972796
tm.assert_frame_equal(df, expected)
27982797

2798+
def test_read_empty_with_usecols(self):
2799+
# See gh-12493
2800+
names = ['Dummy', 'X', 'Dummy_2']
2801+
usecols = names[1:2] # ['X']
2802+
2803+
# first, check to see that the response of
2804+
# parser when faced with no provided columns
2805+
# throws the correct error, with or without usecols
2806+
errmsg = "No columns to parse from file"
2807+
2808+
with tm.assertRaisesRegexp(EmptyDataError, errmsg):
2809+
self.read_csv(StringIO(''))
2810+
2811+
with tm.assertRaisesRegexp(EmptyDataError, errmsg):
2812+
self.read_csv(StringIO(''), usecols=usecols)
2813+
2814+
expected = DataFrame(columns=usecols, index=[0], dtype=np.float64)
2815+
df = self.read_csv(StringIO(',,'), names=names, usecols=usecols)
2816+
tm.assert_frame_equal(df, expected)
2817+
2818+
expected = DataFrame(columns=usecols)
2819+
df = self.read_csv(StringIO(''), names=names, usecols=usecols)
2820+
tm.assert_frame_equal(df, expected)
2821+
2822+
def test_read_with_bad_header(self):
2823+
errmsg = "but only \d+ lines in file"
2824+
2825+
with tm.assertRaisesRegexp(ValueError, errmsg):
2826+
s = StringIO(',,')
2827+
self.read_csv(s, header=[10])
2828+
27992829

28002830
class CompressionTests(object):
28012831
def test_zip(self):
@@ -4399,7 +4429,7 @@ def test_raise_on_passed_int_dtype_with_nas(self):
43994429
2001,106380451,10
44004430
2001,,11
44014431
2001,106380451,67"""
4402-
self.assertRaises(Exception, read_csv, StringIO(data), sep=",",
4432+
self.assertRaises(ValueError, read_csv, StringIO(data), sep=",",
44034433
skipinitialspace=True,
44044434
dtype={'DOY': np.int64})
44054435

0 commit comments

Comments
 (0)