diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 071cf5f17fc56..4387a51db8df3 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -179,6 +179,45 @@ New Behavior: # Output is a DataFrame df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum()) +.. _whatsnew_0181.read_csv_exceptions: + +Change in ``read_csv`` exceptions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In order to standardize the ``read_csv`` API for both the C and Python engines, both will now raise an +``EmptyDataError``, a subclass of ``ValueError``, in response to empty columns or header (:issue:`12506`) + +Previous behaviour: + +.. code-block:: python + + In [1]: df = pd.read_csv(StringIO(''), engine='c') + ... + ValueError: No columns to parse from file + + In [2]: df = pd.read_csv(StringIO(''), engine='python') + ... + StopIteration + +New behaviour: + +.. code-block:: python + + In [1]: df = pd.read_csv(StringIO(''), engine='c') + ... + pandas.io.common.EmptyDataError: No columns to parse from file + + In [2]: df = pd.read_csv(StringIO(''), engine='python') + ... + pandas.io.common.EmptyDataError: No columns to parse from file + +In addition to this error change, several others have been made as well: + +- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`) +- A ``CParserError`` is now raised instead of a generic ``Exception`` in ``read_csv`` when the C engine cannot parse a column +- A ``ValueError`` is now raised instead of a generic ``Exception`` in ``read_csv`` when the C engine encounters a ``NaN`` value in an integer column +- A ``ValueError`` is now raised instead of a generic ``Exception`` in ``read_csv`` when ``true_values`` is specified, and the C engine encounters an element in a column containing unencodable bytes +- ``pandas.parser.OverflowError`` exception has been removed and has been replaced with Python's built-in ``OverflowError`` exception .. 
_whatsnew_0181.deprecations: diff --git a/pandas/io/common.py b/pandas/io/common.py index e644f3a5f5090..8319e4c586e3b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -56,7 +56,37 @@ def urlopen(*args, **kwargs): _VALID_URLS.discard('') +class CParserError(ValueError): + """ + Exception that is thrown by the C engine when it encounters + a parsing error in `pd.read_csv` + """ + pass + + class DtypeWarning(Warning): + """ + Warning that is raised whenever `pd.read_csv` encounters non- + uniform dtypes in a column(s) of a given CSV file + """ + pass + + +class EmptyDataError(ValueError): + """ + Exception that is thrown in `pd.read_csv` (by both the C and + Python engines) when empty data or header is encountered + """ + pass + + +class ParserWarning(Warning): + """ + Warning that is raised in `pd.read_csv` whenever it is necessary + to change parsers (generally from 'c' to 'python') contrary to the + one specified by the user due to lack of support or functionality for + parsing particular attributes of a CSV file with the requested engine + """ pass diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 0261e825d56e2..882f23a83f29d 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -13,7 +13,7 @@ from pandas.core.frame import DataFrame from pandas.io.parsers import TextParser from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, - get_filepath_or_buffer) + EmptyDataError, get_filepath_or_buffer) from pandas.tseries.period import Period from pandas import json from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass, @@ -468,7 +468,7 @@ def _parse_cell(cell_contents, cell_typ): if not squeeze or isinstance(output[asheetname], DataFrame): output[asheetname].columns = output[ asheetname].columns.set_names(header_names) - except StopIteration: + except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() diff --git a/pandas/io/html.py b/pandas/io/html.py index
af4ecb2484797..90bbeb161442f 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -12,7 +12,8 @@ import numpy as np -from pandas.io.common import _is_url, urlopen, parse_url, _validate_header_arg +from pandas.io.common import (EmptyDataError, _is_url, urlopen, + parse_url, _validate_header_arg) from pandas.io.parsers import TextParser from pandas.compat import (lrange, lmap, u, string_types, iteritems, raise_with_traceback, binary_type) @@ -742,7 +743,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands)) - except StopIteration: # empty table + except EmptyDataError: # empty table continue return ret diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 18670e9afa65f..360025836cbd0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,7 +20,8 @@ from pandas.io.date_converters import generic_parser from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg, _get_handle, UnicodeReader, UTF8Recoder, - BaseIterator) + BaseIterator, CParserError, EmptyDataError, + ParserWarning) from pandas.tseries import tools from pandas.util.decorators import Appender @@ -36,10 +37,6 @@ 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' ]) - -class ParserWarning(Warning): - pass - _parser_params = """Also supports optionally iterating or breaking of the file into chunks. 
@@ -936,7 +933,7 @@ def tostr(x): # long for n in range(len(columns[0])): if all(['Unnamed' in tostr(c[n]) for c in columns]): - raise _parser.CParserError( + raise CParserError( "Passed header=[%s] are too many rows for this " "multi_index of columns" % ','.join([str(x) for x in self.header]) @@ -1255,10 +1252,19 @@ def read(self, nrows=None): except StopIteration: if self._first_chunk: self._first_chunk = False - return _get_empty_meta(self.orig_names, - self.index_col, - self.index_names, - dtype=self.kwds.get('dtype')) + + index, columns, col_dict = _get_empty_meta( + self.orig_names, self.index_col, + self.index_names, dtype=self.kwds.get('dtype')) + + if self.usecols is not None: + columns = self._filter_usecols(columns) + + col_dict = dict(filter(lambda item: item[0] in columns, + col_dict.items())) + + return index, columns, col_dict + else: raise @@ -1750,10 +1756,26 @@ def _infer_columns(self): columns = [] for level, hr in enumerate(header): - line = self._buffered_line() + try: + line = self._buffered_line() + + while self.line_pos <= hr: + line = self._next_line() - while self.line_pos <= hr: - line = self._next_line() + except StopIteration: + if self.line_pos < hr: + raise ValueError( + 'Passed header=%s but only %d lines in file' + % (hr, self.line_pos + 1)) + + # We have an empty file, so check + # if columns are provided. 
That will + # serve as the 'line' for parsing + if not self.names: + raise EmptyDataError( + "No columns to parse from file") + + line = self.names[:] unnamed_count = 0 this_columns = [] @@ -1818,10 +1840,19 @@ def _infer_columns(self): else: columns = self._handle_usecols(columns, columns[0]) else: - # header is None - line = self._buffered_line() + try: + line = self._buffered_line() + + except StopIteration: + if not names: + raise EmptyDataError( + "No columns to parse from file") + + line = names[:] + ncols = len(line) num_original_columns = ncols + if not names: if self.prefix: columns = [['%s%d' % (self.prefix, i) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 9a18da7d57648..cb625a26e40f9 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -804,3 +804,7 @@ def test_same_ordering(): dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) assert_framelist_equal(dfs_lxml, dfs_bs4) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 2d56275279453..8f57b08ee9817 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -16,7 +16,6 @@ import nose import numpy as np import pandas.lib as lib -import pandas.parser from numpy import nan from numpy.testing.decorators import slow from pandas.lib import Timestamp @@ -32,7 +31,8 @@ ) from pandas.compat import parse_date from pandas.core.common import AbstractMethodError -from pandas.io.common import DtypeWarning, URLError +from pandas.io.common import (CParserError, DtypeWarning, + EmptyDataError, URLError) from pandas.io.parsers import (read_csv, read_table, read_fwf, TextFileReader, TextParser) from pandas.tseries.index import date_range @@ -1209,7 +1209,7 @@ def test_read_table_wrong_num_columns(self): 6,7,8,9,10,11,12 
11,12,13,14,15,16 """ - self.assertRaises(Exception, self.read_csv, StringIO(data)) + self.assertRaises(ValueError, self.read_csv, StringIO(data)) def test_read_table_duplicate_index(self): data = """index,A,B,C,D @@ -1740,7 +1740,7 @@ def test_read_table_buglet_4x_multiindex(self): # Temporarily copied to TestPythonParser. # Here test that CParserError is raised: - with tm.assertRaises(pandas.parser.CParserError): + with tm.assertRaises(CParserError): text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 @@ -1840,7 +1840,7 @@ def test_parse_dates_custom_euroformat(self): tm.assert_frame_equal(df, expected) parser = lambda d: parse_date(d, day_first=True) - self.assertRaises(Exception, self.read_csv, + self.assertRaises(TypeError, self.read_csv, StringIO(text), skiprows=[0], names=['time', 'Q', 'NTU'], index_col=0, parse_dates=True, date_parser=parser, @@ -2014,7 +2014,7 @@ def test_bool_na_values(self): def test_nonexistent_path(self): # don't segfault pls #2428 path = '%s.csv' % tm.rands(10) - self.assertRaises(Exception, self.read_csv, path) + self.assertRaises(IOError, self.read_csv, path) def test_missing_trailing_delimiters(self): data = """A,B,C,D @@ -2358,7 +2358,7 @@ def test_catch_too_many_names(self): 4,,6 7,8,9 10,11,12\n""" - tm.assertRaises(Exception, read_csv, StringIO(data), + tm.assertRaises(ValueError, read_csv, StringIO(data), header=0, names=['a', 'b', 'c', 'd']) def test_ignore_leading_whitespace(self): @@ -2525,9 +2525,8 @@ def test_int64_overflow(self): result = self.read_csv(StringIO(data)) self.assertTrue(result['ID'].dtype == object) - self.assertRaises((OverflowError, pandas.parser.OverflowError), - self.read_csv, StringIO(data), - converters={'ID': np.int64}) + self.assertRaises(OverflowError, self.read_csv, + StringIO(data), converters={'ID': np.int64}) # Just inside int64 range: parse as integer i_max = np.iinfo(np.int64).max @@ -2774,7 +2773,7 @@ def test_mixed_dtype_usecols(self): usecols = [0, 
'b', 2] with tm.assertRaisesRegexp(ValueError, msg): - df = self.read_csv(StringIO(data), usecols=usecols) + self.read_csv(StringIO(data), usecols=usecols) def test_usecols_with_integer_like_header(self): data = """2,0,1 @@ -2796,6 +2795,37 @@ def test_usecols_with_integer_like_header(self): df = self.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(df, expected) + def test_read_empty_with_usecols(self): + # See gh-12493 + names = ['Dummy', 'X', 'Dummy_2'] + usecols = names[1:2] # ['X'] + + # first, check to see that the response of + # parser when faced with no provided columns + # throws the correct error, with or without usecols + errmsg = "No columns to parse from file" + + with tm.assertRaisesRegexp(EmptyDataError, errmsg): + self.read_csv(StringIO('')) + + with tm.assertRaisesRegexp(EmptyDataError, errmsg): + self.read_csv(StringIO(''), usecols=usecols) + + expected = DataFrame(columns=usecols, index=[0], dtype=np.float64) + df = self.read_csv(StringIO(',,'), names=names, usecols=usecols) + tm.assert_frame_equal(df, expected) + + expected = DataFrame(columns=usecols) + df = self.read_csv(StringIO(''), names=names, usecols=usecols) + tm.assert_frame_equal(df, expected) + + def test_read_with_bad_header(self): + errmsg = "but only \d+ lines in file" + + with tm.assertRaisesRegexp(ValueError, errmsg): + s = StringIO(',,') + self.read_csv(s, header=[10]) + class CompressionTests(object): def test_zip(self): @@ -4399,7 +4429,7 @@ def test_raise_on_passed_int_dtype_with_nas(self): 2001,106380451,10 2001,,11 2001,106380451,67""" - self.assertRaises(Exception, read_csv, StringIO(data), sep=",", + self.assertRaises(ValueError, read_csv, StringIO(data), sep=",", skipinitialspace=True, dtype={'DOY': np.int64}) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 743d2ea0c3bb8..8369fda3e9b00 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -10,7 +10,7 @@ import warnings from cpython cimport (PyObject, PyBytes_FromString, PyBytes_AsString, 
PyBytes_Check, PyUnicode_Check, PyUnicode_AsUTF8String) -from io.common import DtypeWarning +from io.common import CParserError, DtypeWarning, EmptyDataError cdef extern from "Python.h": @@ -519,7 +519,7 @@ cdef class TextReader: self.header, self.table_width = self._get_header() if not self.table_width: - raise ValueError("No columns to parse from file") + raise EmptyDataError("No columns to parse from file") # compute buffer_lines as function of table width heuristic = 2**20 // self.table_width @@ -1009,7 +1009,7 @@ cdef class TextReader: col_res = downcast_int64(col_res, self.use_unsigned) if col_res is None: - raise Exception('Unable to parse column %d' % i) + raise CParserError('Unable to parse column %d' % i) results[i] = col_res @@ -1097,7 +1097,7 @@ cdef class TextReader: na_filter, na_hashset) if user_dtype and na_count is not None: if na_count > 0: - raise Exception("Integer column has NA values in " + raise ValueError("Integer column has NA values in " "column {column}".format(column=i)) if result is not None and dtype[1:] != 'i8': @@ -1235,13 +1235,6 @@ cdef class TextReader: else: return None -class CParserError(ValueError): - pass - - -class OverflowError(ValueError): - pass - cdef object _true_values = [b'True', b'TRUE', b'true'] cdef object _false_values = [b'False', b'FALSE', b'false'] @@ -1815,7 +1808,7 @@ cdef kh_str_t* kset_from_list(list values) except NULL: # None creeps in sometimes, which isn't possible here if not PyBytes_Check(val): - raise Exception('Must be all encoded bytes') + raise ValueError('Must be all encoded bytes') k = kh_put_str(table, PyBytes_AsString(val), &ret)