Skip to content

Commit 827745d

Browse files
gfyoung authored and jreback committed
BUG: Respect usecols even with empty data
Closes #12493. Closes #12506. BUG: Better handling of empty data reads with Python engine. In Python, when reading an empty file, it used to throw a StopIteration error with no error message. This PR helps to differentiate the case when no columns are inferable, which now leads to an EmptyDataError for both the C and Python engines.
1 parent 5a53f03 commit 827745d

File tree

8 files changed

+171
-43
lines changed

8 files changed

+171
-43
lines changed

doc/source/whatsnew/v0.18.1.txt

+39
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,45 @@ New Behavior:
179179
# Output is a DataFrame
180180
df.groupby(pd.TimeGrouper(key='date', freq='M')).apply(lambda x: x[['value']].sum())
181181

182+
.. _whatsnew_0181.read_csv_exceptions:
183+
184+
Change in ``read_csv`` exceptions
185+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
186+
187+
In order to standardize the ``read_csv`` API for both the C and Python engines, both will now raise an
188+
``EmptyDataError``, a subclass of ``ValueError``, in response to empty columns or header (:issue:`12493`, :issue:`12506`)
189+
190+
Previous behaviour:
191+
192+
.. code-block:: python
193+
194+
In [1]: df = pd.read_csv(StringIO(''), engine='c')
195+
...
196+
ValueError: No columns to parse from file
197+
198+
In [2]: df = pd.read_csv(StringIO(''), engine='python')
199+
...
200+
StopIteration
201+
202+
New behaviour:
203+
204+
.. code-block:: python
205+
206+
In [1]: df = pd.read_csv(StringIO(''), engine='c')
207+
...
208+
pandas.io.common.EmptyDataError: No columns to parse from file
209+
210+
In [2]: df = pd.read_csv(StringIO(''), engine='python')
211+
...
212+
pandas.io.common.EmptyDataError: No columns to parse from file
213+
214+
In addition to this error change, several others have been made as well:
215+
216+
- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)
217+
- A ``CParserError`` is now raised instead of a generic ``Exception`` in ``read_csv`` when the C engine cannot parse a column
218+
- A ``ValueError`` is now raised instead of a generic ``Exception`` in ``read_csv`` when the C engine encounters a ``NaN`` value in an integer column
219+
- A ``ValueError`` is now raised instead of a generic ``Exception`` in ``read_csv`` when ``true_values`` is specified, and the C engine encounters an element in a column containing unencodable bytes
220+
- ``pandas.parser.OverflowError`` exception has been removed and has been replaced with Python's built-in ``OverflowError`` exception
182221

183222
.. _whatsnew_0181.deprecations:
184223

pandas/io/common.py

+30
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,37 @@ def urlopen(*args, **kwargs):
5656
_VALID_URLS.discard('')
5757

5858

59+
class CParserError(ValueError):
60+
"""
61+
Exception that is thrown by the C engine when it encounters
62+
a parsing error in `pd.read_csv`
63+
"""
64+
pass
65+
66+
5967
class DtypeWarning(Warning):
68+
"""
69+
Warning that is raised whenever `pd.read_csv` encounters non-
70+
uniform dtypes in a column(s) of a given CSV file
71+
"""
72+
pass
73+
74+
75+
class EmptyDataError(ValueError):
76+
"""
77+
Exception that is thrown in `pd.read_csv` (by both the C and
78+
Python engines) when empty data or header is encountered
79+
"""
80+
pass
81+
82+
83+
class ParserWarning(Warning):
84+
"""
85+
Warning that is raised in `pd.read_csv` whenever it is necessary
86+
to change parsers (generally from 'c' to 'python') contrary to the
87+
one specified by the user due to lack of support or functionality for
88+
parsing particular attributes of a CSV file with the requested engine
89+
"""
6090
pass
6191

6292

pandas/io/excel.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from pandas.core.frame import DataFrame
1414
from pandas.io.parsers import TextParser
1515
from pandas.io.common import (_is_url, _urlopen, _validate_header_arg,
16-
get_filepath_or_buffer)
16+
EmptyDataError, get_filepath_or_buffer)
1717
from pandas.tseries.period import Period
1818
from pandas import json
1919
from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass,
@@ -468,7 +468,7 @@ def _parse_cell(cell_contents, cell_typ):
468468
if not squeeze or isinstance(output[asheetname], DataFrame):
469469
output[asheetname].columns = output[
470470
asheetname].columns.set_names(header_names)
471-
except StopIteration:
471+
except EmptyDataError:
472472
# No Data, return an empty DataFrame
473473
output[asheetname] = DataFrame()
474474

pandas/io/html.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212

1313
import numpy as np
1414

15-
from pandas.io.common import _is_url, urlopen, parse_url, _validate_header_arg
15+
from pandas.io.common import (EmptyDataError, _is_url, urlopen,
16+
parse_url, _validate_header_arg)
1617
from pandas.io.parsers import TextParser
1718
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
1819
raise_with_traceback, binary_type)
@@ -742,7 +743,7 @@ def _parse(flavor, io, match, header, index_col, skiprows,
742743
parse_dates=parse_dates,
743744
tupleize_cols=tupleize_cols,
744745
thousands=thousands))
745-
except StopIteration: # empty table
746+
except EmptyDataError: # empty table
746747
continue
747748
return ret
748749

pandas/io/parsers.py

+46-15
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
from pandas.io.date_converters import generic_parser
2121
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
2222
_get_handle, UnicodeReader, UTF8Recoder,
23-
BaseIterator)
23+
BaseIterator, CParserError, EmptyDataError,
24+
ParserWarning)
2425
from pandas.tseries import tools
2526

2627
from pandas.util.decorators import Appender
@@ -36,10 +37,6 @@
3637
'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''
3738
])
3839

39-
40-
class ParserWarning(Warning):
41-
pass
42-
4340
_parser_params = """Also supports optionally iterating or breaking of the file
4441
into chunks.
4542
@@ -936,7 +933,7 @@ def tostr(x):
936933
# long
937934
for n in range(len(columns[0])):
938935
if all(['Unnamed' in tostr(c[n]) for c in columns]):
939-
raise _parser.CParserError(
936+
raise CParserError(
940937
"Passed header=[%s] are too many rows for this "
941938
"multi_index of columns"
942939
% ','.join([str(x) for x in self.header])
@@ -1255,10 +1252,19 @@ def read(self, nrows=None):
12551252
except StopIteration:
12561253
if self._first_chunk:
12571254
self._first_chunk = False
1258-
return _get_empty_meta(self.orig_names,
1259-
self.index_col,
1260-
self.index_names,
1261-
dtype=self.kwds.get('dtype'))
1255+
1256+
index, columns, col_dict = _get_empty_meta(
1257+
self.orig_names, self.index_col,
1258+
self.index_names, dtype=self.kwds.get('dtype'))
1259+
1260+
if self.usecols is not None:
1261+
columns = self._filter_usecols(columns)
1262+
1263+
col_dict = dict(filter(lambda item: item[0] in columns,
1264+
col_dict.items()))
1265+
1266+
return index, columns, col_dict
1267+
12621268
else:
12631269
raise
12641270

@@ -1750,10 +1756,26 @@ def _infer_columns(self):
17501756

17511757
columns = []
17521758
for level, hr in enumerate(header):
1753-
line = self._buffered_line()
1759+
try:
1760+
line = self._buffered_line()
1761+
1762+
while self.line_pos <= hr:
1763+
line = self._next_line()
17541764

1755-
while self.line_pos <= hr:
1756-
line = self._next_line()
1765+
except StopIteration:
1766+
if self.line_pos < hr:
1767+
raise ValueError(
1768+
'Passed header=%s but only %d lines in file'
1769+
% (hr, self.line_pos + 1))
1770+
1771+
# We have an empty file, so check
1772+
# if columns are provided. That will
1773+
# serve as the 'line' for parsing
1774+
if not self.names:
1775+
raise EmptyDataError(
1776+
"No columns to parse from file")
1777+
1778+
line = self.names[:]
17571779

17581780
unnamed_count = 0
17591781
this_columns = []
@@ -1818,10 +1840,19 @@ def _infer_columns(self):
18181840
else:
18191841
columns = self._handle_usecols(columns, columns[0])
18201842
else:
1821-
# header is None
1822-
line = self._buffered_line()
1843+
try:
1844+
line = self._buffered_line()
1845+
1846+
except StopIteration:
1847+
if not names:
1848+
raise EmptyDataError(
1849+
"No columns to parse from file")
1850+
1851+
line = names[:]
1852+
18231853
ncols = len(line)
18241854
num_original_columns = ncols
1855+
18251856
if not names:
18261857
if self.prefix:
18271858
columns = [['%s%d' % (self.prefix, i)

pandas/io/tests/test_html.py

+4
Original file line numberDiff line numberDiff line change
@@ -804,3 +804,7 @@ def test_same_ordering():
804804
dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
805805
dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
806806
assert_framelist_equal(dfs_lxml, dfs_bs4)
807+
808+
if __name__ == '__main__':
809+
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
810+
exit=False)

pandas/io/tests/test_parsers.py

+42-12
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import nose
1717
import numpy as np
1818
import pandas.lib as lib
19-
import pandas.parser
2019
from numpy import nan
2120
from numpy.testing.decorators import slow
2221
from pandas.lib import Timestamp
@@ -32,7 +31,8 @@
3231
)
3332
from pandas.compat import parse_date
3433
from pandas.core.common import AbstractMethodError
35-
from pandas.io.common import DtypeWarning, URLError
34+
from pandas.io.common import (CParserError, DtypeWarning,
35+
EmptyDataError, URLError)
3636
from pandas.io.parsers import (read_csv, read_table, read_fwf,
3737
TextFileReader, TextParser)
3838
from pandas.tseries.index import date_range
@@ -1209,7 +1209,7 @@ def test_read_table_wrong_num_columns(self):
12091209
6,7,8,9,10,11,12
12101210
11,12,13,14,15,16
12111211
"""
1212-
self.assertRaises(Exception, self.read_csv, StringIO(data))
1212+
self.assertRaises(ValueError, self.read_csv, StringIO(data))
12131213

12141214
def test_read_table_duplicate_index(self):
12151215
data = """index,A,B,C,D
@@ -1740,7 +1740,7 @@ def test_read_table_buglet_4x_multiindex(self):
17401740
# Temporarily copied to TestPythonParser.
17411741
# Here test that CParserError is raised:
17421742

1743-
with tm.assertRaises(pandas.parser.CParserError):
1743+
with tm.assertRaises(CParserError):
17441744
text = """ A B C D E
17451745
one two three four
17461746
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
@@ -1840,7 +1840,7 @@ def test_parse_dates_custom_euroformat(self):
18401840
tm.assert_frame_equal(df, expected)
18411841

18421842
parser = lambda d: parse_date(d, day_first=True)
1843-
self.assertRaises(Exception, self.read_csv,
1843+
self.assertRaises(TypeError, self.read_csv,
18441844
StringIO(text), skiprows=[0],
18451845
names=['time', 'Q', 'NTU'], index_col=0,
18461846
parse_dates=True, date_parser=parser,
@@ -2014,7 +2014,7 @@ def test_bool_na_values(self):
20142014
def test_nonexistent_path(self):
20152015
# don't segfault pls #2428
20162016
path = '%s.csv' % tm.rands(10)
2017-
self.assertRaises(Exception, self.read_csv, path)
2017+
self.assertRaises(IOError, self.read_csv, path)
20182018

20192019
def test_missing_trailing_delimiters(self):
20202020
data = """A,B,C,D
@@ -2358,7 +2358,7 @@ def test_catch_too_many_names(self):
23582358
4,,6
23592359
7,8,9
23602360
10,11,12\n"""
2361-
tm.assertRaises(Exception, read_csv, StringIO(data),
2361+
tm.assertRaises(ValueError, read_csv, StringIO(data),
23622362
header=0, names=['a', 'b', 'c', 'd'])
23632363

23642364
def test_ignore_leading_whitespace(self):
@@ -2525,9 +2525,8 @@ def test_int64_overflow(self):
25252525
result = self.read_csv(StringIO(data))
25262526
self.assertTrue(result['ID'].dtype == object)
25272527

2528-
self.assertRaises((OverflowError, pandas.parser.OverflowError),
2529-
self.read_csv, StringIO(data),
2530-
converters={'ID': np.int64})
2528+
self.assertRaises(OverflowError, self.read_csv,
2529+
StringIO(data), converters={'ID': np.int64})
25312530

25322531
# Just inside int64 range: parse as integer
25332532
i_max = np.iinfo(np.int64).max
@@ -2774,7 +2773,7 @@ def test_mixed_dtype_usecols(self):
27742773
usecols = [0, 'b', 2]
27752774

27762775
with tm.assertRaisesRegexp(ValueError, msg):
2777-
df = self.read_csv(StringIO(data), usecols=usecols)
2776+
self.read_csv(StringIO(data), usecols=usecols)
27782777

27792778
def test_usecols_with_integer_like_header(self):
27802779
data = """2,0,1
@@ -2796,6 +2795,37 @@ def test_usecols_with_integer_like_header(self):
27962795
df = self.read_csv(StringIO(data), usecols=usecols)
27972796
tm.assert_frame_equal(df, expected)
27982797

2798+
def test_read_empty_with_usecols(self):
2799+
# See gh-12493
2800+
names = ['Dummy', 'X', 'Dummy_2']
2801+
usecols = names[1:2] # ['X']
2802+
2803+
# first, check to see that the response of
2804+
# parser when faced with no provided columns
2805+
# throws the correct error, with or without usecols
2806+
errmsg = "No columns to parse from file"
2807+
2808+
with tm.assertRaisesRegexp(EmptyDataError, errmsg):
2809+
self.read_csv(StringIO(''))
2810+
2811+
with tm.assertRaisesRegexp(EmptyDataError, errmsg):
2812+
self.read_csv(StringIO(''), usecols=usecols)
2813+
2814+
expected = DataFrame(columns=usecols, index=[0], dtype=np.float64)
2815+
df = self.read_csv(StringIO(',,'), names=names, usecols=usecols)
2816+
tm.assert_frame_equal(df, expected)
2817+
2818+
expected = DataFrame(columns=usecols)
2819+
df = self.read_csv(StringIO(''), names=names, usecols=usecols)
2820+
tm.assert_frame_equal(df, expected)
2821+
2822+
def test_read_with_bad_header(self):
2823+
errmsg = "but only \d+ lines in file"
2824+
2825+
with tm.assertRaisesRegexp(ValueError, errmsg):
2826+
s = StringIO(',,')
2827+
self.read_csv(s, header=[10])
2828+
27992829

28002830
class CompressionTests(object):
28012831
def test_zip(self):
@@ -4399,7 +4429,7 @@ def test_raise_on_passed_int_dtype_with_nas(self):
43994429
2001,106380451,10
44004430
2001,,11
44014431
2001,106380451,67"""
4402-
self.assertRaises(Exception, read_csv, StringIO(data), sep=",",
4432+
self.assertRaises(ValueError, read_csv, StringIO(data), sep=",",
44034433
skipinitialspace=True,
44044434
dtype={'DOY': np.int64})
44054435

0 commit comments

Comments
 (0)