Skip to content

Commit 0dabcd7

Browse files
toobaz authored and mattip committed
BUG: Check that values for "nrows" and "chunksize" are valid (pandas-dev#15774)
1 parent 7761a36 commit 0dabcd7

File tree

3 files changed

+38
-12
lines changed

3 files changed

+38
-12
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -815,6 +815,7 @@ Bug Fixes
815815
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
816816
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
817817
- Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`)
818+
- Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`)
818819
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
819820
- Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`)
820821
- Bug in using ``__deepcopy__`` on empty NDFrame objects (:issue:`15370`)

pandas/io/parsers.py

+21-11
Original file line numberDiff line numberDiff line change
@@ -345,24 +345,34 @@
345345
""" % (_parser_params % (_fwf_widths, ''))
346346

347347

348-
def _validate_nrows(nrows):
348+
def _validate_integer(name, val, min_val=0):
    """
    Check that the parsing parameter 'name' holds a valid integer value.

    The value must be either an integer OR a float that can SAFELY be
    cast to an integer without losing accuracy, and it must not be
    smaller than ``min_val``. ``None`` is passed through unchanged.

    Parameters
    ----------
    name : string
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)

    Returns
    -------
    int or None
        ``val`` converted to an integer, or None if ``val`` is None.

    Raises
    ------
    ValueError
        If ``val`` is not an integer or an integral float, or if it is
        smaller than ``min_val``.
    """
    msg = "'{name:s}' must be an integer >={min_val:d}".format(name=name,
                                                               min_val=min_val)

    if val is None:
        return val

    if is_float(val):
        # nan / inf cannot be converted by int(); report them with the
        # same message as any other non-integral float.
        try:
            as_int = int(val)
        except (ValueError, OverflowError):
            raise ValueError(msg)

        if as_int != val:
            raise ValueError(msg)
        val = as_int
    elif not is_integer(val):
        raise ValueError(msg)

    # The bound is checked after the float -> int normalisation so that
    # integral floats (e.g. -1.0, or 0.0 with min_val=1) are rejected
    # exactly like their integer counterparts.
    if val < min_val:
        raise ValueError(msg)

    return val
366376

367377

368378
def _read(filepath_or_buffer, kwds):
@@ -384,8 +394,8 @@ def _read(filepath_or_buffer, kwds):
384394

385395
# Extract some of the arguments (pass chunksize on).
386396
iterator = kwds.get('iterator', False)
387-
chunksize = kwds.get('chunksize', None)
388-
nrows = _validate_nrows(kwds.get('nrows', None))
397+
chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
398+
nrows = _validate_integer('nrows', kwds.get('nrows', None))
389399

390400
# Create the parser.
391401
parser = TextFileReader(filepath_or_buffer, **kwds)

pandas/tests/io/parser/common.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -384,14 +384,17 @@ def test_read_nrows(self):
384384
df = self.read_csv(StringIO(self.data1), nrows=3.0)
385385
tm.assert_frame_equal(df, expected)
386386

387-
msg = "must be an integer"
387+
msg = r"'nrows' must be an integer >=0"
388388

389389
with tm.assertRaisesRegexp(ValueError, msg):
390390
self.read_csv(StringIO(self.data1), nrows=1.2)
391391

392392
with tm.assertRaisesRegexp(ValueError, msg):
393393
self.read_csv(StringIO(self.data1), nrows='foo')
394394

395+
with tm.assertRaisesRegexp(ValueError, msg):
396+
self.read_csv(StringIO(self.data1), nrows=-1)
397+
395398
def test_read_chunksize(self):
396399
reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
397400
df = self.read_csv(StringIO(self.data1), index_col=0)
@@ -402,6 +405,18 @@ def test_read_chunksize(self):
402405
tm.assert_frame_equal(chunks[1], df[2:4])
403406
tm.assert_frame_equal(chunks[2], df[4:])
404407

408+
# with invalid chunksize value:
409+
msg = r"'chunksize' must be an integer >=1"
410+
411+
with tm.assertRaisesRegexp(ValueError, msg):
412+
self.read_csv(StringIO(self.data1), chunksize=1.3)
413+
414+
with tm.assertRaisesRegexp(ValueError, msg):
415+
self.read_csv(StringIO(self.data1), chunksize='foo')
416+
417+
with tm.assertRaisesRegexp(ValueError, msg):
418+
self.read_csv(StringIO(self.data1), chunksize=0)
419+
405420
def test_read_chunksize_and_nrows(self):
406421

407422
# gh-15755

0 commit comments

Comments
 (0)