Skip to content

Commit b7a2957

Browse files
committed
BUG: Check that values for "nrows" and "chunksize" are valid
1 parent 1c9d46a commit b7a2957

File tree

3 files changed

+28
-12
lines changed

3 files changed

+28
-12
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -815,6 +815,7 @@ Bug Fixes
815815
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
816816
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
817817
- Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`)
818+
- Bug in ``pd.read_csv()`` in which invalid values for ``nrows`` and ``chunksize`` were allowed (:issue:`15767`)
818819
- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
819820
- Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`)
820821
- Bug in using ``__deepcopy__`` on empty NDFrame objects (:issue:`15370`)

pandas/io/parsers.py

+11-11
Original file line numberDiff line numberDiff line change
@@ -344,24 +344,24 @@
344344
""" % (_parser_params % (_fwf_widths, ''))
345345

346346

347-
def _validate_nrows(nrows):
347+
def _validate_integer(name, val, min_val=0):
348348
"""
349-
Checks whether the 'nrows' parameter for parsing is either
349+
Checks whether the 'name' parameter for parsing is either
350350
an integer OR float that can SAFELY be cast to an integer
351351
without losing accuracy. Raises a ValueError if that is
352352
not the case.
353353
"""
354-
msg = "'nrows' must be an integer"
354+
msg = "'%s' must be an integer >=%s" % (name, min_val)
355355

356-
if nrows is not None:
357-
if is_float(nrows):
358-
if int(nrows) != nrows:
356+
if val is not None:
357+
if is_float(val):
358+
if int(val) != val:
359359
raise ValueError(msg)
360-
nrows = int(nrows)
361-
elif not is_integer(nrows):
360+
val = int(val)
361+
elif not (is_integer(val) and val >= min_val):
362362
raise ValueError(msg)
363363

364-
return nrows
364+
return val
365365

366366

367367
def _read(filepath_or_buffer, kwds):
@@ -383,8 +383,8 @@ def _read(filepath_or_buffer, kwds):
383383

384384
# Extract some of the arguments (pass chunksize on).
385385
iterator = kwds.get('iterator', False)
386-
chunksize = kwds.get('chunksize', None)
387-
nrows = _validate_nrows(kwds.get('nrows', None))
386+
chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
387+
nrows = _validate_integer('nrows', kwds.get('nrows', None))
388388

389389
# Create the parser.
390390
parser = TextFileReader(filepath_or_buffer, **kwds)

pandas/tests/io/parser/common.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -384,14 +384,17 @@ def test_read_nrows(self):
384384
df = self.read_csv(StringIO(self.data1), nrows=3.0)
385385
tm.assert_frame_equal(df, expected)
386386

387-
msg = "must be an integer"
387+
msg = r"'nrows' must be an integer >=0"
388388

389389
with tm.assertRaisesRegexp(ValueError, msg):
390390
self.read_csv(StringIO(self.data1), nrows=1.2)
391391

392392
with tm.assertRaisesRegexp(ValueError, msg):
393393
self.read_csv(StringIO(self.data1), nrows='foo')
394394

395+
with tm.assertRaisesRegexp(ValueError, msg):
396+
self.read_csv(StringIO(self.data1), nrows=-1)
397+
395398
def test_read_chunksize(self):
396399
reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
397400
df = self.read_csv(StringIO(self.data1), index_col=0)
@@ -402,6 +405,18 @@ def test_read_chunksize(self):
402405
tm.assert_frame_equal(chunks[1], df[2:4])
403406
tm.assert_frame_equal(chunks[2], df[4:])
404407

408+
# with invalid chunksize value:
409+
msg = r"'chunksize' must be an integer >=1"
410+
411+
with tm.assertRaisesRegexp(ValueError, msg):
412+
self.read_csv(StringIO(self.data1), chunksize=1.3)
413+
414+
with tm.assertRaisesRegexp(ValueError, msg):
415+
self.read_csv(StringIO(self.data1), chunksize='foo')
416+
417+
with tm.assertRaisesRegexp(ValueError, msg):
418+
self.read_csv(StringIO(self.data1), chunksize=0)
419+
405420
def test_read_chunksize_and_nrows(self):
406421

407422
# gh-15755

0 commit comments

Comments
 (0)