Skip to content

Commit d856051

Browse files
committed
BUG: Properly validate and parse nrows in read_csv
Closes gh-10476.
1 parent e0a2e3b commit d856051

File tree

4 files changed

+46
-8
lines changed

4 files changed

+46
-8
lines changed

doc/source/whatsnew/v0.18.2.txt

+1
Original file line number | Diff line number | Diff line change
@@ -252,3 +252,4 @@ Bug Fixes
252252
- Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`)
253253

254254
- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
255+
- Bug in ``pd.read_csv`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`)

pandas/io/parsers.py

+22-2
Original file line number | Diff line number | Diff line change
@@ -272,6 +272,26 @@
272272
""" % (_parser_params % (_fwf_widths, ''))
273273

274274

275+
def _validate_nrows(nrows):
276+
"""
277+
Checks whether the 'nrows' parameter for parsing is either
278+
an integer OR float that can SAFELY be cast to an integer
279+
without losing accuracy. Raises a ValueError if that is
280+
not the case.
281+
"""
282+
msg = "'nrows' must be an integer"
283+
284+
if nrows is not None:
285+
if com.is_float(nrows):
286+
if int(nrows) != nrows:
287+
raise ValueError(msg)
288+
nrows = int(nrows)
289+
elif not com.is_integer(nrows):
290+
raise ValueError(msg)
291+
292+
return nrows
293+
294+
275295
def _read(filepath_or_buffer, kwds):
276296
"Generic reader of line files."
277297
encoding = kwds.get('encoding', None)
@@ -311,14 +331,14 @@ def _read(filepath_or_buffer, kwds):
311331

312332
# Extract some of the arguments (pass chunksize on).
313333
iterator = kwds.get('iterator', False)
314-
nrows = kwds.pop('nrows', None)
315334
chunksize = kwds.get('chunksize', None)
335+
nrows = _validate_nrows(kwds.pop('nrows', None))
316336

317337
# Create the parser.
318338
parser = TextFileReader(filepath_or_buffer, **kwds)
319339

320340
if (nrows is not None) and (chunksize is not None):
321-
raise NotImplementedError("'nrows' and 'chunksize' can not be used"
341+
raise NotImplementedError("'nrows' and 'chunksize' cannot be used"
322342
" together yet.")
323343
elif nrows is not None:
324344
return parser.read(nrows)

pandas/io/tests/parser/common.py

+14-6
Original file line number | Diff line number | Diff line change
@@ -391,10 +391,23 @@ def test_int_conversion(self):
391391
self.assertEqual(data['B'].dtype, np.int64)
392392

393393
def test_read_nrows(self):
394-
df = self.read_csv(StringIO(self.data1), nrows=3)
395394
expected = self.read_csv(StringIO(self.data1))[:3]
395+
396+
df = self.read_csv(StringIO(self.data1), nrows=3)
396397
tm.assert_frame_equal(df, expected)
397398

399+
# see gh-10476
400+
df = self.read_csv(StringIO(self.data1), nrows=3.0)
401+
tm.assert_frame_equal(df, expected)
402+
403+
msg = "must be an integer"
404+
405+
with tm.assertRaisesRegexp(ValueError, msg):
406+
self.read_csv(StringIO(self.data1), nrows=1.2)
407+
408+
with tm.assertRaisesRegexp(ValueError, msg):
409+
self.read_csv(StringIO(self.data1), nrows='foo')
410+
398411
def test_read_chunksize(self):
399412
reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
400413
df = self.read_csv(StringIO(self.data1), index_col=0)
@@ -815,11 +828,6 @@ def test_ignore_leading_whitespace(self):
815828
expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]})
816829
tm.assert_frame_equal(result, expected)
817830

818-
def test_nrows_and_chunksize_raises_notimplemented(self):
819-
data = 'a b c'
820-
self.assertRaises(NotImplementedError, self.read_csv, StringIO(data),
821-
nrows=10, chunksize=5)
822-
823831
def test_chunk_begins_with_newline_whitespace(self):
824832
# see gh-10022
825833
data = '\n hello\nworld\n'

pandas/io/tests/parser/test_unsupported.py

+9
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,15 @@ def test_mangle_dupe_cols_false(self):
3030
read_csv(StringIO(data), engine=engine,
3131
mangle_dupe_cols=False)
3232

33+
def test_nrows_and_chunksize(self):
    # Passing both 'nrows' and 'chunksize' must raise NotImplementedError
    # with the expected message, whichever parser engine is selected.
    expected_msg = "cannot be used together yet"
    raw = 'a b c'

    for parser_engine in ('c', 'python'):
        read_kwargs = dict(engine=parser_engine, nrows=10, chunksize=5)

        with tm.assertRaisesRegexp(NotImplementedError, expected_msg):
            read_csv(StringIO(raw), **read_kwargs)
41+
3342
def test_c_engine(self):
3443
# see gh-6607
3544
data = 'a b c\n1 2 3'

0 commit comments

Comments
 (0)