diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index eeb568c2e2558..5ac7624856040 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -815,6 +815,7 @@ Bug Fixes
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
 - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
 - Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`)
+- Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`)
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 - Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`)
 - Bug in using ``__deepcopy__`` on empty NDFrame objects (:issue:`15370`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 18343670fb39e..a6564a643058d 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -344,24 +344,34 @@
 """ % (_parser_params % (_fwf_widths, ''))
 
 
-def _validate_nrows(nrows):
+def _validate_integer(name, val, min_val=0):
     """
-    Checks whether the 'nrows' parameter for parsing is either
+    Checks whether the 'name' parameter for parsing is either
     an integer OR float that can SAFELY be cast to an integer
     without losing accuracy. Raises a ValueError if that is
     not the case.
+
+    Parameters
+    ----------
+    name : string
+        Parameter name (used for error reporting)
+    val : int or float
+        The value to check
+    min_val : int
+        Minimum allowed value (val < min_val will result in a ValueError)
     """
-    msg = "'nrows' must be an integer"
+    msg = "'{name:s}' must be an integer >={min_val:d}".format(name=name,
+                                                               min_val=min_val)
 
-    if nrows is not None:
-        if is_float(nrows):
-            if int(nrows) != nrows:
+    if val is not None:
+        if is_float(val):
+            if int(val) != val:
                 raise ValueError(msg)
-            nrows = int(nrows)
-        elif not is_integer(nrows):
+            val = int(val)
+        if not (is_integer(val) and val >= min_val):
             raise ValueError(msg)
 
-    return nrows
+    return val
 
 
 def _read(filepath_or_buffer, kwds):
@@ -383,8 +393,8 @@ def _read(filepath_or_buffer, kwds):
 
     # Extract some of the arguments (pass chunksize on).
     iterator = kwds.get('iterator', False)
-    chunksize = kwds.get('chunksize', None)
-    nrows = _validate_nrows(kwds.get('nrows', None))
+    chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
+    nrows = _validate_integer('nrows', kwds.get('nrows', None))
 
     # Create the parser.
     parser = TextFileReader(filepath_or_buffer, **kwds)
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
index 24d15dcb96fe7..2c8bca490f274 100644
--- a/pandas/tests/io/parser/common.py
+++ b/pandas/tests/io/parser/common.py
@@ -384,7 +384,7 @@ def test_read_nrows(self):
         df = self.read_csv(StringIO(self.data1), nrows=3.0)
         tm.assert_frame_equal(df, expected)
 
-        msg = "must be an integer"
+        msg = r"'nrows' must be an integer >=0"
 
         with tm.assertRaisesRegexp(ValueError, msg):
             self.read_csv(StringIO(self.data1), nrows=1.2)
@@ -392,6 +392,9 @@ def test_read_nrows(self):
         with tm.assertRaisesRegexp(ValueError, msg):
             self.read_csv(StringIO(self.data1), nrows='foo')
 
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(self.data1), nrows=-1)
+
     def test_read_chunksize(self):
         reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
         df = self.read_csv(StringIO(self.data1), index_col=0)
@@ -402,6 +405,21 @@ def test_read_chunksize(self):
         tm.assert_frame_equal(chunks[1], df[2:4])
         tm.assert_frame_equal(chunks[2], df[4:])
 
+        # with invalid chunksize value:
+        msg = r"'chunksize' must be an integer >=1"
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(self.data1), chunksize=1.3)
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(self.data1), chunksize='foo')
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(self.data1), chunksize=0)
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(self.data1), chunksize=0.0)
+
     def test_read_chunksize_and_nrows(self):
 
         # gh-15755