diff --git a/doc/source/io.rst b/doc/source/io.rst index 6cf41bbc50fb5..4eb42e1fb918d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -176,6 +176,17 @@ low_memory : boolean, default ``True`` Note that the entire file is read into a single DataFrame regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in chunks. (Only valid with C parser) +compact_ints : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the + parser will attempt to cast it as the smallest integer ``dtype`` possible, either + signed or unsigned depending on the specification from the ``use_unsigned`` parameter. +use_unsigned : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether + the column should be compacted to the smallest signed or unsigned integer dtype. NA and Missing Data Handling ++++++++++++++++++++++++++++ diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 27540a9626398..03fe8bdc39611 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -292,6 +292,7 @@ Other API changes Deprecations ^^^^^^^^^^^^ +- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv`` and will be removed in a future version (:issue:`13320`) .. _whatsnew_0182.performance: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bba8ad3ccd72b..2c8726f588522 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -227,6 +227,20 @@ Note that the entire file is read into a single DataFrame regardless, use the `chunksize` or `iterator` parameter to return the data in chunks. (Only valid with C parser) +compact_ints : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If compact_ints is True, then for any column that is of integer dtype, + the parser will attempt to cast it as the smallest integer dtype possible, + either signed or unsigned depending on the specification from the + `use_unsigned` parameter. + +use_unsigned : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If integer columns are being compacted (i.e. `compact_ints=True`), specify + whether the column should be compacted to the smallest signed or unsigned + integer dtype. Returns ------- @@ -425,8 +439,6 @@ def _read(filepath_or_buffer, kwds): _c_unsupported = set(['skip_footer']) _python_unsupported = set([ 'as_recarray', - 'compact_ints', - 'use_unsigned', 'low_memory', 'memory_map', 'buffer_lines', @@ -435,6 +447,10 @@ def _read(filepath_or_buffer, kwds): 'dtype', 'float_precision', ]) +_deprecated_args = set([ + 'compact_ints', + 'use_unsigned', +]) def _make_parser_function(name, sep=','): @@ -789,6 +805,12 @@ def _clean_options(self, options, engine): _validate_header_arg(options['header']) + for arg in _deprecated_args: + if result[arg] != _c_parser_defaults[arg]: + warnings.warn("The '{arg}' argument has been deprecated " + "and will be removed in a future version" + .format(arg=arg), FutureWarning, stacklevel=2) + if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") if _is_index_col(index_col): @@ -1206,6 +1228,12 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, cvals, na_count = self._convert_types( values, set(col_na_values) | col_na_fvalues, coerce_type) + + if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: + cvals = lib.downcast_int64( + cvals, _parser.na_values, + self.use_unsigned) + result[c] = cvals if verbose and na_count: print('Filled %d NA values in column %s' % (na_count, str(c))) @@ -1648,8 +1676,11 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] + self.compact_ints = kwds['compact_ints'] + self.use_unsigned = kwds['use_unsigned'] self.thousands = kwds['thousands'] self.decimal = kwds['decimal'] + self.comment = kwds['comment'] self._comment_lines = [] diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 7fca37cef473e..b7ef754004e18 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -172,28 +172,8 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_compact_ints(self): - if compat.is_platform_windows() and not self.low_memory: - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - def test_compact_ints_as_recarray(self): - if compat.is_platform_windows() and self.low_memory: + if compat.is_platform_windows(): raise nose.SkipTest( "segfaults on win-64, only when all tests are run") @@ -201,16 +181,20 @@ def test_compact_ints_as_recarray(self): '1,1,0,0\n' '0,1,0,1') - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) def test_pass_dtype(self): data = """\ diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 44892dc17c47b..f8c7241fdf88a 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1330,3 +1330,46 @@ def test_raise_on_no_columns(self): # test with more than a single newline data = "\n\n\n" self.assertRaises(EmptyDataError, self.read_csv, StringIO(data)) + + def test_compact_ints_use_unsigned(self): + # see gh-13323 + data = 'a,b,c\n1,9,258' + + # sanity check + expected = DataFrame({ + 'a': np.array([1], dtype=np.int64), + 'b': np.array([9], dtype=np.int64), + 'c': np.array([258], dtype=np.int64), + }) + out = self.read_csv(StringIO(data)) + tm.assert_frame_equal(out, expected) + + expected = DataFrame({ + 'a': np.array([1], dtype=np.int8), + 'b': np.array([9], dtype=np.int8), + 'c': np.array([258], dtype=np.int16), + }) + + # default behaviour for 'use_unsigned' + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True) + tm.assert_frame_equal(out, expected) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True, + use_unsigned=False) + tm.assert_frame_equal(out, expected) + + expected = DataFrame({ + 'a': np.array([1], dtype=np.uint8), + 'b': np.array([9], dtype=np.uint8), + 'c': np.array([258], dtype=np.uint16), + }) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True, + use_unsigned=True) + tm.assert_frame_equal(out, expected) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 3c1c45831e7b4..e820924d2be8b 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -117,6 +117,27 @@ def test_python_engine(self): with tm.assertRaisesRegexp(ValueError, msg): read_csv(StringIO(data), engine=engine, **kwargs) + +class TestDeprecatedFeatures(tm.TestCase): + def test_deprecated_args(self): + data = '1,2,3' + + # deprecated arguments with non-default values + deprecated = { + 'compact_ints': True, + 'use_unsigned': True, + } + + engines = 'c', 'python' + + for engine in engines: + for arg, non_default_val in deprecated.items(): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + kwargs = {arg: non_default_val} + read_csv(StringIO(data), engine=engine, + **kwargs) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 729e5af528b80..d7ddaee658fe7 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1018,7 +1018,7 @@ cdef class TextReader: col_res = _maybe_upcast(col_res) if issubclass(col_res.dtype.type, np.integer) and self.compact_ints: - col_res = downcast_int64(col_res, self.use_unsigned) + col_res = lib.downcast_int64(col_res, na_values, self.use_unsigned) if col_res is None: raise CParserError('Unable to parse column %d' % i) @@ -1866,76 +1866,6 @@ cdef raise_parser_error(object base, parser_t *parser): raise CParserError(message) -def downcast_int64(ndarray[int64_t] arr, bint use_unsigned=0): - cdef: - Py_ssize_t i, n = len(arr) - int64_t mx = INT64_MIN + 1, mn = INT64_MAX - int64_t NA = na_values[np.int64] - int64_t val - ndarray[uint8_t] mask - int na_count = 0 - - _mask = np.empty(n, dtype=bool) - mask = _mask.view(np.uint8) - - for i in range(n): - val = arr[i] - - if val == NA: - mask[i] = 1 - na_count += 1 - continue - - # not NA - mask[i] = 0 - - if val > mx: - mx = val - - if val < mn: - mn = val - - if mn >= 0 and use_unsigned: - if mx <= UINT8_MAX - 1: - result = arr.astype(np.uint8) - if na_count: - np.putmask(result, _mask, na_values[np.uint8]) - return result - - if mx <= UINT16_MAX - 1: - result = arr.astype(np.uint16) - if na_count: - np.putmask(result, _mask, na_values[np.uint16]) - return result - - if mx <= UINT32_MAX - 1: - result = arr.astype(np.uint32) - if na_count: - np.putmask(result, _mask, na_values[np.uint32]) - return result - - else: - if mn >= INT8_MIN + 1 and mx <= INT8_MAX: - result = arr.astype(np.int8) - if na_count: - np.putmask(result, _mask, na_values[np.int8]) - return result - - if mn >= INT16_MIN + 1 and mx <= INT16_MAX: - result = arr.astype(np.int16) - if na_count: - np.putmask(result, _mask, na_values[np.int16]) - return result - - if mn >= INT32_MIN + 1 and mx <= INT32_MAX: - result = arr.astype(np.int32) - if na_count: - np.putmask(result, _mask, na_values[np.int32]) - return result - - return arr - - def _concatenate_chunks(list chunks): cdef: list names = list(chunks[0].keys()) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 5f7c5478b5d87..262e036ff44f1 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -6,6 +6,20 @@ iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 +cdef extern from "headers/stdint.h": + enum: UINT8_MAX + enum: UINT16_MAX + enum: UINT32_MAX + enum: UINT64_MAX + enum: INT8_MIN + enum: INT8_MAX + enum: INT16_MIN + enum: INT16_MAX + enum: INT32_MAX + enum: INT32_MIN + enum: INT64_MAX + enum: INT64_MIN + # core.common import for fast inference checks def is_float(object obj): return util.is_float_object(obj) @@ -1240,3 +1254,74 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): output[i] = default return maybe_convert_objects(output) + + +def downcast_int64(ndarray[int64_t] arr, object na_values, + bint use_unsigned=0): + cdef: + Py_ssize_t i, n = len(arr) + int64_t mx = INT64_MIN + 1, mn = INT64_MAX + int64_t NA = na_values[np.int64] + int64_t val + ndarray[uint8_t] mask + int na_count = 0 + + _mask = np.empty(n, dtype=bool) + mask = _mask.view(np.uint8) + + for i in range(n): + val = arr[i] + + if val == NA: + mask[i] = 1 + na_count += 1 + continue + + # not NA + mask[i] = 0 + + if val > mx: + mx = val + + if val < mn: + mn = val + + if mn >= 0 and use_unsigned: + if mx <= UINT8_MAX - 1: + result = arr.astype(np.uint8) + if na_count: + np.putmask(result, _mask, na_values[np.uint8]) + return result + + if mx <= UINT16_MAX - 1: + result = arr.astype(np.uint16) + if na_count: + np.putmask(result, _mask, na_values[np.uint16]) + return result + + if mx <= UINT32_MAX - 1: + result = arr.astype(np.uint32) + if na_count: + np.putmask(result, _mask, na_values[np.uint32]) + return result + + else: + if mn >= INT8_MIN + 1 and mx <= INT8_MAX: + result = arr.astype(np.int8) + if na_count: + np.putmask(result, _mask, na_values[np.int8]) + return result + + if mn >= INT16_MIN + 1 and mx <= INT16_MAX: + result = arr.astype(np.int16) + if na_count: + np.putmask(result, _mask, na_values[np.int16]) + return result + + if mn >= INT32_MIN + 1 and mx <= INT32_MAX: + result = arr.astype(np.int32) + if na_count: + np.putmask(result, _mask, na_values[np.int32]) + return result + + return arr diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py index 68eac12e5ec4c..a6941369b35be 100644 --- a/pandas/tests/test_infer_and_convert.py +++ b/pandas/tests/test_infer_and_convert.py @@ -401,6 +401,42 @@ def test_convert_sql_column_decimals(self): expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') self.assert_numpy_array_equal(result, expected) + def test_convert_downcast_int64(self): + from pandas.parser import na_values + + arr = np.array([1, 2, 7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 10], dtype=np.int8) + + # default argument + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + result = lib.downcast_int64(arr, na_values, use_unsigned=False) + self.assert_numpy_array_equal(result, expected) + + expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + # still cast to int8 despite use_unsigned=True + # because of the negative number as an element + arr = np.array([1, 2, -7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, -7, 8, 10], dtype=np.int8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + arr = np.array([1, 2, 7, 8, 300], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 300], dtype=np.int16) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + int8_na = na_values[np.int8] + int64_na = na_values[np.int64] + arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64) + expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + if __name__ == '__main__': import nose