diff --git a/doc/source/io.rst b/doc/source/io.rst
index d51307081b17f..2584941ac14d2 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -199,21 +199,6 @@ low_memory : boolean, default ``True``
     Note that the entire file is read into a single DataFrame regardless,
     use the ``chunksize`` or ``iterator`` parameter to return the data in chunks.
     (Only valid with C parser)
-compact_ints : boolean, default False
-    .. deprecated:: 0.19.0
-
-       Argument moved to ``pd.to_numeric``
-
-    If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the
-    parser will attempt to cast it as the smallest integer ``dtype`` possible, either
-    signed or unsigned depending on the specification from the ``use_unsigned`` parameter.
-use_unsigned : boolean, default False
-    .. deprecated:: 0.18.2
-
-       Argument moved to ``pd.to_numeric``
-
-    If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether
-    the column should be compacted to the smallest signed or unsigned integer dtype.
 memory_map : boolean, default False
     If a filepath is provided for ``filepath_or_buffer``, map the file object
     directly onto memory and access the data directly from there. Using this
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index 0579a80aad28e..0e1577c1d9e29 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -233,6 +233,7 @@ Removal of prior version deprecations/changes
 - :func:`read_csv` has dropped the ``skip_footer`` parameter (:issue:`13386`)
 - :func:`read_csv` has dropped the ``as_recarray`` parameter (:issue:`13373`)
 - :func:`read_csv` has dropped the ``buffer_lines`` parameter (:issue:`13360`)
+- :func:`read_csv` has dropped the ``compact_ints`` and ``use_unsigned`` parameters (:issue:`13323`)
 
 .. _whatsnew_0220.performance:
 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index f01068ae2e538..1f7c359b519a5 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -305,7 +305,6 @@ cdef class TextReader:
         object index_col
         object low_memory
         object skiprows
-        object compact_ints, use_unsigned
         object dtype
         object encoding
         object compression
@@ -355,10 +354,7 @@ cdef class TextReader:
                   na_fvalues=None,
                   true_values=None,
                   false_values=None,
-
-                  compact_ints=False,
                   allow_leading_cols=True,
-                  use_unsigned=False,
                   low_memory=False,
                   skiprows=None,
                   skipfooter=0,
@@ -482,10 +478,7 @@ cdef class TextReader:
         self.false_set = kset_from_list(self.false_values)
 
         self.converters = converters
-
         self.na_filter = na_filter
-        self.compact_ints = compact_ints
-        self.use_unsigned = use_unsigned
 
         self.verbose = verbose
         self.low_memory = low_memory
@@ -1122,11 +1115,6 @@ cdef class TextReader:
             if upcast_na and na_count > 0:
                 col_res = _maybe_upcast(col_res)
 
-            if issubclass(col_res.dtype.type,
-                          np.integer) and self.compact_ints:
-                col_res = lib.downcast_int64(col_res, na_values,
-                                             self.use_unsigned)
-
         if col_res is None:
             raise ParserError('Unable to parse column %d' % i)
 
diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx
index 8bfed4fe60fed..5ed8828a0f122 100644
--- a/pandas/_libs/src/inference.pyx
+++ b/pandas/_libs/src/inference.pyx
@@ -1657,74 +1657,3 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan):
             output[i] = default
 
     return maybe_convert_objects(output)
-
-
-def downcast_int64(ndarray[int64_t] arr, object na_values,
-                   bint use_unsigned=0):
-    cdef:
-        Py_ssize_t i, n = len(arr)
-        int64_t mx = INT64_MIN + 1, mn = INT64_MAX
-        int64_t NA = na_values[np.int64]
-        int64_t val
-        ndarray[uint8_t] mask
-        int na_count = 0
-
-    _mask = np.empty(n, dtype=bool)
-    mask = _mask.view(np.uint8)
-
-    for i in range(n):
-        val = arr[i]
-
-        if val == NA:
-            mask[i] = 1
-            na_count += 1
-            continue
-
-        # not NA
-        mask[i] = 0
-
-        if val > mx:
-            mx = val
-
-        if val < mn:
-            mn = val
-
-    if mn >= 0 and use_unsigned:
-        if mx <= UINT8_MAX - 1:
-            result = arr.astype(np.uint8)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint8])
-            return result
-
-        if mx <= UINT16_MAX - 1:
-            result = arr.astype(np.uint16)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint16])
-            return result
-
-        if mx <= UINT32_MAX - 1:
-            result = arr.astype(np.uint32)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint32])
-            return result
-
-    else:
-        if mn >= INT8_MIN + 1 and mx <= INT8_MAX:
-            result = arr.astype(np.int8)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int8])
-            return result
-
-        if mn >= INT16_MIN + 1 and mx <= INT16_MAX:
-            result = arr.astype(np.int16)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int16])
-            return result
-
-        if mn >= INT32_MIN + 1 and mx <= INT32_MAX:
-            result = arr.astype(np.int32)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int32])
-            return result
-
-    return arr
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 3d07b0e6cbdfd..92f58db775423 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -273,21 +273,6 @@
     Note that the entire file is read into a single DataFrame regardless,
     use the `chunksize` or `iterator` parameter to return the data in chunks.
     (Only valid with C parser)
-compact_ints : boolean, default False
-    .. deprecated:: 0.19.0
-       Argument moved to ``pd.to_numeric``
-
-    If compact_ints is True, then for any column that is of integer dtype,
-    the parser will attempt to cast it as the smallest integer dtype possible,
-    either signed or unsigned depending on the specification from the
-    `use_unsigned` parameter.
-use_unsigned : boolean, default False
-    .. deprecated:: 0.19.0
-       Argument moved to ``pd.to_numeric``
-
-    If integer columns are being compacted (i.e. `compact_ints=True`), specify
-    whether the column should be compacted to the smallest signed or unsigned
-    integer dtype.
 memory_map : boolean, default False
     If a filepath is provided for `filepath_or_buffer`, map the file object
     directly onto memory and access the data directly from there. Using this
@@ -496,8 +481,6 @@ def _read(filepath_or_buffer, kwds):
 _c_parser_defaults = {
     'delim_whitespace': False,
     'na_filter': True,
-    'compact_ints': False,
-    'use_unsigned': False,
     'low_memory': True,
     'memory_map': False,
     'error_bad_lines': True,
@@ -518,13 +501,9 @@ def _read(filepath_or_buffer, kwds):
 }
 
 _deprecated_defaults = {
-    'compact_ints': None,
-    'use_unsigned': None,
     'tupleize_cols': None
 }
 _deprecated_args = {
-    'compact_ints',
-    'use_unsigned',
     'tupleize_cols',
 }
 
@@ -596,8 +575,6 @@ def parser_f(filepath_or_buffer,
                  # Internal
                  doublequote=True,
                  delim_whitespace=False,
-                 compact_ints=None,
-                 use_unsigned=None,
                  low_memory=_c_parser_defaults['low_memory'],
                  memory_map=False,
                  float_precision=None):
@@ -662,8 +639,6 @@ def parser_f(filepath_or_buffer,
         float_precision=float_precision,
 
         na_filter=na_filter,
-        compact_ints=compact_ints,
-        use_unsigned=use_unsigned,
         delim_whitespace=delim_whitespace,
         warn_bad_lines=warn_bad_lines,
         error_bad_lines=error_bad_lines,
@@ -1569,11 +1544,6 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
             if cast_type and not is_dtype_equal(cvals, cast_type):
                 cvals = self._cast_types(cvals, cast_type, c)
 
-            if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
-                cvals = lib.downcast_int64(
-                    cvals, parsers.na_values,
-                    self.use_unsigned)
-
             result[c] = cvals
             if verbose and na_count:
                 print('Filled %d NA values in column %s' % (na_count, str(c)))
@@ -2064,8 +2034,6 @@ def __init__(self, f, **kwds):
         self.converters = kwds['converters']
         self.dtype = kwds['dtype']
 
-        self.compact_ints = kwds['compact_ints']
-        self.use_unsigned = kwds['use_unsigned']
         self.thousands = kwds['thousands']
         self.decimal = kwds['decimal']
 
diff --git a/pandas/tests/dtypes/test_io.py b/pandas/tests/dtypes/test_io.py
index ae92e9ecca681..06b61371c9a0b 100644
--- a/pandas/tests/dtypes/test_io.py
+++ b/pandas/tests/dtypes/test_io.py
@@ -71,39 +71,3 @@ def test_convert_sql_column_decimals(self):
         result = lib.convert_sql_column(arr)
         expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
         tm.assert_numpy_array_equal(result, expected)
-
-    def test_convert_downcast_int64(self):
-        from pandas._libs.parsers import na_values
-
-        arr = np.array([1, 2, 7, 8, 10], dtype=np.int64)
-        expected = np.array([1, 2, 7, 8, 10], dtype=np.int8)
-
-        # default argument
-        result = lib.downcast_int64(arr, na_values)
-        tm.assert_numpy_array_equal(result, expected)
-
-        result = lib.downcast_int64(arr, na_values, use_unsigned=False)
-        tm.assert_numpy_array_equal(result, expected)
-
-        expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8)
-        result = lib.downcast_int64(arr, na_values, use_unsigned=True)
-        tm.assert_numpy_array_equal(result, expected)
-
-        # still cast to int8 despite use_unsigned=True
-        # because of the negative number as an element
-        arr = np.array([1, 2, -7, 8, 10], dtype=np.int64)
-        expected = np.array([1, 2, -7, 8, 10], dtype=np.int8)
-        result = lib.downcast_int64(arr, na_values, use_unsigned=True)
-        tm.assert_numpy_array_equal(result, expected)
-
-        arr = np.array([1, 2, 7, 8, 300], dtype=np.int64)
-        expected = np.array([1, 2, 7, 8, 300], dtype=np.int16)
-        result = lib.downcast_int64(arr, na_values)
-        tm.assert_numpy_array_equal(result, expected)
-
-        int8_na = na_values[np.int8]
-        int64_na = na_values[np.int64]
-        arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64)
-        expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8)
-        result = lib.downcast_int64(arr, na_values)
-        tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
index 8a1f23d203a32..8525cb42c2455 100644
--- a/pandas/tests/io/parser/common.py
+++ b/pandas/tests/io/parser/common.py
@@ -1371,49 +1371,6 @@ def test_raise_on_no_columns(self):
         data = "\n\n\n"
         pytest.raises(EmptyDataError, self.read_csv, StringIO(data))
 
-    def test_compact_ints_use_unsigned(self):
-        # see gh-13323
-        data = 'a,b,c\n1,9,258'
-
-        # sanity check
-        expected = DataFrame({
-            'a': np.array([1], dtype=np.int64),
-            'b': np.array([9], dtype=np.int64),
-            'c': np.array([258], dtype=np.int64),
-        })
-        out = self.read_csv(StringIO(data))
-        tm.assert_frame_equal(out, expected)
-
-        expected = DataFrame({
-            'a': np.array([1], dtype=np.int8),
-            'b': np.array([9], dtype=np.int8),
-            'c': np.array([258], dtype=np.int16),
-        })
-
-        # default behaviour for 'use_unsigned'
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            out = self.read_csv(StringIO(data), compact_ints=True)
-        tm.assert_frame_equal(out, expected)
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            out = self.read_csv(StringIO(data), compact_ints=True,
-                                use_unsigned=False)
-        tm.assert_frame_equal(out, expected)
-
-        expected = DataFrame({
-            'a': np.array([1], dtype=np.uint8),
-            'b': np.array([9], dtype=np.uint8),
-            'c': np.array([258], dtype=np.uint16),
-        })
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            out = self.read_csv(StringIO(data), compact_ints=True,
-                                use_unsigned=True)
-        tm.assert_frame_equal(out, expected)
-
     def test_memory_map(self):
         mmap_file = os.path.join(self.dirpath, 'test_mmap.csv')
         expected = DataFrame({
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
index 30dcc3e5731aa..3117f6fae55da 100644
--- a/pandas/tests/io/parser/test_unsupported.py
+++ b/pandas/tests/io/parser/test_unsupported.py
@@ -128,20 +128,12 @@ def read(self):
 class TestDeprecatedFeatures(object):
 
     @pytest.mark.parametrize("engine", ["c", "python"])
-    @pytest.mark.parametrize("kwargs", [{"compact_ints": True},
-                                        {"compact_ints": False},
-                                        {"use_unsigned": True},
-                                        {"use_unsigned": False},
-                                        {"tupleize_cols": True},
+    @pytest.mark.parametrize("kwargs", [{"tupleize_cols": True},
                                         {"tupleize_cols": False}])
     def test_deprecated_args(self, engine, kwargs):
         data = "1,2,3"
         arg, _ = list(kwargs.items())[0]
 
-        if engine == "python" and arg == "buffer_lines":
-            # unsupported --> exception is raised
-            return
-
         with tm.assert_produces_warning(
                 FutureWarning, check_stacklevel=False):
             read_csv(StringIO(data), engine=engine, **kwargs)
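
Migration sketch (not part of the patch): both deprecation notices pointed users to ``pd.to_numeric``, whose ``downcast`` argument has covered this use case since 0.19.0. The snippet below, which reuses the fixture and expected dtypes from the deleted ``test_compact_ints_use_unsigned``, shows one way to reproduce the old behaviour after the removal; applying ``to_numeric`` per column via ``DataFrame.apply`` is an illustrative choice, not the only one.

    import pandas as pd
    from pandas.compat import StringIO

    # Same fixture as the deleted test: 258 overflows int8/uint8.
    data = 'a,b,c\n1,9,258'
    df = pd.read_csv(StringIO(data))  # all columns parsed as int64

    # Old compact_ints=True (signed): smallest signed integer dtype per column.
    signed = df.apply(lambda col: pd.to_numeric(col, downcast='integer'))
    print(signed.dtypes)    # a: int8, b: int8, c: int16

    # Old compact_ints=True, use_unsigned=True: prefer unsigned dtypes.
    unsigned = df.apply(lambda col: pd.to_numeric(col, downcast='unsigned'))
    print(unsigned.dtypes)  # a: uint8, b: uint8, c: uint16

One behavioural difference to note: the removed ``downcast_int64`` fell back to signed compaction when ``use_unsigned=True`` encountered negative values, whereas ``to_numeric(..., downcast='unsigned')`` simply leaves a column containing negatives un-downcast.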