
Commit 0112267

gfyoung authored and jreback committed
CLN: Drop compact_ints/use_unsigned from read_csv (#18851)
* CLN: Drop compact_ints/use_unsigned from read_csv

  Deprecated in v0.19.0, xref gh-13323

* CLN: Remove downcast_int64 from inference.pyx

  It was only being used for the compact_ints and use_unsigned parameters in read_csv.
1 parent 8a5e085 commit 0112267
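
The removed behaviour is now expressed through ``pd.to_numeric``. A minimal sketch of the replacement workflow, assuming pandas >= 0.22.0; the sample data and expected dtypes mirror the test removed from pandas/tests/io/parser/common.py below:

    from io import StringIO

    import pandas as pd

    data = 'a,b,c\n1,9,258'

    # Previously (deprecated since 0.19.0):
    #     pd.read_csv(StringIO(data), compact_ints=True, use_unsigned=True)

    # Now: read normally, then downcast column by column with pd.to_numeric.
    df = pd.read_csv(StringIO(data))
    signed = df.apply(pd.to_numeric, downcast='integer')     # expect int8, int8, int16
    unsigned = df.apply(pd.to_numeric, downcast='unsigned')  # expect uint8, uint8, uint16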

File tree

8 files changed: +2 -218 lines


doc/source/io.rst

-15
@@ -199,21 +199,6 @@ low_memory : boolean, default ``True``
     Note that the entire file is read into a single DataFrame regardless,
     use the ``chunksize`` or ``iterator`` parameter to return the data in chunks.
     (Only valid with C parser)
-compact_ints : boolean, default False
-    .. deprecated:: 0.19.0
-
-       Argument moved to ``pd.to_numeric``
-
-    If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the
-    parser will attempt to cast it as the smallest integer ``dtype`` possible, either
-    signed or unsigned depending on the specification from the ``use_unsigned`` parameter.
-use_unsigned : boolean, default False
-    .. deprecated:: 0.18.2
-
-       Argument moved to ``pd.to_numeric``
-
-    If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether
-    the column should be compacted to the smallest signed or unsigned integer dtype.
 memory_map : boolean, default False
     If a filepath is provided for ``filepath_or_buffer``, map the file object
     directly onto memory and access the data directly from there. Using this

doc/source/whatsnew/v0.22.0.txt

+1
@@ -233,6 +233,7 @@ Removal of prior version deprecations/changes
 - :func:`read_csv` has dropped the ``skip_footer`` parameter (:issue:`13386`)
 - :func:`read_csv` has dropped the ``as_recarray`` parameter (:issue:`13373`)
 - :func:`read_csv` has dropped the ``buffer_lines`` parameter (:issue:`13360`)
+- :func:`read_csv` has dropped the ``compact_ints`` and ``use_unsigned`` parameters (:issue:`13323`)

 .. _whatsnew_0220.performance:
pandas/_libs/parsers.pyx

-12
@@ -305,7 +305,6 @@ cdef class TextReader:
         object index_col
         object low_memory
         object skiprows
-        object compact_ints, use_unsigned
         object dtype
         object encoding
         object compression
@@ -355,10 +354,7 @@ cdef class TextReader:
                   na_fvalues=None,
                   true_values=None,
                   false_values=None,
-
-                  compact_ints=False,
                   allow_leading_cols=True,
-                  use_unsigned=False,
                   low_memory=False,
                   skiprows=None,
                   skipfooter=0,
@@ -482,10 +478,7 @@ cdef class TextReader:
         self.false_set = kset_from_list(self.false_values)

         self.converters = converters
-
         self.na_filter = na_filter
-        self.compact_ints = compact_ints
-        self.use_unsigned = use_unsigned

         self.verbose = verbose
         self.low_memory = low_memory
@@ -1122,11 +1115,6 @@ cdef class TextReader:
         if upcast_na and na_count > 0:
             col_res = _maybe_upcast(col_res)

-        if issubclass(col_res.dtype.type,
-                      np.integer) and self.compact_ints:
-            col_res = lib.downcast_int64(col_res, na_values,
-                                         self.use_unsigned)
-
         if col_res is None:
             raise ParserError('Unable to parse column %d' % i)

pandas/_libs/src/inference.pyx

-71
@@ -1657,74 +1657,3 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan):
             output[i] = default

     return maybe_convert_objects(output)
-
-
-def downcast_int64(ndarray[int64_t] arr, object na_values,
-                   bint use_unsigned=0):
-    cdef:
-        Py_ssize_t i, n = len(arr)
-        int64_t mx = INT64_MIN + 1, mn = INT64_MAX
-        int64_t NA = na_values[np.int64]
-        int64_t val
-        ndarray[uint8_t] mask
-        int na_count = 0
-
-    _mask = np.empty(n, dtype=bool)
-    mask = _mask.view(np.uint8)
-
-    for i in range(n):
-        val = arr[i]
-
-        if val == NA:
-            mask[i] = 1
-            na_count += 1
-            continue
-
-        # not NA
-        mask[i] = 0
-
-        if val > mx:
-            mx = val
-
-        if val < mn:
-            mn = val
-
-    if mn >= 0 and use_unsigned:
-        if mx <= UINT8_MAX - 1:
-            result = arr.astype(np.uint8)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint8])
-            return result
-
-        if mx <= UINT16_MAX - 1:
-            result = arr.astype(np.uint16)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint16])
-            return result
-
-        if mx <= UINT32_MAX - 1:
-            result = arr.astype(np.uint32)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint32])
-            return result
-
-    else:
-        if mn >= INT8_MIN + 1 and mx <= INT8_MAX:
-            result = arr.astype(np.int8)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int8])
-            return result
-
-        if mn >= INT16_MIN + 1 and mx <= INT16_MAX:
-            result = arr.astype(np.int16)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int16])
-            return result
-
-        if mn >= INT32_MIN + 1 and mx <= INT32_MAX:
-            result = arr.astype(np.int32)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int32])
-            return result
-
-    return arr
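
For reference, the removed ``downcast_int64`` boiled down to a min/max scan followed by an ``astype`` to the narrowest dtype that fits, reserving one boundary value as an NA sentinel. A rough pure-NumPy sketch of the same idea (illustrative only, not the removed Cython code, and without the NA-sentinel masking):

    import numpy as np

    def downcast_int64_sketch(arr, use_unsigned=False):
        # Simplification: assumes arr holds no NA sentinel values.
        mn, mx = arr.min(), arr.max()
        if mn >= 0 and use_unsigned:
            candidates = (np.uint8, np.uint16, np.uint32)
        else:
            candidates = (np.int8, np.int16, np.int32)
        for dtype in candidates:
            info = np.iinfo(dtype)
            if info.min <= mn and mx <= info.max:
                return arr.astype(dtype)
        return arr  # nothing narrower than 64 bits fits

    downcast_int64_sketch(np.array([1, 2, 7, 8, 300], dtype=np.int64)).dtype  # int16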

pandas/io/parsers.py

-32
@@ -273,21 +273,6 @@
     Note that the entire file is read into a single DataFrame regardless,
     use the `chunksize` or `iterator` parameter to return the data in chunks.
     (Only valid with C parser)
-compact_ints : boolean, default False
-    .. deprecated:: 0.19.0
-        Argument moved to ``pd.to_numeric``
-
-    If compact_ints is True, then for any column that is of integer dtype,
-    the parser will attempt to cast it as the smallest integer dtype possible,
-    either signed or unsigned depending on the specification from the
-    `use_unsigned` parameter.
-use_unsigned : boolean, default False
-    .. deprecated:: 0.19.0
-        Argument moved to ``pd.to_numeric``
-
-    If integer columns are being compacted (i.e. `compact_ints=True`), specify
-    whether the column should be compacted to the smallest signed or unsigned
-    integer dtype.
 memory_map : boolean, default False
     If a filepath is provided for `filepath_or_buffer`, map the file object
     directly onto memory and access the data directly from there. Using this
@@ -496,8 +481,6 @@ def _read(filepath_or_buffer, kwds):
 _c_parser_defaults = {
     'delim_whitespace': False,
     'na_filter': True,
-    'compact_ints': False,
-    'use_unsigned': False,
     'low_memory': True,
     'memory_map': False,
     'error_bad_lines': True,
@@ -518,13 +501,9 @@ def _read(filepath_or_buffer, kwds):
 }

 _deprecated_defaults = {
-    'compact_ints': None,
-    'use_unsigned': None,
     'tupleize_cols': None
 }
 _deprecated_args = {
-    'compact_ints',
-    'use_unsigned',
     'tupleize_cols',
 }

@@ -596,8 +575,6 @@ def parser_f(filepath_or_buffer,
              # Internal
              doublequote=True,
              delim_whitespace=False,
-             compact_ints=None,
-             use_unsigned=None,
              low_memory=_c_parser_defaults['low_memory'],
              memory_map=False,
              float_precision=None):
@@ -662,8 +639,6 @@ def parser_f(filepath_or_buffer,
         float_precision=float_precision,

         na_filter=na_filter,
-        compact_ints=compact_ints,
-        use_unsigned=use_unsigned,
         delim_whitespace=delim_whitespace,
         warn_bad_lines=warn_bad_lines,
         error_bad_lines=error_bad_lines,
@@ -1569,11 +1544,6 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
         if cast_type and not is_dtype_equal(cvals, cast_type):
             cvals = self._cast_types(cvals, cast_type, c)

-        if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
-            cvals = lib.downcast_int64(
-                cvals, parsers.na_values,
-                self.use_unsigned)
-
         result[c] = cvals
         if verbose and na_count:
             print('Filled %d NA values in column %s' % (na_count, str(c)))
@@ -2064,8 +2034,6 @@ def __init__(self, f, **kwds):
         self.converters = kwds['converters']
         self.dtype = kwds['dtype']

-        self.compact_ints = kwds['compact_ints']
-        self.use_unsigned = kwds['use_unsigned']
         self.thousands = kwds['thousands']
         self.decimal = kwds['decimal']
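
For context, ``_deprecated_args`` and ``_deprecated_defaults`` feed the warning machinery in ``parser_f``: any listed argument whose value differs from its sentinel default triggers a ``FutureWarning``. A simplified sketch of that check (not the verbatim pandas code; the helper name and message are illustrative):

    import warnings

    _deprecated_defaults = {'tupleize_cols': None}
    _deprecated_args = {'tupleize_cols'}

    def _warn_deprecated(kwds):
        # kwds: keyword arguments collected by parser_f
        for arg in _deprecated_args:
            if kwds.get(arg, _deprecated_defaults[arg]) != _deprecated_defaults[arg]:
                warnings.warn("the '%s' keyword is deprecated" % arg,
                              FutureWarning, stacklevel=3)

With ``compact_ints`` and ``use_unsigned`` dropped from both mappings, they no longer reach this path at all.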

pandas/tests/dtypes/test_io.py

-36
@@ -71,39 +71,3 @@ def test_convert_sql_column_decimals(self):
         result = lib.convert_sql_column(arr)
         expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
         tm.assert_numpy_array_equal(result, expected)
-
-    def test_convert_downcast_int64(self):
-        from pandas._libs.parsers import na_values
-
-        arr = np.array([1, 2, 7, 8, 10], dtype=np.int64)
-        expected = np.array([1, 2, 7, 8, 10], dtype=np.int8)
-
-        # default argument
-        result = lib.downcast_int64(arr, na_values)
-        tm.assert_numpy_array_equal(result, expected)
-
-        result = lib.downcast_int64(arr, na_values, use_unsigned=False)
-        tm.assert_numpy_array_equal(result, expected)
-
-        expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8)
-        result = lib.downcast_int64(arr, na_values, use_unsigned=True)
-        tm.assert_numpy_array_equal(result, expected)
-
-        # still cast to int8 despite use_unsigned=True
-        # because of the negative number as an element
-        arr = np.array([1, 2, -7, 8, 10], dtype=np.int64)
-        expected = np.array([1, 2, -7, 8, 10], dtype=np.int8)
-        result = lib.downcast_int64(arr, na_values, use_unsigned=True)
-        tm.assert_numpy_array_equal(result, expected)
-
-        arr = np.array([1, 2, 7, 8, 300], dtype=np.int64)
-        expected = np.array([1, 2, 7, 8, 300], dtype=np.int16)
-        result = lib.downcast_int64(arr, na_values)
-        tm.assert_numpy_array_equal(result, expected)
-
-        int8_na = na_values[np.int8]
-        int64_na = na_values[np.int64]
-        arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64)
-        expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8)
-        result = lib.downcast_int64(arr, na_values)
-        tm.assert_numpy_array_equal(result, expected)

pandas/tests/io/parser/common.py

-43
@@ -1371,49 +1371,6 @@ def test_raise_on_no_columns(self):
         data = "\n\n\n"
         pytest.raises(EmptyDataError, self.read_csv, StringIO(data))

-    def test_compact_ints_use_unsigned(self):
-        # see gh-13323
-        data = 'a,b,c\n1,9,258'
-
-        # sanity check
-        expected = DataFrame({
-            'a': np.array([1], dtype=np.int64),
-            'b': np.array([9], dtype=np.int64),
-            'c': np.array([258], dtype=np.int64),
-        })
-        out = self.read_csv(StringIO(data))
-        tm.assert_frame_equal(out, expected)
-
-        expected = DataFrame({
-            'a': np.array([1], dtype=np.int8),
-            'b': np.array([9], dtype=np.int8),
-            'c': np.array([258], dtype=np.int16),
-        })
-
-        # default behaviour for 'use_unsigned'
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            out = self.read_csv(StringIO(data), compact_ints=True)
-        tm.assert_frame_equal(out, expected)
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            out = self.read_csv(StringIO(data), compact_ints=True,
-                                use_unsigned=False)
-        tm.assert_frame_equal(out, expected)
-
-        expected = DataFrame({
-            'a': np.array([1], dtype=np.uint8),
-            'b': np.array([9], dtype=np.uint8),
-            'c': np.array([258], dtype=np.uint16),
-        })
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            out = self.read_csv(StringIO(data), compact_ints=True,
-                                use_unsigned=True)
-        tm.assert_frame_equal(out, expected)
-
     def test_memory_map(self):
         mmap_file = os.path.join(self.dirpath, 'test_mmap.csv')
         expected = DataFrame({

pandas/tests/io/parser/test_unsupported.py

+1 -9

@@ -128,20 +128,12 @@ def read(self):
 class TestDeprecatedFeatures(object):

     @pytest.mark.parametrize("engine", ["c", "python"])
-    @pytest.mark.parametrize("kwargs", [{"compact_ints": True},
-                                        {"compact_ints": False},
-                                        {"use_unsigned": True},
-                                        {"use_unsigned": False},
-                                        {"tupleize_cols": True},
+    @pytest.mark.parametrize("kwargs", [{"tupleize_cols": True},
                                         {"tupleize_cols": False}])
     def test_deprecated_args(self, engine, kwargs):
         data = "1,2,3"
         arg, _ = list(kwargs.items())[0]

-        if engine == "python" and arg == "buffer_lines":
-            # unsupported --> exception is raised
-            return
-
         with tm.assert_produces_warning(
                 FutureWarning, check_stacklevel=False):
             read_csv(StringIO(data), engine=engine, **kwargs)
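
With the two keywords removed from ``parser_f``'s signature, passing them should now fail outright rather than merely warn. A hedged sketch of the expected post-removal behaviour (not a test added by this commit):

    import pytest
    from io import StringIO
    from pandas import read_csv

    # parser_f no longer accepts the keyword, so this presumably raises
    # TypeError: ... got an unexpected keyword argument 'compact_ints'
    with pytest.raises(TypeError):
        read_csv(StringIO('1,2,3'), compact_ints=True)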
