CLN: Drop compact_ints/use_unsigned from read_csv #18851

Merged (2 commits) on Dec 21, 2017
15 changes: 0 additions & 15 deletions doc/source/io.rst
@@ -199,21 +199,6 @@ low_memory : boolean, default ``True``
     Note that the entire file is read into a single DataFrame regardless,
     use the ``chunksize`` or ``iterator`` parameter to return the data in chunks.
     (Only valid with C parser)
-compact_ints : boolean, default False
-    .. deprecated:: 0.19.0
-
-       Argument moved to ``pd.to_numeric``
-
-    If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the
-    parser will attempt to cast it as the smallest integer ``dtype`` possible, either
-    signed or unsigned depending on the specification from the ``use_unsigned`` parameter.
-use_unsigned : boolean, default False
-    .. deprecated:: 0.18.2
-
-       Argument moved to ``pd.to_numeric``
-
-    If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether
-    the column should be compacted to the smallest signed or unsigned integer dtype.
 memory_map : boolean, default False
     If a filepath is provided for ``filepath_or_buffer``, map the file object
     directly onto memory and access the data directly from there. Using this
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
@@ -233,6 +233,7 @@ Removal of prior version deprecations/changes
 - :func:`read_csv` has dropped the ``skip_footer`` parameter (:issue:`13386`)
 - :func:`read_csv` has dropped the ``as_recarray`` parameter (:issue:`13373`)
 - :func:`read_csv` has dropped the ``buffer_lines`` parameter (:issue:`13360`)
+- :func:`read_csv` has dropped the ``compact_ints`` and ``use_unsigned`` parameters (:issue:`13323`)
 
 .. _whatsnew_0220.performance:
 
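As a migration note for code that still passes the dropped keywords: parse first, then downcast explicitly with `pd.to_numeric`. A minimal sketch, assuming pandas 0.19 or later (the sample frame mirrors the removed test further down):

```python
import pandas as pd
from io import StringIO

data = 'a,b,c\n1,9,258'

# read_csv now always parses integer columns as int64; downcasting is an
# explicit per-column step rather than a parser option.
df = pd.read_csv(StringIO(data))
print(df.dtypes)  # a, b, c: int64

# downcast='integer' picks the smallest signed dtype that fits each column,
# which is what compact_ints=True used to do; downcast='unsigned' mirrors
# use_unsigned=True.
compact = df.apply(pd.to_numeric, downcast='integer')
print(compact.dtypes)  # a: int8, b: int8, c: int16
```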
12 changes: 0 additions & 12 deletions pandas/_libs/parsers.pyx
@@ -305,7 +305,6 @@ cdef class TextReader:
         object index_col
         object low_memory
         object skiprows
-        object compact_ints, use_unsigned
         object dtype
         object encoding
         object compression
@@ -355,10 +354,7 @@
                   na_fvalues=None,
                   true_values=None,
                   false_values=None,
-
-                  compact_ints=False,
                   allow_leading_cols=True,
-                  use_unsigned=False,
                   low_memory=False,
                   skiprows=None,
                   skipfooter=0,
@@ -482,10 +478,7 @@
         self.false_set = kset_from_list(self.false_values)
 
         self.converters = converters
-
         self.na_filter = na_filter
-        self.compact_ints = compact_ints
-        self.use_unsigned = use_unsigned
 
         self.verbose = verbose
         self.low_memory = low_memory
@@ -1122,11 +1115,6 @@
         if upcast_na and na_count > 0:
             col_res = _maybe_upcast(col_res)
 
-        if issubclass(col_res.dtype.type,
-                      np.integer) and self.compact_ints:
-            col_res = lib.downcast_int64(col_res, na_values,
-                                         self.use_unsigned)
-
         if col_res is None:
             raise ParserError('Unable to parse column %d' % i)
 
71 changes: 0 additions & 71 deletions pandas/_libs/src/inference.pyx
@@ -1657,74 +1657,3 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan):
             output[i] = default
 
     return maybe_convert_objects(output)
-
-
-def downcast_int64(ndarray[int64_t] arr, object na_values,
-                   bint use_unsigned=0):
-    cdef:
-        Py_ssize_t i, n = len(arr)
-        int64_t mx = INT64_MIN + 1, mn = INT64_MAX
-        int64_t NA = na_values[np.int64]
-        int64_t val
-        ndarray[uint8_t] mask
-        int na_count = 0
-
-    _mask = np.empty(n, dtype=bool)
-    mask = _mask.view(np.uint8)
-
-    for i in range(n):
-        val = arr[i]
-
-        if val == NA:
-            mask[i] = 1
-            na_count += 1
-            continue
-
-        # not NA
-        mask[i] = 0
-
-        if val > mx:
-            mx = val
-
-        if val < mn:
-            mn = val
-
-    if mn >= 0 and use_unsigned:
-        if mx <= UINT8_MAX - 1:
-            result = arr.astype(np.uint8)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint8])
-            return result
-
-        if mx <= UINT16_MAX - 1:
-            result = arr.astype(np.uint16)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint16])
-            return result
-
-        if mx <= UINT32_MAX - 1:
-            result = arr.astype(np.uint32)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.uint32])
-            return result
-
-    else:
-        if mn >= INT8_MIN + 1 and mx <= INT8_MAX:
-            result = arr.astype(np.int8)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int8])
-            return result
-
-        if mn >= INT16_MIN + 1 and mx <= INT16_MAX:
-            result = arr.astype(np.int16)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int16])
-            return result
-
-        if mn >= INT32_MIN + 1 and mx <= INT32_MAX:
-            result = arr.astype(np.int32)
-            if na_count:
-                np.putmask(result, _mask, na_values[np.int32])
-            return result
-
-    return arr
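The deleted helper's core logic is a min/max scan followed by picking the narrowest dtype that can hold the data; `pd.to_numeric`'s downcast machinery does the same job. A rough pure-NumPy sketch of the dtype-selection step, ignoring the NA-sentinel handling above (function name is illustrative):

```python
import numpy as np

def pick_smallest_int_dtype(arr, use_unsigned=False):
    """Return the narrowest integer dtype that can represent arr's values."""
    mn, mx = arr.min(), arr.max()
    if use_unsigned and mn >= 0:
        candidates = [np.uint8, np.uint16, np.uint32]
    else:
        candidates = [np.int8, np.int16, np.int32]
    for dtype in candidates:
        info = np.iinfo(dtype)
        if info.min <= mn and mx <= info.max:
            return np.dtype(dtype)
    return arr.dtype  # nothing narrower fits; keep int64

arr = np.array([1, 2, 7, 8, 300], dtype=np.int64)
print(pick_smallest_int_dtype(arr))                     # int16
print(pick_smallest_int_dtype(arr, use_unsigned=True))  # uint16
```

Unlike this sketch, the removed function reserved one value at each dtype boundary (e.g. `mx <= UINT8_MAX - 1`) so the NA sentinel still had a representable slot after casting.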
32 changes: 0 additions & 32 deletions pandas/io/parsers.py
@@ -273,21 +273,6 @@
     Note that the entire file is read into a single DataFrame regardless,
     use the `chunksize` or `iterator` parameter to return the data in chunks.
     (Only valid with C parser)
-compact_ints : boolean, default False
-    .. deprecated:: 0.19.0
-        Argument moved to ``pd.to_numeric``
-
-    If compact_ints is True, then for any column that is of integer dtype,
-    the parser will attempt to cast it as the smallest integer dtype possible,
-    either signed or unsigned depending on the specification from the
-    `use_unsigned` parameter.
-use_unsigned : boolean, default False
-    .. deprecated:: 0.19.0
-        Argument moved to ``pd.to_numeric``
-
-    If integer columns are being compacted (i.e. `compact_ints=True`), specify
-    whether the column should be compacted to the smallest signed or unsigned
-    integer dtype.
 memory_map : boolean, default False
     If a filepath is provided for `filepath_or_buffer`, map the file object
     directly onto memory and access the data directly from there. Using this
@@ -496,8 +481,6 @@ def _read(filepath_or_buffer, kwds):
 _c_parser_defaults = {
     'delim_whitespace': False,
     'na_filter': True,
-    'compact_ints': False,
-    'use_unsigned': False,
     'low_memory': True,
     'memory_map': False,
     'error_bad_lines': True,
@@ -518,13 +501,9 @@
 }
 
 _deprecated_defaults = {
-    'compact_ints': None,
-    'use_unsigned': None,
     'tupleize_cols': None
 }
 _deprecated_args = {
-    'compact_ints',
-    'use_unsigned',
     'tupleize_cols',
 }
 
@@ -596,8 +575,6 @@ def parser_f(filepath_or_buffer,
              # Internal
              doublequote=True,
              delim_whitespace=False,
-             compact_ints=None,
-             use_unsigned=None,
              low_memory=_c_parser_defaults['low_memory'],
              memory_map=False,
              float_precision=None):
@@ -662,8 +639,6 @@
         float_precision=float_precision,
 
         na_filter=na_filter,
-        compact_ints=compact_ints,
-        use_unsigned=use_unsigned,
         delim_whitespace=delim_whitespace,
         warn_bad_lines=warn_bad_lines,
         error_bad_lines=error_bad_lines,
@@ -1569,11 +1544,6 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
             if cast_type and not is_dtype_equal(cvals, cast_type):
                 cvals = self._cast_types(cvals, cast_type, c)
 
-            if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
-                cvals = lib.downcast_int64(
-                    cvals, parsers.na_values,
-                    self.use_unsigned)
-
             result[c] = cvals
             if verbose and na_count:
                 print('Filled %d NA values in column %s' % (na_count, str(c)))

Inline review comment on the removed `self.compact_ints` branch:

Contributor: I believe we can also blow away lib.downcast_int64, to_numeric uses a different (and better) mechanism for doing this. IIRC downcast_int64 was only for compatibility.

Member (Author): GitHub search seems to verify that belief.
@@ -2064,8 +2034,6 @@ def __init__(self, f, **kwds):
         self.converters = kwds['converters']
         self.dtype = kwds['dtype']
 
-        self.compact_ints = kwds['compact_ints']
-        self.use_unsigned = kwds['use_unsigned']
         self.thousands = kwds['thousands']
         self.decimal = kwds['decimal']
 
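For context, the `_deprecated_args` / `_deprecated_defaults` pair that remains drives a generic check during option cleaning: any deprecated keyword supplied at a non-default value triggers a `FutureWarning`. A simplified sketch of that pattern, not the exact pandas internals:

```python
import warnings

_deprecated_defaults = {'tupleize_cols': None}
_deprecated_args = {'tupleize_cols'}

def _check_deprecated(kwds):
    """Warn for each deprecated keyword the caller actually supplied."""
    for arg in _deprecated_args:
        if arg in kwds and kwds[arg] != _deprecated_defaults[arg]:
            warnings.warn("the '%s' keyword is deprecated and will be "
                          "removed in a future version" % arg,
                          FutureWarning, stacklevel=3)

_check_deprecated({'tupleize_cols': True})  # emits a FutureWarning
```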
36 changes: 0 additions & 36 deletions pandas/tests/dtypes/test_io.py
@@ -71,39 +71,3 @@ def test_convert_sql_column_decimals(self):
         result = lib.convert_sql_column(arr)
         expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8')
         tm.assert_numpy_array_equal(result, expected)
-
-    def test_convert_downcast_int64(self):
-        from pandas._libs.parsers import na_values
-
-        arr = np.array([1, 2, 7, 8, 10], dtype=np.int64)
-        expected = np.array([1, 2, 7, 8, 10], dtype=np.int8)
-
-        # default argument
-        result = lib.downcast_int64(arr, na_values)
-        tm.assert_numpy_array_equal(result, expected)
-
-        result = lib.downcast_int64(arr, na_values, use_unsigned=False)
-        tm.assert_numpy_array_equal(result, expected)
-
-        expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8)
-        result = lib.downcast_int64(arr, na_values, use_unsigned=True)
-        tm.assert_numpy_array_equal(result, expected)
-
-        # still cast to int8 despite use_unsigned=True
-        # because of the negative number as an element
-        arr = np.array([1, 2, -7, 8, 10], dtype=np.int64)
-        expected = np.array([1, 2, -7, 8, 10], dtype=np.int8)
-        result = lib.downcast_int64(arr, na_values, use_unsigned=True)
-        tm.assert_numpy_array_equal(result, expected)
-
-        arr = np.array([1, 2, 7, 8, 300], dtype=np.int64)
-        expected = np.array([1, 2, 7, 8, 300], dtype=np.int16)
-        result = lib.downcast_int64(arr, na_values)
-        tm.assert_numpy_array_equal(result, expected)
-
-        int8_na = na_values[np.int8]
-        int64_na = na_values[np.int64]
-        arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64)
-        expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8)
-        result = lib.downcast_int64(arr, na_values)
-        tm.assert_numpy_array_equal(result, expected)
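The last removed assertion exercised the NA-sentinel path: `downcast_int64` re-stamped the int64 NA sentinel with the target dtype's sentinel after casting, which is why its range checks reserved one value per dtype. A small illustration of that remapping idea, using hypothetical sentinel values rather than pandas' actual `na_values` table:

```python
import numpy as np

# hypothetical per-dtype NA sentinels; pandas kept a comparable table in
# pandas._libs.parsers.na_values, but the exact values are an internal detail
NA = {np.int64: np.iinfo(np.int64).min, np.int8: np.iinfo(np.int8).min}

arr = np.array([NA[np.int64], 2, 3, 10, 15], dtype=np.int64)
mask = arr == NA[np.int64]             # remember where the NAs were

result = arr.astype(np.int8)           # narrow the storage...
np.putmask(result, mask, NA[np.int8])  # ...then re-stamp the NA slots
print(result)  # [-128    2    3   10   15]
```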
43 changes: 0 additions & 43 deletions pandas/tests/io/parser/common.py
@@ -1371,49 +1371,6 @@ def test_raise_on_no_columns(self):
         data = "\n\n\n"
         pytest.raises(EmptyDataError, self.read_csv, StringIO(data))
 
-    def test_compact_ints_use_unsigned(self):
-        # see gh-13323
-        data = 'a,b,c\n1,9,258'
-
-        # sanity check
-        expected = DataFrame({
-            'a': np.array([1], dtype=np.int64),
-            'b': np.array([9], dtype=np.int64),
-            'c': np.array([258], dtype=np.int64),
-        })
-        out = self.read_csv(StringIO(data))
-        tm.assert_frame_equal(out, expected)
-
-        expected = DataFrame({
-            'a': np.array([1], dtype=np.int8),
-            'b': np.array([9], dtype=np.int8),
-            'c': np.array([258], dtype=np.int16),
-        })
-
-        # default behaviour for 'use_unsigned'
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            out = self.read_csv(StringIO(data), compact_ints=True)
-        tm.assert_frame_equal(out, expected)
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            out = self.read_csv(StringIO(data), compact_ints=True,
-                                use_unsigned=False)
-        tm.assert_frame_equal(out, expected)
-
-        expected = DataFrame({
-            'a': np.array([1], dtype=np.uint8),
-            'b': np.array([9], dtype=np.uint8),
-            'c': np.array([258], dtype=np.uint16),
-        })
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            out = self.read_csv(StringIO(data), compact_ints=True,
-                                use_unsigned=True)
-        tm.assert_frame_equal(out, expected)
-
     def test_memory_map(self):
         mmap_file = os.path.join(self.dirpath, 'test_mmap.csv')
         expected = DataFrame({
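A rough pytest-style port of the removed coverage onto the supported API, reusing the same fixture data (test name and imports are illustrative):

```python
import numpy as np
import pandas as pd
import pandas.util.testing as tm
from io import StringIO

def test_downcast_replaces_compact_ints():
    # see gh-13323: the frame the removed test used
    data = 'a,b,c\n1,9,258'
    df = pd.read_csv(StringIO(data))

    # downcast='unsigned' stands in for compact_ints=True, use_unsigned=True
    expected = pd.DataFrame({
        'a': np.array([1], dtype=np.uint8),
        'b': np.array([9], dtype=np.uint8),
        'c': np.array([258], dtype=np.uint16),
    })
    result = df.apply(pd.to_numeric, downcast='unsigned')
    tm.assert_frame_equal(result, expected)
```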
10 changes: 1 addition & 9 deletions pandas/tests/io/parser/test_unsupported.py
@@ -128,20 +128,12 @@ def read(self):
 class TestDeprecatedFeatures(object):
 
     @pytest.mark.parametrize("engine", ["c", "python"])
-    @pytest.mark.parametrize("kwargs", [{"compact_ints": True},
-                                        {"compact_ints": False},
-                                        {"use_unsigned": True},
-                                        {"use_unsigned": False},
-                                        {"tupleize_cols": True},
+    @pytest.mark.parametrize("kwargs", [{"tupleize_cols": True},
                                         {"tupleize_cols": False}])
     def test_deprecated_args(self, engine, kwargs):
         data = "1,2,3"
         arg, _ = list(kwargs.items())[0]
 
-        if engine == "python" and arg == "buffer_lines":
-            # unsupported --> exception is raised
-            return
-
         with tm.assert_produces_warning(
                 FutureWarning, check_stacklevel=False):
             read_csv(StringIO(data), engine=engine, **kwargs)