Skip to content

BUG: Coercing bool types to int in qcut #28802

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ Other
- Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`)
- Fix to ensure that tab-completion in an IPython console does not raise
warnings for deprecated attributes (:issue:`27900`).
- :func:`qcut` now handles bool ndarray/Series (:issue:`20303`)

.. _whatsnew_0.252.contributors:

Expand Down
12 changes: 9 additions & 3 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas.core.dtypes.common import (
_NS_DTYPE,
ensure_int64,
is_bool_dtype,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
Expand Down Expand Up @@ -423,8 +424,8 @@ def _bins_to_cuts(

def _coerce_to_type(x):
"""
if the passed data is of datetime/timedelta type,
this method converts it to numeric so that cut method can
if the passed data is of datetime/timedelta or bool type,
this method converts it to numeric so that cut or qcut method can
handle it
"""
dtype = None
Expand All @@ -437,10 +438,15 @@ def _coerce_to_type(x):
elif is_timedelta64_dtype(x):
x = to_timedelta(x)
dtype = np.dtype("timedelta64[ns]")
elif is_bool_dtype(x):
dtype = x.dtype

if dtype is not None:
# GH 19768: force NaT to NaN during integer conversion
x = np.where(x.notna(), x.view(np.int64), np.nan)
if is_bool_dtype(x):
Copy link
Contributor Author

@ryankarlos ryankarlos Oct 5, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback Not entirely sure where this should go. Adding x.astype(int) under elif is_bool_dtype(x) higher up makes the tests fail on the existing np.where(x.notna(), x.view(np.int64), np.nan) statement when x is an ndarray (it passes when x is a Series):
AttributeError: 'numpy.ndarray' object has no attribute 'isnan'

The tests pass if I instead branch on is_bool_dtype(x) here and use a separate np.where call with ~np.isnan(x), as I've done, to account for x being an ndarray.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So don't use np.isnan; change this to:

x = np.where(notna(x), x.astype(np.int64, copy=False), np.nan)

Copy link
Contributor Author

@ryankarlos ryankarlos Oct 5, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've gone with doing the integer conversion in the elif block higher up and leaving dtype as None as @jschendel suggested, rather than making any changes here.

x = np.where(~np.isnan(x), x.astype(int), np.nan)
else:
x = np.where(x.notna(), x.view(np.int64), np.nan)

return x, dtype

Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/reshape/test_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,3 +585,17 @@ def test_timedelta_cut_roundtrip():
["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"]
)
tm.assert_index_equal(result_bins, expected_bins)


@pytest.mark.parametrize("bins", [6, 7])
@pytest.mark.parametrize(
    "box, compare",
    [
        (Series, tm.assert_series_equal),
        (np.array, tm.assert_categorical_equal),
    ],
)
def test_cut_bool_coercion_to_int(bins, box, compare):
    """Check that ``cut`` on bool data matches ``cut`` on the same 0/1 ints.

    Regression test for GH 20303. ``box`` wraps the raw values (Series or
    ndarray) and ``compare`` is the matching assertion helper for the
    result produced from that container.
    """
    # 200 random 0/1 integers, boxed into the container under test.
    int_values = box(np.random.randint(2, size=200))
    # Same values reinterpreted as booleans.
    bool_values = int_values.astype(bool)

    from_ints = cut(int_values, bins, duplicates="drop")
    from_bools = cut(bool_values, bins, duplicates="drop")

    compare(from_bools, from_ints)
14 changes: 14 additions & 0 deletions pandas/tests/reshape/test_qcut.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,17 @@ def test_date_like_qcut_bins(arg, expected_bins):
ser = Series(arg)
result, result_bins = qcut(ser, 2, retbins=True)
tm.assert_index_equal(result_bins, expected_bins)


@pytest.mark.parametrize("bins", [6, 7])
@pytest.mark.parametrize(
    "box, compare",
    [
        (Series, tm.assert_series_equal),
        (np.array, tm.assert_categorical_equal),
    ],
)
def test_qcut_bool_coercion_to_int(bins, box, compare):
    """Check that ``qcut`` on bool data matches ``qcut`` on the same 0/1 ints.

    Regression test for GH 20303. ``box`` wraps the raw values (Series or
    ndarray) and ``compare`` is the matching assertion helper for the
    result produced from that container.
    """
    # 200 random 0/1 integers, boxed into the container under test.
    int_values = box(np.random.randint(2, size=200))
    # Same values reinterpreted as booleans.
    bool_values = int_values.astype(bool)

    from_ints = qcut(int_values, bins, duplicates="drop")
    from_bools = qcut(bool_values, bins, duplicates="drop")

    compare(from_bools, from_ints)