Skip to content

BUG: Fix qcut for nullable integers #31440

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Feb 2, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ ExtensionArray
^^^^^^^^^^^^^^

- Bug in dtype being lost in ``__invert__`` (``~`` operator) for extension-array backed ``Series`` and ``DataFrame`` (:issue:`23087`)
- Bug where :meth:`qcut` would raise when passed nullable integer data (:issue:`31389`)
-


Expand Down
15 changes: 7 additions & 8 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,17 +202,10 @@ def cut(
"""
# NOTE: this binning code is changed a bit from histogram for var(x) == 0

# for handling the cut for datetime and timedelta objects
original = x
x = _preprocess_for_cut(x)
x, dtype = _coerce_to_type(x)

# To support cut(IntegerArray), we convert to object dtype with NaN
# Will properly support in the future.
# https://github.com/pandas-dev/pandas/pull/31290
if is_extension_array_dtype(x.dtype) and is_integer_dtype(x.dtype):
x = x.to_numpy(dtype=object, na_value=np.nan)

if not np.iterable(bins):
if is_scalar(bins) and bins < 1:
raise ValueError("`bins` should be a positive integer.")
Expand Down Expand Up @@ -434,7 +427,7 @@ def _bins_to_cuts(

def _coerce_to_type(x):
"""
if the passed data is of datetime/timedelta or bool type,
if the passed data is of datetime/timedelta, bool or nullable int type,
this method converts it to numeric so that cut or qcut method can
handle it
"""
Expand All @@ -451,6 +444,12 @@ def _coerce_to_type(x):
elif is_bool_dtype(x):
# GH 20303
x = x.astype(np.int64)
# To support cut and qcut for IntegerArray we convert to float dtype.
# Will properly support in the future.
# https://github.com/pandas-dev/pandas/pull/31290
# https://github.com/pandas-dev/pandas/issues/31389
elif is_extension_array_dtype(x) and is_integer_dtype(x):
x = x.to_numpy(dtype=np.float64, na_value=np.nan)

if dtype is not None:
# GH 19768: force NaT to NaN during integer conversion
Expand Down
13 changes: 0 additions & 13 deletions pandas/tests/arrays/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1061,19 +1061,6 @@ def test_value_counts_na():
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("bins", [3, [0, 5, 15]])
@pytest.mark.parametrize("right", [True, False])
@pytest.mark.parametrize("include_lowest", [True, False])
def test_cut(bins, right, include_lowest):
    """Check that ``pd.cut`` on an ``Int64`` array matches the object-dtype result."""
    # Random integers stored as objects, with every other position set to NaN.
    values = np.random.randint(0, 10, size=50).astype(object)
    values[::2] = np.nan
    cut_kwargs = {"right": right, "include_lowest": include_lowest}

    nullable = pd.array(values, dtype="Int64")
    result = pd.cut(nullable, bins, **cut_kwargs)
    expected = pd.cut(values, bins, **cut_kwargs)

    tm.assert_categorical_equal(result, expected)


def test_array_setitem_nullable_boolean_mask():
# GH 31446
ser = pd.Series([1, 2], dtype="Int64")
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/reshape/test_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,3 +612,16 @@ def test_cut_incorrect_labels(labels):
msg = "Bin labels must either be False, None or passed in as a list-like argument"
with pytest.raises(ValueError, match=msg):
cut(values, 4, labels=labels)


@pytest.mark.parametrize("bins", [3, [0, 5, 15]])
@pytest.mark.parametrize("right", [True, False])
@pytest.mark.parametrize("include_lowest", [True, False])
def test_cut_nullable_integer(bins, right, include_lowest):
    """Check that ``cut`` on an ``Int64`` array matches ``cut`` on the float data."""
    # Random integers stored as floats, with every other position set to NaN.
    values = np.random.randint(0, 10, size=50).astype(float)
    values[::2] = np.nan
    cut_kwargs = {"right": right, "include_lowest": include_lowest}

    nullable = pd.array(values, dtype="Int64")
    result = cut(nullable, bins, **cut_kwargs)
    expected = cut(values, bins, **cut_kwargs)

    tm.assert_categorical_equal(result, expected)
12 changes: 12 additions & 0 deletions pandas/tests/reshape/test_qcut.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
Categorical,
DatetimeIndex,
Expand Down Expand Up @@ -286,3 +287,14 @@ def test_qcut_bool_coercion_to_int(bins, box, compare):
expected = qcut(data_expected, bins, duplicates="drop")
result = qcut(data_result, bins, duplicates="drop")
compare(result, expected)


@pytest.mark.parametrize("q", [2, 5, 10])
def test_qcut_nullable_integer(q, any_nullable_int_dtype):
    """Check that ``qcut`` on a nullable integer array matches its float cast."""
    # 0..99 in the nullable dtype under test, with every other entry missing.
    nullable = pd.array(np.arange(100), dtype=any_nullable_int_dtype)
    nullable[::2] = pd.NA

    result = qcut(nullable, q)
    # Reference result: the same data cast to float before binning.
    expected = qcut(nullable.astype(float), q)

    tm.assert_categorical_equal(result, expected)
tm.assert_categorical_equal(result, expected)