From e6ec3b2f17dd36429bb019b1864d460112970437 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 24 Jan 2020 12:14:39 -0600 Subject: [PATCH 1/3] BUG: Handle IntegerArray in pd.cut xref https://github.com/pandas-dev/pandas/issues/30944. I think this doesn't close it, since only the pd.cut component is fixed. --- pandas/core/reshape/tile.py | 40 +++++++++++++++++++++++++---- pandas/tests/arrays/test_integer.py | 14 ++++++++++ 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 5a444d908b786..14998baf113cf 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -14,7 +14,9 @@ is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, + is_extension_array_dtype, is_integer, + is_integer_dtype, is_list_like, is_scalar, is_timedelta64_dtype, @@ -209,8 +211,17 @@ def cut( if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") - try: # for array-like - sz = x.size + # TODO: Support arbitrary Extension Arrays. We need + # For now, we're only attempting to support IntegerArray. + # See the note on _bins_to_cuts about what is needed. 
+ is_nullable_integer = is_extension_array_dtype(x.dtype) and is_integer_dtype( + x.dtype + ) + try: + if is_extension_array_dtype(x) and is_integer_dtype(x): + sz = len(x) + else: + sz = x.size except AttributeError: x = np.asarray(x) sz = x.size @@ -218,7 +229,10 @@ def cut( if sz == 0: raise ValueError("Cannot cut empty array") - rng = (nanops.nanmin(x), nanops.nanmax(x)) + if is_nullable_integer: + rng = x._reduce("min"), x._reduce("max") + else: + rng = (nanops.nanmin(x), nanops.nanmax(x)) mn, mx = [mi + 0.0 for mi in rng] if np.isinf(mn) or np.isinf(mx): @@ -383,10 +397,26 @@ def _bins_to_cuts( bins = unique_bins side = "left" if right else "right" - ids = ensure_int64(bins.searchsorted(x, side=side)) + is_nullable_integer = is_extension_array_dtype(x.dtype) and is_integer_dtype( + x.dtype + ) + + if is_nullable_integer: + # TODO: Support other extension types somehow. We don't currently + # We *could* use factorize here, but that does more that we need. + # We just need some integer representation, and the NA values needn't + # even be marked specially. 
+ x_int = x._ndarray_values + ids = ensure_int64(bins.searchsorted(x_int, side=side)) + else: + ids = ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: - ids[x == bins[0]] = 1 + mask = x == bins[0] + if is_nullable_integer: + # when x is integer + mask = mask.to_numpy(na_value=False, dtype=bool) + ids[mask] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 96e676018a0d6..b7ce0c17c7a78 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -1061,6 +1061,20 @@ def test_value_counts_na(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("bins", [3, [0, 5, 15]]) +@pytest.mark.parametrize("right", [True, False]) +@pytest.mark.parametrize("include_lowest", [True, False]) +def test_cut(bins, right, include_lowest): + a = np.random.randint(0, 10, size=50).astype(float) + a[::2] = np.nan + tm.assert_categorical_equal( + pd.cut( + pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest + ), + pd.cut(a, bins, right=right, include_lowest=include_lowest), + ) + + # TODO(jreback) - these need testing / are broken # shift From cc1a8100cadf73756969ec68ff2fafef7971a6ef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 27 Jan 2020 10:21:34 -0600 Subject: [PATCH 2/3] revert --- pandas/core/reshape/tile.py | 40 +++++-------------------------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 14998baf113cf..5a444d908b786 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -14,9 +14,7 @@ is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, - is_extension_array_dtype, is_integer, - is_integer_dtype, is_list_like, is_scalar, is_timedelta64_dtype, @@ -211,17 +209,8 @@ def cut( if is_scalar(bins) and bins < 1: raise ValueError("`bins` should 
be a positive integer.") - # TODO: Support arbitrary Extension Arrays. We need - # For now, we're only attempting to support IntegerArray. - # See the note on _bins_to_cuts about what is needed. - is_nullable_integer = is_extension_array_dtype(x.dtype) and is_integer_dtype( - x.dtype - ) - try: - if is_extension_array_dtype(x) and is_integer_dtype(x): - sz = len(x) - else: - sz = x.size + try: # for array-like + sz = x.size except AttributeError: x = np.asarray(x) sz = x.size @@ -229,10 +218,7 @@ def cut( if sz == 0: raise ValueError("Cannot cut empty array") - if is_nullable_integer: - rng = x._reduce("min"), x._reduce("max") - else: - rng = (nanops.nanmin(x), nanops.nanmax(x)) + rng = (nanops.nanmin(x), nanops.nanmax(x)) mn, mx = [mi + 0.0 for mi in rng] if np.isinf(mn) or np.isinf(mx): @@ -397,26 +383,10 @@ def _bins_to_cuts( bins = unique_bins side = "left" if right else "right" - is_nullable_integer = is_extension_array_dtype(x.dtype) and is_integer_dtype( - x.dtype - ) - - if is_nullable_integer: - # TODO: Support other extension types somehow. We don't currently - # We *could* use factorize here, but that does more that we need. - # We just need some integer representation, and the NA values needn't - # even be marked specially. 
- x_int = x._ndarray_values - ids = ensure_int64(bins.searchsorted(x_int, side=side)) - else: - ids = ensure_int64(bins.searchsorted(x, side=side)) + ids = ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: - mask = x == bins[0] - if is_nullable_integer: - # when x is integer - mask = mask.to_numpy(na_value=False, dtype=bool) - ids[mask] = 1 + ids[x == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() From 458b19fb84ee48fa15ad11fae1a707c6831ec047 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 27 Jan 2020 10:28:35 -0600 Subject: [PATCH 3/3] restore object, NaN behavior --- pandas/core/reshape/tile.py | 8 ++++++++ pandas/tests/arrays/test_integer.py | 11 +++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 5a444d908b786..00a7645d0c7a5 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -14,7 +14,9 @@ is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, + is_extension_array_dtype, is_integer, + is_integer_dtype, is_list_like, is_scalar, is_timedelta64_dtype, @@ -205,6 +207,12 @@ def cut( x = _preprocess_for_cut(x) x, dtype = _coerce_to_type(x) + # To support cut(IntegerArray), we convert to object dtype with NaN + # Will properly support in the future. 
+ # https://github.com/pandas-dev/pandas/pull/31290 + if is_extension_array_dtype(x.dtype) and is_integer_dtype(x.dtype): + x = x.to_numpy(dtype=object, na_value=np.nan) + if not np.iterable(bins): if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index b7ce0c17c7a78..63e3c946df912 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -1065,14 +1065,13 @@ def test_value_counts_na(): @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) def test_cut(bins, right, include_lowest): - a = np.random.randint(0, 10, size=50).astype(float) + a = np.random.randint(0, 10, size=50).astype(object) a[::2] = np.nan - tm.assert_categorical_equal( - pd.cut( - pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest - ), - pd.cut(a, bins, right=right, include_lowest=include_lowest), + result = pd.cut( + pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest ) + expected = pd.cut(a, bins, right=right, include_lowest=include_lowest) + tm.assert_categorical_equal(result, expected) # TODO(jreback) - these need testing / are broken