Skip to content

Commit 7538352

Browse files
committed
BUG: process Int64 as ints for preservable ops, not as float64
1 parent 3b66021 commit 7538352

File tree

6 files changed

+81
-14
lines changed

6 files changed

+81
-14
lines changed

pandas/core/arrays/integer.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -572,10 +572,13 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
572572
data = self._data
573573
mask = self._mask
574574

575+
preservable_ops = ["min", "max"]
576+
575577
# coerce to a nan-aware float if needed
576578
# (we explicitly use NaN within reductions)
577579
if self._hasna:
578-
data = self.to_numpy("float64", na_value=np.nan)
580+
if name not in preservable_ops or not skipna:
581+
data = self.to_numpy("float64", na_value=np.nan)
579582

580583
op = getattr(nanops, "nan" + name)
581584
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
@@ -589,9 +592,11 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
589592

590593
# if we have a preservable numeric op,
591594
# provide coercion back to an integer type if possible
592-
elif name in ["sum", "min", "max", "prod"]:
595+
elif name in preservable_ops + ["sum", "prod"]:
593596
# GH#31409 more performant than casting-then-checking
594597
result = com.cast_scalar_indexer(result)
598+
if isinstance(result, np.integer):
599+
result = int(result)
595600

596601
return result
597602

pandas/core/frame.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -7935,9 +7935,17 @@ def blk_func(values):
79357935
# TODO: can we de-duplicate parts of this with the next block?
79367936
result = np.bool_(result)
79377937
elif hasattr(result, "dtype") and is_object_dtype(result.dtype):
7938+
dtype_is_integer = self.dtypes.apply(lambda x: is_integer_dtype(x))
79387939
try:
79397940
if filter_type is None:
7940-
result = result.astype(np.float64)
7941+
if (
7942+
not dtype_is_integer.any()
7943+
or op not in ["min", "max"]
7944+
or not skipna
7945+
):
7946+
result = result.astype(np.float64)
7947+
elif axis == 0:
7948+
result = coerce_to_dtypes(result, self.dtypes)
79417949
elif filter_type == "bool" and notna(result).all():
79427950
result = result.astype(np.bool_)
79437951
except (ValueError, TypeError):

pandas/core/groupby/ops.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,7 @@ def _cython_operation(
472472

473473
is_datetimelike = needs_i8_conversion(values.dtype)
474474
is_numeric = is_numeric_dtype(values.dtype)
475+
is_extension = is_extension_array_dtype(values)
475476

476477
if is_datetimelike:
477478
values = values.view("int64")
@@ -481,10 +482,16 @@ def _cython_operation(
481482
elif is_integer_dtype(values):
482483
# we use iNaT for the missing value on ints
483484
# so pre-convert to guard this condition
484-
if (values == iNaT).any():
485-
values = ensure_float64(values)
485+
if is_extension and how in ["max", "min"]:
486+
if how == "max":
487+
values = values.to_numpy("int64", na_value=np.iinfo("int64").min)
488+
else:
489+
values = values.to_numpy("int64", na_value=np.iinfo("int64").max)
486490
else:
487-
values = ensure_int_or_float(values)
491+
if (values == iNaT).any():
492+
values = ensure_float64(values)
493+
else:
494+
values = ensure_int_or_float(values)
488495
elif is_numeric and not is_complex_dtype(values):
489496
values = ensure_float64(values)
490497
else:
@@ -538,7 +545,11 @@ def _cython_operation(
538545
result, values, codes, func, is_datetimelike, **kwargs
539546
)
540547

541-
if is_integer_dtype(result) and not is_datetimelike:
548+
if (
549+
is_integer_dtype(result)
550+
and not is_datetimelike
551+
and (how not in ["min", "max"] or not is_extension)
552+
):
542553
mask = result == iNaT
543554
if mask.any():
544555
result = result.astype("float64")

pandas/core/nanops.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from pandas._typing import ArrayLike, Dtype, Scalar
1212
from pandas.compat._optional import import_optional_dependency
1313

14-
from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
14+
from pandas.core.dtypes.cast import _int64_max, _int32_max, maybe_upcast_putmask
1515
from pandas.core.dtypes.common import (
1616
_get_dtype,
1717
is_any_int_dtype,
@@ -183,11 +183,17 @@ def _get_fill_value(
183183
if fill_value_typ is None:
184184
return iNaT
185185
else:
186-
if fill_value_typ == "+inf":
187-
# need the max int here
188-
return _int64_max
189-
else:
190-
return iNaT
186+
dtype = getattr(dtype, 'numpy_dtype', dtype)
187+
try:
188+
if fill_value_typ == "+inf":
189+
return np.iinfo(dtype).max
190+
else:
191+
return np.iinfo(dtype).min
192+
except ValueError:
193+
if fill_value_typ == "+inf":
194+
return _int64_max
195+
else:
196+
return iNaT
191197

192198

193199
def _maybe_get_mask(

pandas/tests/arrays/test_integer.py

+33
Original file line numberDiff line numberDiff line change
@@ -932,6 +932,39 @@ def test_preserve_dtypes(op):
932932
tm.assert_frame_equal(result, expected)
933933

934934

935+
@pytest.mark.parametrize("op", ["min", "max"])
936+
def test_preserve_dtypes_int64(op):
937+
""" The above test case fails for large Int64s, so implement a better
938+
version of the test for functions that properly preserve the dtype.
939+
"""
940+
int64_iinfo = np.iinfo("int64")
941+
df = pd.DataFrame(
942+
{
943+
"A": ["a", "b", "b"],
944+
"B": [1, None, 3],
945+
"C": integer_array([1, None, 3], dtype="Int64"),
946+
"D": integer_array([int64_iinfo.min, None, int64_iinfo.max], dtype="Int64"),
947+
}
948+
)
949+
950+
# op
951+
result = getattr(df.D, op)()
952+
assert isinstance(result, int)
953+
954+
# groupby
955+
result = getattr(df.groupby("A"), op)()
956+
957+
expected = pd.DataFrame(
958+
{
959+
"B": np.array([1.0, 3.0]),
960+
"C": integer_array([1, 3], dtype="Int64"),
961+
"D": integer_array([int64_iinfo.min, int64_iinfo.max], dtype="Int64"),
962+
},
963+
index=pd.Index(["a", "b"], name="A"),
964+
)
965+
tm.assert_frame_equal(result, expected)
966+
967+
935968
@pytest.mark.parametrize("op", ["mean"])
936969
def test_reduce_to_float(op):
937970
# some reduce ops always return float, even if the result

pandas/tests/extension/test_integer.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,11 @@ def check_reduce(self, s, op_name, skipna):
238238
# overwrite to ensure pd.NA is tested instead of np.nan
239239
# https://github.com/pandas-dev/pandas/issues/30958
240240
result = getattr(s, op_name)(skipna=skipna)
241-
expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
241+
preserved_ops = ["min", "max"]
242+
if skipna and op_name in preserved_ops:
243+
expected = getattr(s.dropna(), op_name)(skipna=True)
244+
else:
245+
expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
242246
if np.isnan(expected):
243247
expected = pd.NA
244248
tm.assert_almost_equal(result, expected)

0 commit comments

Comments
 (0)