BUG: process Int64 as ints for preservable ops, not as float64

qwhelan · qwhelan · commit 753835278471 · 2020-03-14T03:06:27.000-07:00
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -572,10 +572,13 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
+        preservable_ops = ["min", "max"]
+
         # coerce to a nan-aware float if needed
         # (we explicitly use NaN within reductions)
         if self._hasna:
-            data = self.to_numpy("float64", na_value=np.nan)
+            if name not in preservable_ops or not skipna:
+                data = self.to_numpy("float64", na_value=np.nan)
 
         op = getattr(nanops, "nan" + name)
         result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
@@ -589,9 +592,11 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
 
         # if we have a preservable numeric op,
         # provide coercion back to an integer type if possible
-        elif name in ["sum", "min", "max", "prod"]:
+        elif name in preservable_ops + ["sum", "prod"]:
             # GH#31409 more performant than casting-then-checking
             result = com.cast_scalar_indexer(result)
+            if isinstance(result, np.integer):
+                result = int(result)
 
         return result
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7935,9 +7935,17 @@ def blk_func(values):
             # TODO: can we de-duplicate parts of this with the next blocK?
             result = np.bool_(result)
         elif hasattr(result, "dtype") and is_object_dtype(result.dtype):
+            dtype_is_integer = self.dtypes.apply(lambda x: is_integer_dtype(x))
             try:
                 if filter_type is None:
-                    result = result.astype(np.float64)
+                    if (
+                        not dtype_is_integer.any()
+                        or op not in ["min", "max"]
+                        or not skipna
+                    ):
+                        result = result.astype(np.float64)
+                    elif axis == 0:
+                        result = coerce_to_dtypes(result, self.dtypes)
                 elif filter_type == "bool" and notna(result).all():
                     result = result.astype(np.bool_)
             except (ValueError, TypeError):
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -472,6 +472,7 @@ def _cython_operation(
 
         is_datetimelike = needs_i8_conversion(values.dtype)
         is_numeric = is_numeric_dtype(values.dtype)
+        is_extension = is_extension_array_dtype(values)
 
         if is_datetimelike:
             values = values.view("int64")
@@ -481,10 +482,16 @@ def _cython_operation(
         elif is_integer_dtype(values):
             # we use iNaT for the missing value on ints
             # so pre-convert to guard this condition
-            if (values == iNaT).any():
-                values = ensure_float64(values)
+            if is_extension and how in ["max", "min"]:
+                if how == "max":
+                    values = values.to_numpy("int64", na_value=np.iinfo("int64").min)
+                else:
+                    values = values.to_numpy("int64", na_value=np.iinfo("int64").max)
             else:
-                values = ensure_int_or_float(values)
+                if (values == iNaT).any():
+                    values = ensure_float64(values)
+                else:
+                    values = ensure_int_or_float(values)
         elif is_numeric and not is_complex_dtype(values):
             values = ensure_float64(values)
         else:
@@ -538,7 +545,11 @@ def _cython_operation(
                 result, values, codes, func, is_datetimelike, **kwargs
             )
 
-        if is_integer_dtype(result) and not is_datetimelike:
+        if (
+            is_integer_dtype(result)
+            and not is_datetimelike
+            and (how not in ["min", "max"] or not is_extension)
+        ):
             mask = result == iNaT
             if mask.any():
                 result = result.astype("float64")
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -11,7 +11,7 @@
 from pandas._typing import ArrayLike, Dtype, Scalar
 from pandas.compat._optional import import_optional_dependency
 
-from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
+from pandas.core.dtypes.cast import _int64_max, _int32_max, maybe_upcast_putmask
 from pandas.core.dtypes.common import (
     _get_dtype,
     is_any_int_dtype,
@@ -183,11 +183,17 @@ def _get_fill_value(
         if fill_value_typ is None:
             return iNaT
         else:
-            if fill_value_typ == "+inf":
-                # need the max int here
-                return _int64_max
-            else:
-                return iNaT
+            dtype = getattr(dtype, "numpy_dtype", dtype)  # prefer numpy dtype if any
+            try:
+                if fill_value_typ == "+inf":
+                    return np.iinfo(dtype).max
+                else:
+                    return np.iinfo(dtype).min
+            except ValueError:  # np.iinfo raises this for non-integer dtypes
+                if fill_value_typ == "+inf":
+                    return _int64_max
+                else:
+                    return iNaT
 
 
 def _maybe_get_mask(
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
@@ -932,6 +932,39 @@ def test_preserve_dtypes(op):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("op", ["min", "max"])
+def test_preserve_dtypes_int64(op):
+    """The above test case fails for large Int64s, so this variant checks
+    min/max, which preserve the dtype, against the int64 extreme values.
+    """
+    int64_iinfo = np.iinfo("int64")
+    df = pd.DataFrame(
+        {
+            "A": ["a", "b", "b"],
+            "B": [1, None, 3],
+            "C": integer_array([1, None, 3], dtype="Int64"),
+            "D": integer_array([int64_iinfo.min, None, int64_iinfo.max], dtype="Int64"),
+        }
+    )
+
+    # Series reduction on the Int64 column must return a Python int
+    result = getattr(df.D, op)()
+    assert isinstance(result, int)
+
+    # groupby reduction: C and D are expected as Int64, B as a float array
+    result = getattr(df.groupby("A"), op)()
+
+    expected = pd.DataFrame(
+        {
+            "B": np.array([1.0, 3.0]),
+            "C": integer_array([1, 3], dtype="Int64"),
+            "D": integer_array([int64_iinfo.min, int64_iinfo.max], dtype="Int64"),
+        },
+        index=pd.Index(["a", "b"], name="A"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize("op", ["mean"])
 def test_reduce_to_float(op):
     # some reduce ops always return float, even if the result
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
@@ -238,7 +238,11 @@ def check_reduce(self, s, op_name, skipna):
         # overwrite to ensure pd.NA is tested instead of np.nan
         # https://github.com/pandas-dev/pandas/issues/30958
         result = getattr(s, op_name)(skipna=skipna)
-        expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
+        preserved_ops = ["min", "max"]
+        if skipna and op_name in preserved_ops:
+            expected = getattr(s.dropna(), op_name)(skipna=True)
+        else:
+            expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
         if np.isnan(expected):
             expected = pd.NA
         tm.assert_almost_equal(result, expected)