Backport PR pandas-dev#39069: REGR: diff_2d raising for int8, int16

mzeitlin11 · jreback · commit 2059a1377beb · 2021-01-09T20:00:52.000-05:00
diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
@@ -24,6 +24,7 @@ Fixed regressions
 - Fixed regression in :meth:`.GroupBy.sem` where the presence of non-numeric columns would cause an error instead of being dropped (:issue:`38774`)
 - Fixed regression in :func:`read_excel` with non-rawbyte file handles (:issue:`38788`)
 - Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings. This resulted in a regression in some cases as the default for ``float_precision`` was changed in pandas 1.2.0 (:issue:`38753`)
+- Fixed regression in :meth:`DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`)
 - Fixed regression in :meth:`Rolling.skew` and :meth:`Rolling.kurt` modifying the object inplace (:issue:`38908`)
 - Fixed regression in :meth:`read_csv` and other read functions were the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`)
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1981,7 +1981,13 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
 
     elif is_integer_dtype(dtype):
         # We have to cast in order to be able to hold np.nan
-        dtype = np.float64
+
+        # int8, int16 are incompatible with float64,
+        # see https://github.com/cython/cython/issues/2646
+        if arr.dtype.name in ["int8", "int16"]:
+            dtype = np.float32
+        else:
+            dtype = np.float64
 
     orig_ndim = arr.ndim
     if orig_ndim == 1:
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1697,64 +1697,6 @@ def test_sort(x):
     g.apply(test_sort)
 
 
-def test_group_shift_with_null_key():
-    # This test is designed to replicate the segfault in issue #13813.
-    n_rows = 1200
-
-    # Generate a moderately large dataframe with occasional missing
-    # values in column `B`, and then group by [`A`, `B`]. This should
-    # force `-1` in `labels` array of `g.grouper.group_info` exactly
-    # at those places, where the group-by key is partially missing.
-    df = DataFrame(
-        [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
-        dtype=float,
-        columns=["A", "B", "Z"],
-        index=None,
-    )
-    g = df.groupby(["A", "B"])
-
-    expected = DataFrame(
-        [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
-        dtype=float,
-        columns=["Z"],
-        index=None,
-    )
-    result = g.shift(-1)
-
-    tm.assert_frame_equal(result, expected)
-
-
-def test_group_shift_with_fill_value():
-    # GH #24128
-    n_rows = 24
-    df = DataFrame(
-        [(i % 12, i % 3, i) for i in range(n_rows)],
-        dtype=float,
-        columns=["A", "B", "Z"],
-        index=None,
-    )
-    g = df.groupby(["A", "B"])
-
-    expected = DataFrame(
-        [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
-        dtype=float,
-        columns=["Z"],
-        index=None,
-    )
-    result = g.shift(-1, fill_value=0)[["Z"]]
-
-    tm.assert_frame_equal(result, expected)
-
-
-def test_group_shift_lose_timezone():
-    # GH 30134
-    now_dt = Timestamp.utcnow()
-    df = DataFrame({"a": [1, 1], "date": now_dt})
-    result = df.groupby("a").shift(0).iloc[0]
-    expected = Series({"date": now_dt}, name=result.name)
-    tm.assert_series_equal(result, expected)
-
-
 def test_pivot_table_values_key_error():
     # This test is designed to replicate the error in issue #14938
     df = DataFrame(
diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py
@@ -0,0 +1,106 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, NaT, Series, Timedelta, Timestamp
+import pandas._testing as tm
+
+
+def test_group_shift_with_null_key():
+    # This test is designed to replicate the segfault in issue #13813.
+    n_rows = 1200
+
+    # Generate a moderately large dataframe with occasional missing
+    # values in column `B`, and then group by [`A`, `B`]. This should
+    # force `-1` in `labels` array of `g.grouper.group_info` exactly
+    # at those places, where the group-by key is partially missing.
+    df = DataFrame(
+        [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
+        dtype=float,
+        columns=["A", "B", "Z"],
+        index=None,
+    )
+    g = df.groupby(["A", "B"])
+
+    expected = DataFrame(
+        [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
+        dtype=float,
+        columns=["Z"],
+        index=None,
+    )
+    result = g.shift(-1)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_group_shift_with_fill_value():
+    # GH #24128
+    n_rows = 24
+    df = DataFrame(
+        [(i % 12, i % 3, i) for i in range(n_rows)],
+        dtype=float,
+        columns=["A", "B", "Z"],
+        index=None,
+    )
+    g = df.groupby(["A", "B"])
+
+    expected = DataFrame(
+        [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
+        dtype=float,
+        columns=["Z"],
+        index=None,
+    )
+    result = g.shift(-1, fill_value=0)[["Z"]]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_group_shift_lose_timezone():
+    # GH 30134
+    now_dt = Timestamp.utcnow()
+    df = DataFrame({"a": [1, 1], "date": now_dt})
+    result = df.groupby("a").shift(0).iloc[0]
+    expected = Series({"date": now_dt}, name=result.name)
+    tm.assert_series_equal(result, expected)
+
+
+def test_group_diff_real(any_real_dtype):
+    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, dtype=any_real_dtype)
+    result = df.groupby("a")["b"].diff()
+    exp_dtype = "float"
+    if any_real_dtype in ["int8", "int16", "float32"]:
+        exp_dtype = "float32"
+    expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [
+            Timestamp("2013-01-01"),
+            Timestamp("2013-01-02"),
+            Timestamp("2013-01-03"),
+        ],
+        [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
+    ],
+)
+def test_group_diff_datetimelike(data):
+    df = DataFrame({"a": [1, 2, 2], "b": data})
+    result = df.groupby("a")["b"].diff()
+    expected = Series([NaT, NaT, Timedelta("1 days")], name="b")
+    tm.assert_series_equal(result, expected)
+
+
+def test_group_diff_bool():
+    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
+    result = df.groupby("a")["b"].diff()
+    expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
+    tm.assert_series_equal(result, expected)
+
+
+def test_group_diff_object_raises(object_dtype):
+    df = DataFrame(
+        {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
+    )
+    with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
+        df.groupby("a")["b"].diff()
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -2409,3 +2409,10 @@ def test_diff_ea_axis(self):
         msg = "cannot diff DatetimeArray on axis=1"
         with pytest.raises(ValueError, match=msg):
             algos.diff(dta, 1, axis=1)
+
+    @pytest.mark.parametrize("dtype", ["int8", "int16"])
+    def test_diff_low_precision_int(self, dtype):
+        arr = np.array([0, 1, 1, 0, 0], dtype=dtype)
+        result = algos.diff(arr, 1)
+        expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
+        tm.assert_numpy_array_equal(result, expected)