From 72cd1ac9d5092f948870c73a989d3b036005bac2 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sat, 9 Jan 2021 14:42:07 -0500
Subject: [PATCH 1/6] REGR: diff_2d low precision int

---
 pandas/core/algorithms.py  | 6 +++++-
 pandas/tests/test_algos.py | 7 +++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 1291fc25fc21d..cd63ca0d61a67 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1991,7 +1991,11 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
 
     elif is_integer_dtype(dtype):
         # We have to cast in order to be able to hold np.nan
-        dtype = np.float64
+        # int8, int16 are incompatible with float64
+        if np.dtype(dtype) in [np.int8, np.int16]:
+            dtype = np.float32
+        else:
+            dtype = np.float64
 
     orig_ndim = arr.ndim
     if orig_ndim == 1:
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index a5c71b9ea3286..93900fa223966 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -2409,3 +2409,10 @@ def test_diff_ea_axis(self):
         msg = "cannot diff DatetimeArray on axis=1"
         with pytest.raises(ValueError, match=msg):
             algos.diff(dta, 1, axis=1)
+
+    @pytest.mark.parametrize("dtype", ["int8", "int16"])
+    def test_diff_low_precision_int(self, dtype):
+        arr = np.array([0, 1, 1, 0, 0], dtype=dtype)
+        result = algos.diff(arr, 1)
+        expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
+        tm.assert_numpy_array_equal(result, expected)

From 5c818d8391776adbe152e4009311b1299d425110 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sat, 9 Jan 2021 15:24:32 -0500
Subject: [PATCH 2/6] Add whatsnew

---
 doc/source/whatsnew/v1.2.1.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
index 4b7a4180ee9f9..c2300a164c128 100644
--- a/doc/source/whatsnew/v1.2.1.rst
+++ b/doc/source/whatsnew/v1.2.1.rst
@@ -26,6 +26,7 @@ Fixed regressions
 - Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings. This resulted in a regression in some cases as the default for ``float_precision`` was changed in pandas 1.2.0 (:issue:`38753`)
 - Fixed regression in :meth:`Rolling.skew` and :meth:`Rolling.kurt` modifying the object inplace (:issue:`38908`)
 - Fixed regression in :meth:`read_csv` and other read functions were the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`)
+- Fixed regression in :meth:`DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`)
 
 .. ---------------------------------------------------------------------------
 

From 4278126a5235faf5906be83f2ccefbf1acfb4115 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sat, 9 Jan 2021 15:29:27 -0500
Subject: [PATCH 3/6] Use dtype name like other condition

---
 pandas/core/algorithms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index cd63ca0d61a67..35c7a9d94f53b 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1992,7 +1992,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
     elif is_integer_dtype(dtype):
         # We have to cast in order to be able to hold np.nan
         # int8, int16 are incompatible with float64
-        if np.dtype(dtype) in [np.int8, np.int16]:
+        if arr.dtype.name in ["int8", "int16"]:
             dtype = np.float32
         else:
             dtype = np.float64

From ca5fa87a87730f19b45cec4d491609db5d09f544 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sat, 9 Jan 2021 18:43:51 -0500
Subject: [PATCH 4/6] Address review comments: cite Cython issue, remove shift tests from test_groupby.py

---
 pandas/core/algorithms.py            |  4 +-
 pandas/tests/groupby/test_groupby.py | 58 ----------------------------
 2 files changed, 3 insertions(+), 59 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 35c7a9d94f53b..085ad5e6a0dcf 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1991,7 +1991,9 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
 
     elif is_integer_dtype(dtype):
         # We have to cast in order to be able to hold np.nan
-        # int8, int16 are incompatible with float64
+
+        # int8, int16 are incompatible with float64,
+        # see https://github.com/cython/cython/issues/2646
         if arr.dtype.name in ["int8", "int16"]:
             dtype = np.float32
         else:
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 5735f895e33b6..dd836591d5965 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1698,64 +1698,6 @@ def test_sort(x):
     g.apply(test_sort)
 
 
-def test_group_shift_with_null_key():
-    # This test is designed to replicate the segfault in issue #13813.
-    n_rows = 1200
-
-    # Generate a moderately large dataframe with occasional missing
-    # values in column `B`, and then group by [`A`, `B`]. This should
-    # force `-1` in `labels` array of `g.grouper.group_info` exactly
-    # at those places, where the group-by key is partially missing.
-    df = DataFrame(
-        [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
-        dtype=float,
-        columns=["A", "B", "Z"],
-        index=None,
-    )
-    g = df.groupby(["A", "B"])
-
-    expected = DataFrame(
-        [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
-        dtype=float,
-        columns=["Z"],
-        index=None,
-    )
-    result = g.shift(-1)
-
-    tm.assert_frame_equal(result, expected)
-
-
-def test_group_shift_with_fill_value():
-    # GH #24128
-    n_rows = 24
-    df = DataFrame(
-        [(i % 12, i % 3, i) for i in range(n_rows)],
-        dtype=float,
-        columns=["A", "B", "Z"],
-        index=None,
-    )
-    g = df.groupby(["A", "B"])
-
-    expected = DataFrame(
-        [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
-        dtype=float,
-        columns=["Z"],
-        index=None,
-    )
-    result = g.shift(-1, fill_value=0)[["Z"]]
-
-    tm.assert_frame_equal(result, expected)
-
-
-def test_group_shift_lose_timezone():
-    # GH 30134
-    now_dt = Timestamp.utcnow()
-    df = DataFrame({"a": [1, 1], "date": now_dt})
-    result = df.groupby("a").shift(0).iloc[0]
-    expected = Series({"date": now_dt}, name=result.name)
-    tm.assert_series_equal(result, expected)
-
-
 def test_pivot_table_values_key_error():
     # This test is designed to replicate the error in issue #14938
     df = DataFrame(

From 01e1fddaaf23aa1543abdce0e2e29574d418f95e Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sat, 9 Jan 2021 18:47:08 -0500
Subject: [PATCH 5/6] Add test_groupby_shift_diff.py with groupby shift and diff tests

---
 .../tests/groupby/test_groupby_shift_diff.py  | 112 ++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 pandas/tests/groupby/test_groupby_shift_diff.py

diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py
new file mode 100644
index 0000000000000..e9068a56b8efd
--- /dev/null
+++ b/pandas/tests/groupby/test_groupby_shift_diff.py
@@ -0,0 +1,112 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    NaT,
+    Series,
+    Timedelta,
+    Timestamp,
+)
+import pandas._testing as tm
+
+
+def test_group_shift_with_null_key():
+    # This test is designed to replicate the segfault in issue #13813.
+    n_rows = 1200
+
+    # Generate a moderately large dataframe with occasional missing
+    # values in column `B`, and then group by [`A`, `B`]. This should
+    # force `-1` in `labels` array of `g.grouper.group_info` exactly
+    # at those places, where the group-by key is partially missing.
+    df = DataFrame(
+        [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
+        dtype=float,
+        columns=["A", "B", "Z"],
+        index=None,
+    )
+    g = df.groupby(["A", "B"])
+
+    expected = DataFrame(
+        [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
+        dtype=float,
+        columns=["Z"],
+        index=None,
+    )
+    result = g.shift(-1)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_group_shift_with_fill_value():
+    # GH #24128
+    n_rows = 24
+    df = DataFrame(
+        [(i % 12, i % 3, i) for i in range(n_rows)],
+        dtype=float,
+        columns=["A", "B", "Z"],
+        index=None,
+    )
+    g = df.groupby(["A", "B"])
+
+    expected = DataFrame(
+        [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
+        dtype=float,
+        columns=["Z"],
+        index=None,
+    )
+    result = g.shift(-1, fill_value=0)[["Z"]]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_group_shift_lose_timezone():
+    # GH 30134
+    now_dt = Timestamp.utcnow()
+    df = DataFrame({"a": [1, 1], "date": now_dt})
+    result = df.groupby("a").shift(0).iloc[0]
+    expected = Series({"date": now_dt}, name=result.name)
+    tm.assert_series_equal(result, expected)
+
+
+def test_group_diff_real(any_real_dtype):
+    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, dtype=any_real_dtype)
+    result = df.groupby("a")["b"].diff()
+    exp_dtype = "float"
+    if any_real_dtype in ["int8", "int16", "float32"]:
+        exp_dtype = "float32"
+    expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [
+            Timestamp("2013-01-01"),
+            Timestamp("2013-01-02"),
+            Timestamp("2013-01-03"),
+        ],
+        [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
+    ],
+)
+def test_group_diff_datetimelike(data):
+    df = DataFrame({"a": [1, 2, 2], "b": data})
+    result = df.groupby("a")["b"].diff()
+    expected = Series([NaT, NaT, Timedelta("1 days")], name="b")
+    tm.assert_series_equal(result, expected)
+
+
+def test_group_diff_bool():
+    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
+    result = df.groupby("a")["b"].diff()
+    expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
+    tm.assert_series_equal(result, expected)
+
+
+def test_group_diff_object_raises(object_dtype):
+    df = DataFrame(
+        {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
+    )
+    with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
+        df.groupby("a")["b"].diff()
\ No newline at end of file

From ae7ed31983f811f568d2ae6febb72ef4d096eda8 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Sat, 9 Jan 2021 18:47:46 -0500
Subject: [PATCH 6/6] Precommit fixup

---
 pandas/tests/groupby/test_groupby_shift_diff.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py
index e9068a56b8efd..1410038274152 100644
--- a/pandas/tests/groupby/test_groupby_shift_diff.py
+++ b/pandas/tests/groupby/test_groupby_shift_diff.py
@@ -1,13 +1,7 @@
 import numpy as np
 import pytest
 
-from pandas import (
-    DataFrame,
-    NaT,
-    Series,
-    Timedelta,
-    Timestamp,
-)
+from pandas import DataFrame, NaT, Series, Timedelta, Timestamp
 import pandas._testing as tm
 
 
@@ -109,4 +103,4 @@ def test_group_diff_object_raises(object_dtype):
         {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
     )
     with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
-        df.groupby("a")["b"].diff()
\ No newline at end of file
+        df.groupby("a")["b"].diff()