Skip to content

Commit 1e900ec

Browse files
authored
REGR: diff_2d raising for int8, int16 (#39069)
1 parent efbd098 commit 1e900ec

File tree

5 files changed

+121
-59
lines changed

5 files changed

+121
-59
lines changed

doc/source/whatsnew/v1.2.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Fixed regressions
2727
- Fixed regression in :meth:`Rolling.skew` and :meth:`Rolling.kurt` modifying the object inplace (:issue:`38908`)
2828
- Fixed regression in :meth:`read_csv` and other read functions where the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`)
2929
- Fixed regression in :meth:`DataFrame.replace` raising ValueError when :class:`DataFrame` has dtype ``bytes`` (:issue:`38900`)
30+
- Fixed regression in :meth:`DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`)
3031

3132
.. ---------------------------------------------------------------------------
3233

pandas/core/algorithms.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1991,7 +1991,13 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
19911991

19921992
elif is_integer_dtype(dtype):
19931993
# We have to cast in order to be able to hold np.nan
1994-
dtype = np.float64
1994+
1995+
# diff_2d raises for int8/int16 input with a float64 result, so use float32;
1996+
# see https://github.com/cython/cython/issues/2646
1997+
if arr.dtype.name in ["int8", "int16"]:
1998+
dtype = np.float32
1999+
else:
2000+
dtype = np.float64
19952001

19962002
orig_ndim = arr.ndim
19972003
if orig_ndim == 1:

pandas/tests/groupby/test_groupby.py

-58
Original file line numberDiff line numberDiff line change
@@ -1698,64 +1698,6 @@ def test_sort(x):
16981698
g.apply(test_sort)
16991699

17001700

1701-
def test_group_shift_with_null_key():
1702-
# This test is designed to replicate the segfault in issue #13813.
1703-
n_rows = 1200
1704-
1705-
# Generate a moderately large dataframe with occasional missing
1706-
# values in column `B`, and then group by [`A`, `B`]. This should
1707-
# force `-1` in `labels` array of `g.grouper.group_info` exactly
1708-
# at those places, where the group-by key is partially missing.
1709-
df = DataFrame(
1710-
[(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
1711-
dtype=float,
1712-
columns=["A", "B", "Z"],
1713-
index=None,
1714-
)
1715-
g = df.groupby(["A", "B"])
1716-
1717-
expected = DataFrame(
1718-
[(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
1719-
dtype=float,
1720-
columns=["Z"],
1721-
index=None,
1722-
)
1723-
result = g.shift(-1)
1724-
1725-
tm.assert_frame_equal(result, expected)
1726-
1727-
1728-
def test_group_shift_with_fill_value():
1729-
# GH #24128
1730-
n_rows = 24
1731-
df = DataFrame(
1732-
[(i % 12, i % 3, i) for i in range(n_rows)],
1733-
dtype=float,
1734-
columns=["A", "B", "Z"],
1735-
index=None,
1736-
)
1737-
g = df.groupby(["A", "B"])
1738-
1739-
expected = DataFrame(
1740-
[(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
1741-
dtype=float,
1742-
columns=["Z"],
1743-
index=None,
1744-
)
1745-
result = g.shift(-1, fill_value=0)[["Z"]]
1746-
1747-
tm.assert_frame_equal(result, expected)
1748-
1749-
1750-
def test_group_shift_lose_timezone():
1751-
# GH 30134
1752-
now_dt = Timestamp.utcnow()
1753-
df = DataFrame({"a": [1, 1], "date": now_dt})
1754-
result = df.groupby("a").shift(0).iloc[0]
1755-
expected = Series({"date": now_dt}, name=result.name)
1756-
tm.assert_series_equal(result, expected)
1757-
1758-
17591701
def test_pivot_table_values_key_error():
17601702
# This test is designed to replicate the error in issue #14938
17611703
df = DataFrame(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import numpy as np
2+
import pytest
3+
4+
from pandas import DataFrame, NaT, Series, Timedelta, Timestamp
5+
import pandas._testing as tm
6+
7+
8+
def test_group_shift_with_null_key():
9+
# This test is designed to replicate the segfault in issue #13813.
10+
n_rows = 1200
11+
12+
# Generate a moderately large dataframe with occasional missing
13+
# values in column `B`, and then group by [`A`, `B`]. This should
14+
# force `-1` in `labels` array of `g.grouper.group_info` exactly
15+
# at those places, where the group-by key is partially missing.
16+
df = DataFrame(
17+
[(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
18+
dtype=float,
19+
columns=["A", "B", "Z"],
20+
index=None,
21+
)
22+
g = df.groupby(["A", "B"])
23+
24+
expected = DataFrame(
25+
[(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
26+
dtype=float,
27+
columns=["Z"],
28+
index=None,
29+
)
30+
result = g.shift(-1)
31+
32+
tm.assert_frame_equal(result, expected)
33+
34+
35+
def test_group_shift_with_fill_value():
36+
# GH #24128
37+
n_rows = 24
38+
df = DataFrame(
39+
[(i % 12, i % 3, i) for i in range(n_rows)],
40+
dtype=float,
41+
columns=["A", "B", "Z"],
42+
index=None,
43+
)
44+
g = df.groupby(["A", "B"])
45+
46+
expected = DataFrame(
47+
[(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
48+
dtype=float,
49+
columns=["Z"],
50+
index=None,
51+
)
52+
result = g.shift(-1, fill_value=0)[["Z"]]
53+
54+
tm.assert_frame_equal(result, expected)
55+
56+
57+
def test_group_shift_lose_timezone():
58+
# GH 30134
59+
now_dt = Timestamp.utcnow()
60+
df = DataFrame({"a": [1, 1], "date": now_dt})
61+
result = df.groupby("a").shift(0).iloc[0]
62+
expected = Series({"date": now_dt}, name=result.name)
63+
tm.assert_series_equal(result, expected)
64+
65+
66+
def test_group_diff_real(any_real_dtype):
67+
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, dtype=any_real_dtype)
68+
result = df.groupby("a")["b"].diff()
69+
exp_dtype = "float"
70+
if any_real_dtype in ["int8", "int16", "float32"]:
71+
exp_dtype = "float32"
72+
expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
73+
tm.assert_series_equal(result, expected)
74+
75+
76+
@pytest.mark.parametrize(
77+
"data",
78+
[
79+
[
80+
Timestamp("2013-01-01"),
81+
Timestamp("2013-01-02"),
82+
Timestamp("2013-01-03"),
83+
],
84+
[Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
85+
],
86+
)
87+
def test_group_diff_datetimelike(data):
88+
df = DataFrame({"a": [1, 2, 2], "b": data})
89+
result = df.groupby("a")["b"].diff()
90+
expected = Series([NaT, NaT, Timedelta("1 days")], name="b")
91+
tm.assert_series_equal(result, expected)
92+
93+
94+
def test_group_diff_bool():
95+
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
96+
result = df.groupby("a")["b"].diff()
97+
expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
98+
tm.assert_series_equal(result, expected)
99+
100+
101+
def test_group_diff_object_raises(object_dtype):
102+
df = DataFrame(
103+
{"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
104+
)
105+
with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
106+
df.groupby("a")["b"].diff()

pandas/tests/test_algos.py

+7
Original file line numberDiff line numberDiff line change
@@ -2409,3 +2409,10 @@ def test_diff_ea_axis(self):
24092409
msg = "cannot diff DatetimeArray on axis=1"
24102410
with pytest.raises(ValueError, match=msg):
24112411
algos.diff(dta, 1, axis=1)
2412+
2413+
@pytest.mark.parametrize("dtype", ["int8", "int16"])
2414+
def test_diff_low_precision_int(self, dtype):
2415+
arr = np.array([0, 1, 1, 0, 0], dtype=dtype)
2416+
result = algos.diff(arr, 1)
2417+
expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
2418+
tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)