Skip to content

Commit 2059a13

Browse files
mzeitlin11 authored and
jreback committed
Backport PR pandas-dev#39069: REGR: diff_2d raising for int8, int16
1 parent d53ff7a commit 2059a13

File tree

5 files changed

+121
-59
lines changed

5 files changed

+121
-59
lines changed

doc/source/whatsnew/v1.2.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Fixed regressions
2424
- Fixed regression in :meth:`.GroupBy.sem` where the presence of non-numeric columns would cause an error instead of being dropped (:issue:`38774`)
2525
- Fixed regression in :func:`read_excel` with non-rawbyte file handles (:issue:`38788`)
2626
- Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings. This resulted in a regression in some cases as the default for ``float_precision`` was changed in pandas 1.2.0 (:issue:`38753`)
27+
- Fixed regression in :meth:`DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`)
2728
- Fixed regression in :meth:`Rolling.skew` and :meth:`Rolling.kurt` modifying the object inplace (:issue:`38908`)
2829
- Fixed regression in :meth:`read_csv` and other read functions where the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`)
2930

pandas/core/algorithms.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1981,7 +1981,13 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):
19811981

19821982
elif is_integer_dtype(dtype):
19831983
# We have to cast in order to be able to hold np.nan
1984-
dtype = np.float64
1984+
1985+
# int8, int16 are incompatible with float64,
1986+
# see https://github.com/cython/cython/issues/2646
1987+
if arr.dtype.name in ["int8", "int16"]:
1988+
dtype = np.float32
1989+
else:
1990+
dtype = np.float64
19851991

19861992
orig_ndim = arr.ndim
19871993
if orig_ndim == 1:

pandas/tests/groupby/test_groupby.py

-58
Original file line numberDiff line numberDiff line change
@@ -1697,64 +1697,6 @@ def test_sort(x):
16971697
g.apply(test_sort)
16981698

16991699

1700-
def test_group_shift_with_null_key():
1701-
# This test is designed to replicate the segfault in issue #13813.
1702-
n_rows = 1200
1703-
1704-
# Generate a moderately large dataframe with occasional missing
1705-
# values in column `B`, and then group by [`A`, `B`]. This should
1706-
# force `-1` in `labels` array of `g.grouper.group_info` exactly
1707-
# at those places, where the group-by key is partially missing.
1708-
df = DataFrame(
1709-
[(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
1710-
dtype=float,
1711-
columns=["A", "B", "Z"],
1712-
index=None,
1713-
)
1714-
g = df.groupby(["A", "B"])
1715-
1716-
expected = DataFrame(
1717-
[(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
1718-
dtype=float,
1719-
columns=["Z"],
1720-
index=None,
1721-
)
1722-
result = g.shift(-1)
1723-
1724-
tm.assert_frame_equal(result, expected)
1725-
1726-
1727-
def test_group_shift_with_fill_value():
1728-
# GH #24128
1729-
n_rows = 24
1730-
df = DataFrame(
1731-
[(i % 12, i % 3, i) for i in range(n_rows)],
1732-
dtype=float,
1733-
columns=["A", "B", "Z"],
1734-
index=None,
1735-
)
1736-
g = df.groupby(["A", "B"])
1737-
1738-
expected = DataFrame(
1739-
[(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
1740-
dtype=float,
1741-
columns=["Z"],
1742-
index=None,
1743-
)
1744-
result = g.shift(-1, fill_value=0)[["Z"]]
1745-
1746-
tm.assert_frame_equal(result, expected)
1747-
1748-
1749-
def test_group_shift_lose_timezone():
1750-
# GH 30134
1751-
now_dt = Timestamp.utcnow()
1752-
df = DataFrame({"a": [1, 1], "date": now_dt})
1753-
result = df.groupby("a").shift(0).iloc[0]
1754-
expected = Series({"date": now_dt}, name=result.name)
1755-
tm.assert_series_equal(result, expected)
1756-
1757-
17581700
def test_pivot_table_values_key_error():
17591701
# This test is designed to replicate the error in issue #14938
17601702
df = DataFrame(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import numpy as np
2+
import pytest
3+
4+
from pandas import DataFrame, NaT, Series, Timedelta, Timestamp
5+
import pandas._testing as tm
6+
7+
8+
def test_group_shift_with_null_key():
9+
# This test is designed to replicate the segfault in issue #13813.
10+
n_rows = 1200
11+
12+
# Generate a moderately large dataframe with occasional missing
13+
# values in column `B`, and then group by [`A`, `B`]. This should
14+
# force `-1` in `labels` array of `g.grouper.group_info` exactly
15+
# at those places, where the group-by key is partially missing.
16+
df = DataFrame(
17+
[(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
18+
dtype=float,
19+
columns=["A", "B", "Z"],
20+
index=None,
21+
)
22+
g = df.groupby(["A", "B"])
23+
24+
expected = DataFrame(
25+
[(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
26+
dtype=float,
27+
columns=["Z"],
28+
index=None,
29+
)
30+
result = g.shift(-1)
31+
32+
tm.assert_frame_equal(result, expected)
33+
34+
35+
def test_group_shift_with_fill_value():
36+
# GH #24128
37+
n_rows = 24
38+
df = DataFrame(
39+
[(i % 12, i % 3, i) for i in range(n_rows)],
40+
dtype=float,
41+
columns=["A", "B", "Z"],
42+
index=None,
43+
)
44+
g = df.groupby(["A", "B"])
45+
46+
expected = DataFrame(
47+
[(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
48+
dtype=float,
49+
columns=["Z"],
50+
index=None,
51+
)
52+
result = g.shift(-1, fill_value=0)[["Z"]]
53+
54+
tm.assert_frame_equal(result, expected)
55+
56+
57+
def test_group_shift_lose_timezone():
58+
# GH 30134
59+
now_dt = Timestamp.utcnow()
60+
df = DataFrame({"a": [1, 1], "date": now_dt})
61+
result = df.groupby("a").shift(0).iloc[0]
62+
expected = Series({"date": now_dt}, name=result.name)
63+
tm.assert_series_equal(result, expected)
64+
65+
66+
def test_group_diff_real(any_real_dtype):
67+
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, dtype=any_real_dtype)
68+
result = df.groupby("a")["b"].diff()
69+
exp_dtype = "float"
70+
if any_real_dtype in ["int8", "int16", "float32"]:
71+
exp_dtype = "float32"
72+
expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
73+
tm.assert_series_equal(result, expected)
74+
75+
76+
@pytest.mark.parametrize(
77+
"data",
78+
[
79+
[
80+
Timestamp("2013-01-01"),
81+
Timestamp("2013-01-02"),
82+
Timestamp("2013-01-03"),
83+
],
84+
[Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
85+
],
86+
)
87+
def test_group_diff_datetimelike(data):
88+
df = DataFrame({"a": [1, 2, 2], "b": data})
89+
result = df.groupby("a")["b"].diff()
90+
expected = Series([NaT, NaT, Timedelta("1 days")], name="b")
91+
tm.assert_series_equal(result, expected)
92+
93+
94+
def test_group_diff_bool():
95+
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
96+
result = df.groupby("a")["b"].diff()
97+
expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
98+
tm.assert_series_equal(result, expected)
99+
100+
101+
def test_group_diff_object_raises(object_dtype):
102+
df = DataFrame(
103+
{"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
104+
)
105+
with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
106+
df.groupby("a")["b"].diff()

pandas/tests/test_algos.py

+7
Original file line numberDiff line numberDiff line change
@@ -2409,3 +2409,10 @@ def test_diff_ea_axis(self):
24092409
msg = "cannot diff DatetimeArray on axis=1"
24102410
with pytest.raises(ValueError, match=msg):
24112411
algos.diff(dta, 1, axis=1)
2412+
2413+
@pytest.mark.parametrize("dtype", ["int8", "int16"])
2414+
def test_diff_low_precision_int(self, dtype):
2415+
arr = np.array([0, 1, 1, 0, 0], dtype=dtype)
2416+
result = algos.diff(arr, 1)
2417+
expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
2418+
tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)