From 72cd1ac9d5092f948870c73a989d3b036005bac2 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 9 Jan 2021 14:42:07 -0500 Subject: [PATCH 1/6] REGR: diff_2d low precision int --- pandas/core/algorithms.py | 6 +++++- pandas/tests/test_algos.py | 7 +++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1291fc25fc21d..cd63ca0d61a67 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1991,7 +1991,11 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): elif is_integer_dtype(dtype): # We have to cast in order to be able to hold np.nan - dtype = np.float64 + # int8, int16 are incompatible with float64 + if np.dtype(dtype) in [np.int8, np.int16]: + dtype = np.float32 + else: + dtype = np.float64 orig_ndim = arr.ndim if orig_ndim == 1: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a5c71b9ea3286..93900fa223966 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2409,3 +2409,10 @@ def test_diff_ea_axis(self): msg = "cannot diff DatetimeArray on axis=1" with pytest.raises(ValueError, match=msg): algos.diff(dta, 1, axis=1) + + @pytest.mark.parametrize("dtype", ["int8", "int16"]) + def test_diff_low_precision_int(self, dtype): + arr = np.array([0, 1, 1, 0, 0], dtype=dtype) + result = algos.diff(arr, 1) + expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32") + tm.assert_numpy_array_equal(result, expected) From 5c818d8391776adbe152e4009311b1299d425110 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 9 Jan 2021 15:24:32 -0500 Subject: [PATCH 2/6] Add whatsnew --- doc/source/whatsnew/v1.2.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 4b7a4180ee9f9..c2300a164c128 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -26,6 +26,7 @@ Fixed regressions - Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings. This resulted in a regression in some cases as the default for ``float_precision`` was changed in pandas 1.2.0 (:issue:`38753`) - Fixed regression in :meth:`Rolling.skew` and :meth:`Rolling.kurt` modifying the object inplace (:issue:`38908`) - Fixed regression in :meth:`read_csv` and other read functions were the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`) +- Fixed regression in :meth:`DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`) .. --------------------------------------------------------------------------- From 4278126a5235faf5906be83f2ccefbf1acfb4115 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 9 Jan 2021 15:29:27 -0500 Subject: [PATCH 3/6] Use dtype name like other condition --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index cd63ca0d61a67..35c7a9d94f53b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1992,7 +1992,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): elif is_integer_dtype(dtype): # We have to cast in order to be able to hold np.nan # int8, int16 are incompatible with float64 - if np.dtype(dtype) in [np.int8, np.int16]: + if arr.dtype.name in ["int8", "int16"]: dtype = np.float32 else: dtype = np.float64 From ca5fa87a87730f19b45cec4d491609db5d09f544 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 9 Jan 2021 18:43:51 -0500 Subject: [PATCH 4/6] Address comments --- pandas/core/algorithms.py | 4 +- pandas/tests/groupby/test_groupby.py | 58 ---------------------------- 2 files changed, 3 insertions(+), 59 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 35c7a9d94f53b..085ad5e6a0dcf 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1991,7 +1991,9 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): elif is_integer_dtype(dtype): # We have to cast in order to be able to hold np.nan - # int8, int16 are incompatible with float64 + + # int8, int16 are incompatible with float64, + # see https://github.com/cython/cython/issues/2646 if arr.dtype.name in ["int8", "int16"]: dtype = np.float32 else: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5735f895e33b6..dd836591d5965 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1698,64 +1698,6 @@ def test_sort(x): g.apply(test_sort) -def test_group_shift_with_null_key(): - # This test is designed to replicate the segfault in issue #13813. - n_rows = 1200 - - # Generate a moderately large dataframe with occasional missing - # values in column `B`, and then group by [`A`, `B`]. This should - # force `-1` in `labels` array of `g.grouper.group_info` exactly - # at those places, where the group-by key is partially missing. - df = DataFrame( - [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], - dtype=float, - columns=["A", "B", "Z"], - index=None, - ) - g = df.groupby(["A", "B"]) - - expected = DataFrame( - [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], - dtype=float, - columns=["Z"], - index=None, - ) - result = g.shift(-1) - - tm.assert_frame_equal(result, expected) - - -def test_group_shift_with_fill_value(): - # GH #24128 - n_rows = 24 - df = DataFrame( - [(i % 12, i % 3, i) for i in range(n_rows)], - dtype=float, - columns=["A", "B", "Z"], - index=None, - ) - g = df.groupby(["A", "B"]) - - expected = DataFrame( - [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], - dtype=float, - columns=["Z"], - index=None, - ) - result = g.shift(-1, fill_value=0)[["Z"]] - - tm.assert_frame_equal(result, expected) - - -def test_group_shift_lose_timezone(): - # GH 30134 - now_dt = Timestamp.utcnow() - df = DataFrame({"a": [1, 1], "date": now_dt}) - result = df.groupby("a").shift(0).iloc[0] - expected = Series({"date": now_dt}, name=result.name) - tm.assert_series_equal(result, expected) - - def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = DataFrame( From 01e1fddaaf23aa1543abdce0e2e29574d418f95e Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 9 Jan 2021 18:47:08 -0500 Subject: [PATCH 5/6] Add new test file --- .../tests/groupby/test_groupby_shift_diff.py | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 pandas/tests/groupby/test_groupby_shift_diff.py diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py new file mode 100644 index 0000000000000..e9068a56b8efd --- /dev/null +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -0,0 +1,112 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + NaT, + Series, + Timedelta, + Timestamp, +) +import pandas._testing as tm + + +def test_group_shift_with_null_key(): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `g.grouper.group_info` exactly + # at those places, where the group-by key is partially missing. + df = DataFrame( + [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1) + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_with_fill_value(): + # GH #24128 + n_rows = 24 + df = DataFrame( + [(i % 12, i % 3, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1, fill_value=0)[["Z"]] + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_lose_timezone(): + # GH 30134 + now_dt = Timestamp.utcnow() + df = DataFrame({"a": [1, 1], "date": now_dt}) + result = df.groupby("a").shift(0).iloc[0] + expected = Series({"date": now_dt}, name=result.name) + tm.assert_series_equal(result, expected) + + +def test_group_diff_real(any_real_dtype): + df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, dtype=any_real_dtype) + result = df.groupby("a")["b"].diff() + exp_dtype = "float" + if any_real_dtype in ["int8", "int16", "float32"]: + exp_dtype = "float32" + expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + [ + Timestamp("2013-01-01"), + Timestamp("2013-01-02"), + Timestamp("2013-01-03"), + ], + [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")], + ], +) +def test_group_diff_datetimelike(data): + df = DataFrame({"a": [1, 2, 2], "b": data}) + result = df.groupby("a")["b"].diff() + expected = Series([NaT, NaT, Timedelta("1 days")], name="b") + tm.assert_series_equal(result, expected) + + +def test_group_diff_bool(): + df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) + result = df.groupby("a")["b"].diff() + expected = Series([np.nan, np.nan, np.nan, False, False], name="b") + tm.assert_series_equal(result, expected) + + +def test_group_diff_object_raises(object_dtype): + df = DataFrame( + {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype + ) + with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"): + df.groupby("a")["b"].diff() \ No newline at end of file From ae7ed31983f811f568d2ae6febb72ef4d096eda8 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 9 Jan 2021 18:47:46 -0500 Subject: [PATCH 6/6] Precommit fixup --- pandas/tests/groupby/test_groupby_shift_diff.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py index e9068a56b8efd..1410038274152 100644 --- a/pandas/tests/groupby/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -1,13 +1,7 @@ import numpy as np import pytest -from pandas import ( - DataFrame, - NaT, - Series, - Timedelta, - Timestamp, -) +from pandas import DataFrame, NaT, Series, Timedelta, Timestamp import pandas._testing as tm @@ -109,4 +103,4 @@ def test_group_diff_object_raises(object_dtype): {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype ) with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"): - df.groupby("a")["b"].diff() \ No newline at end of file + df.groupby("a")["b"].diff()