From 9ac04ebf7577a3961cd32fe5339b2a68b4b97b89 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 Jan 2021 18:31:29 +0100 Subject: [PATCH 1/4] BUG: cumulative functions with ea dtype not handling NA correctly and casting to object --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/nanops.py | 24 +++++++++++++++++++++--- pandas/tests/frame/test_cumulative.py | 20 +++++++++++++++++++- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9b1819a7d4d9f..451cd544ef650 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -424,7 +424,7 @@ ExtensionArray - Bug in :meth:`DataFrame.where` when ``other`` is a :class:`Series` with :class:`ExtensionArray` dtype (:issue:`38729`) - Fixed bug where :meth:`Series.idxmax`, :meth:`Series.idxmin` and ``argmax/min`` fail when the underlying data is :class:`ExtensionArray` (:issue:`32749`, :issue:`33719`, :issue:`36566`) -- +- Bug in cumulative functions (``cumsum``, ``cumprod``, ``cummax`` and ``cummin``) with extension dtypes not handling ``NA`` correctly and returning object dtype (:issue:`39479`) Other ^^^^^ diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 8d3363df0d132..442c8d740f703 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -18,6 +18,7 @@ is_bool_dtype, is_complex, is_datetime64_any_dtype, + is_extension_array_dtype, is_float, is_float_dtype, is_integer, @@ -32,7 +33,7 @@ from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna -from pandas.core.construction import extract_array +from pandas.core.construction import extract_array, sanitize_array bn = import_optional_dependency("bottleneck", errors="warn") _BOTTLENECK_INSTALLED = bn is not None @@ -1687,6 +1688,8 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: np.minimum.accumulate: (np.inf, np.nan), }[accum_func] + is_ea_dtype = is_extension_array_dtype(values.dtype) + # We will be applying this function to block values if values.dtype.kind in ["m", "M"]: # GH#30460, GH#29058 @@ -1728,12 +1731,27 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: result, dtype=orig_dtype ) - elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): + elif ( + skipna + and not issubclass(values.dtype.type, (np.integer, np.bool_)) + or is_ea_dtype + ): + if is_integer_dtype(values.dtype) and np.isinf(mask_a): + mask_a = { + np.maximum.accumulate: np.iinfo(values.dtype.numpy_dtype).min, + np.minimum.accumulate: np.iinfo(values.dtype.numpy_dtype).max, + }[accum_func] + vals = values.copy() mask = isna(vals) + mask_keep = np.copy(mask) if is_ea_dtype else mask vals[mask] = mask_a result = accum_func(vals, axis=0) - result[mask] = mask_b + result[mask_keep] = mask_b + + if is_ea_dtype: + result = sanitize_array(result, None, values.dtype) + else: result = accum_func(values, axis=0) diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py index 248f3500c41df..32dda8eae2eb4 100644 --- a/pandas/tests/frame/test_cumulative.py +++ b/pandas/tests/frame/test_cumulative.py @@ -7,8 +7,9 @@ """ import numpy as np +import pytest -from pandas import DataFrame, Series +from pandas import NA, DataFrame, Series import pandas._testing as tm @@ -133,3 +134,20 @@ def test_cumulative_ops_preserve_dtypes(self): } ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "func, exp", + [ + ("cumsum", [2, NA, 7, 6, 6]), + ("cumprod", [2, NA, 10, -10, 0]), + ("cummin", [2, NA, 2, -1, -1]), + ("cummax", [2, NA, 5, 5, 5]), + ], + ) + @pytest.mark.parametrize("dtype", ["Float64", "Int64"]) + def test_cummulative_ops_extension_dtype(self, frame_or_series, dtype, func, exp): + # GH#39479 + obj = frame_or_series([2, np.nan, 5, -1, 0], dtype=dtype) + result = getattr(obj, func)() + expected = frame_or_series(exp, dtype=dtype) + tm.assert_equal(result, expected) From 8d212a61a7426a686a7c2d0ea01f48781dd9c952 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 Jan 2021 19:44:16 +0100 Subject: [PATCH 2/4] Fix mypy issues --- pandas/core/nanops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 442c8d740f703..5e4cc031deb99 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1738,8 +1738,8 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: ): if is_integer_dtype(values.dtype) and np.isinf(mask_a): mask_a = { - np.maximum.accumulate: np.iinfo(values.dtype.numpy_dtype).min, - np.minimum.accumulate: np.iinfo(values.dtype.numpy_dtype).max, + np.maximum.accumulate: np.iinfo(values.dtype.type).min, + np.minimum.accumulate: np.iinfo(values.dtype.type).max, }[accum_func] vals = values.copy() From ce49358b07694cd13023d1cada0b1c35c53f847c Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 Jan 2021 22:24:37 +0100 Subject: [PATCH 3/4] Refactor elif --- pandas/core/nanops.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 5e4cc031deb99..4cfdac77ad63b 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1688,8 +1688,6 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: np.minimum.accumulate: (np.inf, np.nan), }[accum_func] - is_ea_dtype = is_extension_array_dtype(values.dtype) - # We will be applying this function to block values if values.dtype.kind in ["m", "M"]: # GH#30460, GH#29058 @@ -1731,11 +1729,7 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: result, dtype=orig_dtype ) - elif ( - skipna - and not issubclass(values.dtype.type, (np.integer, np.bool_)) - or is_ea_dtype - ): + elif is_extension_array_dtype(values.dtype): if is_integer_dtype(values.dtype) and np.isinf(mask_a): mask_a = { np.maximum.accumulate: np.iinfo(values.dtype.type).min, @@ -1744,13 +1738,18 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: vals = values.copy() mask = isna(vals) - mask_keep = np.copy(mask) if is_ea_dtype else mask + mask_copy = np.copy(mask) vals[mask] = mask_a result = accum_func(vals, axis=0) - result[mask_keep] = mask_b + result[mask_copy] = mask_b + result = sanitize_array(result, None, values.dtype) - if is_ea_dtype: - result = sanitize_array(result, None, values.dtype) + elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): + vals = values.copy() + mask = isna(vals) + vals[mask] = mask_a + result = accum_func(vals, axis=0) + result[mask] = mask_b else: result = accum_func(values, axis=0) From eb29bf2c7d888f335ac3fcd9433b4931bb694c0d Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 Jan 2021 22:25:29 +0100 Subject: [PATCH 4/4] Remove empty line --- pandas/core/nanops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4cfdac77ad63b..adb1fc3dc7edd 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1750,7 +1750,6 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: vals[mask] = mask_a result = accum_func(vals, axis=0) result[mask] = mask_b - else: result = accum_func(values, axis=0)