diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ec6ad38bbc7cf..ffff720ed1c06 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -727,6 +727,7 @@ Deprecations - Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`) - The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`) - The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`) +- :meth:`~DataFrame.diff` will raise a ``TypeError`` rather than implicitly losing the dtype of extension types in the future. Convert to the correct dtype before calling ``diff`` instead (:issue:`31025`) **Selecting Columns from a Grouped DataFrame** @@ -1018,6 +1019,8 @@ Numeric - Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`) - Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`) - Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`) +- Bug in :meth:`~DataFrame.diff` losing the dtype for extension types (:issue:`30889`) +- Bug in :meth:`DataFrame.diff` raising an ``IndexError`` when one of the columns was a nullable integer dtype (:issue:`30967`) Conversion ^^^^^^^^^^ @@ -1158,7 +1161,7 @@ Sparse ^^^^^^ - Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`) - Bug in ``DataFrame.sparse`` returning a ``Series`` when there was a column named ``sparse`` rather than the accessor (:issue:`30758`) -- +- Fixed :func:`operator.xor` with a boolean-dtype ``SparseArray``. 
Now returns a sparse result, rather than object dtype (:issue:`31025`) ExtensionArray ^^^^^^^^^^^^^^ diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in index 62ea477167b72..996da4ca2f92b 100644 --- a/pandas/_libs/sparse_op_helper.pxi.in +++ b/pandas/_libs/sparse_op_helper.pxi.in @@ -84,7 +84,8 @@ def get_op(tup): 'ge': '{0} >= {1}', 'and': '{0} & {1}', # logical op - 'or': '{0} | {1}'} + 'or': '{0} | {1}', + 'xor': '{0} ^ {1}'} return ops_dict[opname].format(lval, rval) @@ -94,7 +95,7 @@ def get_dispatch(dtypes): ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv', 'floordiv', 'pow', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', - 'and', 'or'] + 'and', 'or', 'xor'] for opname in ops_list: for dtype, arith_comp_group, logical_group in dtypes: @@ -104,13 +105,13 @@ def get_dispatch(dtypes): elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): # comparison op rdtype = 'uint8' - elif opname in ('and', 'or'): + elif opname in ('and', 'or', 'xor'): # logical op rdtype = 'uint8' else: rdtype = dtype - if opname in ('and', 'or'): + if opname in ('and', 'or', 'xor'): if logical_group: yield opname, dtype, rdtype else: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 59256f6924b79..8af9e2cc9790f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2,6 +2,7 @@ Generic data algorithms. 
This module is experimental at the moment and not intended for public consumption """ +import operator from textwrap import dedent from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union from warnings import catch_warnings, simplefilter, warn @@ -1812,7 +1813,7 @@ def searchsorted(arr, value, side="left", sorter=None): _diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"} -def diff(arr, n: int, axis: int = 0): +def diff(arr, n: int, axis: int = 0, stacklevel=3): """ difference of n between self, analogous to s-s.shift(n) @@ -1824,16 +1825,42 @@ def diff(arr, n: int, axis: int = 0): number of periods axis : int axis to shift on + stacklevel : int + The stacklevel for the lost dtype warning. Returns ------- shifted """ + from pandas.core.arrays import PandasDtype n = int(n) na = np.nan dtype = arr.dtype + if dtype.kind == "b": + op = operator.xor + else: + op = operator.sub + + if isinstance(dtype, PandasDtype): + # PandasArray cannot necessarily hold shifted versions of itself. + arr = np.asarray(arr) + dtype = arr.dtype + + if is_extension_array_dtype(dtype): + if hasattr(arr, f"__{op.__name__}__"): + return op(arr, arr.shift(n)) + else: + warn( + "dtype lost in 'diff()'. In the future this will raise a " + "TypeError. 
Convert to a suitable dtype prior to calling 'diff'.", + FutureWarning, + stacklevel=stacklevel, + ) + arr = np.asarray(arr) + dtype = arr.dtype + is_timedelta = False is_bool = False if needs_i8_conversion(arr): diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index e2562a375515d..75dd603aa6c7b 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -141,7 +141,7 @@ def _sparse_array_op( left, right = right, left name = name[1:] - if name in ("and", "or") and dtype == "bool": + if name in ("and", "or", "xor") and dtype == "bool": opname = f"sparse_{name}_uint8" # to make template simple, cast here left_sp_values = left.sp_values.view(np.uint8) @@ -1459,6 +1459,7 @@ def _add_unary_ops(cls): def _add_comparison_ops(cls): cls.__and__ = cls._create_comparison_method(operator.and_) cls.__or__ = cls._create_comparison_method(operator.or_) + cls.__xor__ = cls._create_arithmetic_method(operator.xor) super()._add_comparison_ops() # ---------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 40a5ce25f4422..5d802e8a6a77f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6584,6 +6584,11 @@ def diff(self, periods=1, axis=0) -> "DataFrame": DataFrame.shift: Shift index by desired number of periods with an optional time freq. + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. 
+ Examples -------- Difference with previous row diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 58ee72ace4d37..5ce8a2d66d58f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1280,7 +1280,10 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): def diff(self, n: int, axis: int = 1) -> List["Block"]: """ return block for the diff of the values """ - new_values = algos.diff(self.values, n, axis=axis) + new_values = algos.diff(self.values, n, axis=axis, stacklevel=7) + # We use block_shape for ExtensionBlock subclasses, which may call here + # via a super. + new_values = _block_shape(new_values, ndim=self.ndim) return [self.make_block(values=new_values)] def shift(self, periods, axis=0, fill_value=None): @@ -1860,6 +1863,12 @@ def interpolate( placement=self.mgr_locs, ) + def diff(self, n: int, axis: int = 1) -> List["Block"]: + if axis == 1: + # we are by definition 1D. + axis = 0 + return super().diff(n, axis) + def shift( self, periods: int, diff --git a/pandas/core/series.py b/pandas/core/series.py index ffe0642f799fa..e4883c4f0c38d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2241,6 +2241,11 @@ def diff(self, periods=1) -> "Series": optional time freq. DataFrame.diff: First discrete difference of object. + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. 
+ Examples -------- Difference with previous row @@ -2277,7 +2282,7 @@ def diff(self, periods=1) -> "Series": 5 NaN dtype: float64 """ - result = algorithms.diff(com.values_from_object(self), periods) + result = algorithms.diff(self.array, periods) return self._constructor(result, index=self.index).__finalize__(self) def autocorr(self, lag=1) -> float: diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 5ff0bb8ef0d78..835aa87a7c21b 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -90,6 +90,21 @@ def test_isin_empty(empty): tm.assert_numpy_array_equal(expected, result) +def test_diff(): + s = pd.Series([1, 2, 3], dtype="category") + with tm.assert_produces_warning(FutureWarning): + result = s.diff() + expected = pd.Series([np.nan, 1, 1]) + tm.assert_series_equal(result, expected) + + expected = expected.to_frame(name="A") + df = s.to_frame(name="A") + with tm.assert_produces_warning(FutureWarning): + result = df.diff() + + tm.assert_frame_equal(result, expected) + + class TestTake: # https://github.com/pandas-dev/pandas/issues/20664 diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 76442a63ccb0f..73652da78654f 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -388,6 +388,14 @@ def test_mixed_array_comparison(self, kind): assert b.dtype == SparseDtype(rdtype, fill_value=2) self._check_comparison_ops(a, b, values, rvalues) + def test_xor(self): + s = SparseArray([True, True, False, False]) + t = SparseArray([True, False, True, False]) + result = s ^ t + sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32")) + expected = SparseArray([False, True, True], sparse_index=sp_index) + tm.assert_sp_array_equal(result, expected) + @pytest.mark.parametrize("op", [operator.eq, operator.add]) def 
test_with_list(op): diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index cc8d0cdcb518d..6e361b2810d54 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -879,3 +879,19 @@ def test_value_counts_na(): result = arr.value_counts(dropna=True) expected = pd.Series([1, 1], index=[True, False], dtype="Int64") tm.assert_series_equal(result, expected) + + +def test_diff(): + a = pd.array( + [True, True, False, False, True, None, True, None, False], dtype="boolean" + ) + result = pd.core.algorithms.diff(a, 1) + expected = pd.array( + [None, False, True, False, True, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = s.diff() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 6b75176ebd35b..4a84a21084de2 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -1,6 +1,10 @@ +import operator + import numpy as np import pytest +from pandas.core.dtypes.common import is_bool_dtype + import pandas as pd import pandas._testing as tm from pandas.core.sorting import nargsort @@ -231,6 +235,32 @@ def test_container_shift(self, data, frame, periods, indices): compare(result, expected) + @pytest.mark.parametrize("periods", [1, -2]) + def test_diff(self, data, periods): + data = data[:5] + if is_bool_dtype(data.dtype): + op = operator.xor + else: + op = operator.sub + try: + # does this array implement ops? 
+ op(data, data) + except Exception: + pytest.skip(f"{type(data)} does not support diff") + s = pd.Series(data) + result = s.diff(periods) + expected = pd.Series(op(data, data.shift(periods))) + self.assert_series_equal(result, expected) + + df = pd.DataFrame({"A": data, "B": [1.0] * 5}) + result = df.diff(periods) + if periods == 1: + b = [np.nan, 0, 0, 0, 0] + else: + b = [0, 0, 0, np.nan, np.nan] + expected = pd.DataFrame({"A": expected, "B": b}) + self.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "periods, indices", [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]], diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 7db38f41d4573..8a820c8746857 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -248,6 +248,10 @@ def test_repeat(self, data, repeats, as_series, use_numpy): # Fails creating expected super().test_repeat(data, repeats, as_series, use_numpy) + @pytest.mark.xfail(reason="PandasArray.diff may fail on dtype") + def test_diff(self, data, periods): + return super().test_diff(data, periods) + @skip_nested class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests):