From fcde96b2e16f4648aff821d16dce376b2b3f3d06 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jan 2020 18:08:20 -0600 Subject: [PATCH 01/15] Dispatch NDFrame.diff to EAs Closes https://github.com/pandas-dev/pandas/issues/30889 Closes https://github.com/pandas-dev/pandas/issues/30967 --- doc/source/whatsnew/v1.0.0.rst | 2 ++ pandas/core/algorithms.py | 3 +++ pandas/core/arrays/base.py | 5 +++++ pandas/core/arrays/numpy_.py | 6 +++++- pandas/core/internals/blocks.py | 8 ++++++++ pandas/core/series.py | 2 +- pandas/tests/extension/base/methods.py | 22 ++++++++++++++++++++++ 7 files changed, 46 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c423933d4c438..f90f478aef79a 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -998,6 +998,8 @@ Numeric - Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`) - Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`) - Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`) +- Bug in :class:`~DataFrame.diff` losing the dtype for extension types (:issue:`30889`) +- Bug in :class:`DataFrame.diff` raising an ``IndexError`` when one of the columns was a nullable integer dtype (:issue:`30967`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 39e8e9008a844..0beb149b03174 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1834,6 +1834,9 @@ def diff(arr, n: int, axis: int = 0): na = np.nan dtype = arr.dtype + if is_extension_array_dtype(dtype): + return arr.diff(n) + is_timedelta = False is_bool = False if needs_i8_conversion(arr): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9723343ea7af5..1c8698aedb712 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -578,6 +578,11 @@ def dropna(self): """ return self[~self.isna()] + def diff(self, periods: int = 1): + if hasattr(self, "__sub__"): + return self - self.shift(periods) + raise TypeError() + def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray: """ Shift values by desired number. diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 4db3d3010adaf..4c9fa4931104b 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -15,7 +15,7 @@ from pandas.core.dtypes.missing import isna from pandas import compat -from pandas.core import nanops +from pandas.core import algorithms, nanops from pandas.core.algorithms import searchsorted, take, unique from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com @@ -164,6 +164,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): result = result.copy() return cls(result) + def diff(self, periods: int = 1): + result = algorithms.diff(com.values_from_object(self._ndarray), periods) + return type(self)(result) + @classmethod def _from_factorized(cls, values, original): return cls(values) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f74033924f64e..eb7668b5636f8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1962,6 +1962,14 @@ class ObjectValuesExtensionBlock(ExtensionBlock): Series[T].values is an ndarray of objects. """ + def diff(self, n: int, axis: int = 1) -> List["Block"]: + # Block.shape vs. Block.values.shape mismatch + # Do the op, get the object-dtype ndarray, and reshape + # to put into an ObjectBlock + new_values = algos.diff(self.values, n, axis=axis) + new_values = np.atleast_2d(new_values) + return [self.make_block(values=new_values)] + def external_values(self, dtype=None): return self.values.astype(object) diff --git a/pandas/core/series.py b/pandas/core/series.py index 33565bbedade6..3a72fc32cb452 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2316,7 +2316,7 @@ def diff(self, periods=1) -> "Series": 5 NaN dtype: float64 """ - result = algorithms.diff(com.values_from_object(self), periods) + result = algorithms.diff(self.array, periods) return self._constructor(result, index=self.index).__finalize__(self) def autocorr(self, lag=1) -> float: diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 1e427c6319cab..00bb3a393b86d 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -231,6 +231,28 @@ def test_container_shift(self, data, frame, periods, indices): compare(result, expected) + @pytest.mark.parametrize("periods", [1, -2]) + def test_diff(self, data, periods): + data = data[:5] + try: + # does this array implement ops? + data - data + except Exception: + pytest.skip(f"{type(data)} does not support diff") + s = pd.Series(data) + result = s.diff(periods) + expected = pd.Series(data - data.shift(periods)) + self.assert_series_equal(result, expected) + + df = pd.DataFrame({"A": data, "B": [1.0] * 5}) + result = df.diff(periods) + if periods == 1: + b = [np.nan, 0, 0, 0, 0] + else: + b = [0, 0, 0, np.nan, np.nan] + expected = pd.DataFrame({"A": expected, "B": b}) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "periods, indices", [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]], From 5017912f439b010d75913b9159472e2377798def Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jan 2020 09:59:28 -0600 Subject: [PATCH 02/15] wip --- pandas/core/algorithms.py | 8 +++++++- pandas/core/arrays/base.py | 5 ----- pandas/tests/arrays/test_boolean.py | 16 ++++++++++++++++ pandas/tests/extension/base/methods.py | 2 +- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index caf5629aaae42..98cc01328ad1f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2,6 +2,7 @@ Generic data algorithms. This module is experimental at the moment and not intended for public consumption """ +import operator from textwrap import dedent from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union from warnings import catch_warnings, simplefilter, warn @@ -1834,8 +1835,13 @@ def diff(arr, n: int, axis: int = 0): na = np.nan dtype = arr.dtype + if dtype.kind == "b": + op = operator.xor + else: + op = operator.sub + if is_extension_array_dtype(dtype): - return arr.diff(n) + return op(arr, arr.shift(n)) is_timedelta = False is_bool = False diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1c8698aedb712..9723343ea7af5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -578,11 +578,6 @@ def dropna(self): """ return self[~self.isna()] - def diff(self, periods: int = 1): - if hasattr(self, "__sub__"): - return self - self.shift(periods) - raise TypeError() - def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray: """ Shift values by desired number. diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index cc8d0cdcb518d..6e361b2810d54 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -879,3 +879,19 @@ def test_value_counts_na(): result = arr.value_counts(dropna=True) expected = pd.Series([1, 1], index=[True, False], dtype="Int64") tm.assert_series_equal(result, expected) + + +def test_diff(): + a = pd.array( + [True, True, False, False, True, None, True, None, False], dtype="boolean" + ) + result = pd.core.algorithms.diff(a, 1) + expected = pd.array( + [None, False, True, False, True, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = s.diff() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 00bb3a393b86d..630a9ac13a592 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -251,7 +251,7 @@ def test_diff(self, data, periods): else: b = [0, 0, 0, np.nan, np.nan] expected = pd.DataFrame({"A": expected, "B": b}) - tm.assert_frame_equal(result, expected) + self.assert_frame_equal(result, expected) @pytest.mark.parametrize( "periods, indices", From dfea6a5f41a395ebfe9624466520f9df17872b58 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jan 2020 10:33:03 -0600 Subject: [PATCH 03/15] xor --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/_libs/sparse_op_helper.pxi.in | 9 +++++---- pandas/core/arrays/sparse/array.py | 3 ++- pandas/tests/arrays/sparse/test_arithmetics.py | 8 ++++++++ 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f90f478aef79a..26faa6b655e1f 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1140,7 +1140,7 @@ Sparse ^^^^^^ - Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`) - Bug in ``DataFrame.sparse`` returning a ``Series`` when there was a column named ``sparse`` rather than the accessor (:issue:`30758`) -- +- Fixed :meth:`operator.xor` with a boolean-dtype ``SparseArray``. Now returns a sparse result, rather than object dtype (:issue:`31025`) ExtensionArray ^^^^^^^^^^^^^^ diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in index 62ea477167b72..996da4ca2f92b 100644 --- a/pandas/_libs/sparse_op_helper.pxi.in +++ b/pandas/_libs/sparse_op_helper.pxi.in @@ -84,7 +84,8 @@ def get_op(tup): 'ge': '{0} >= {1}', 'and': '{0} & {1}', # logical op - 'or': '{0} | {1}'} + 'or': '{0} | {1}', + 'xor': '{0} ^ {1}'} return ops_dict[opname].format(lval, rval) @@ -94,7 +95,7 @@ def get_dispatch(dtypes): ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv', 'floordiv', 'pow', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', - 'and', 'or'] + 'and', 'or', 'xor'] for opname in ops_list: for dtype, arith_comp_group, logical_group in dtypes: @@ -104,13 +105,13 @@ def get_dispatch(dtypes): elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): # comparison op rdtype = 'uint8' - elif opname in ('and', 'or'): + elif opname in ('and', 'or', 'xor'): # logical op rdtype = 'uint8' else: rdtype = dtype - if opname in ('and', 'or'): + if opname in ('and', 'or', 'xor'): if logical_group: yield opname, dtype, rdtype else: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index e2562a375515d..75dd603aa6c7b 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -141,7 +141,7 @@ def _sparse_array_op( left, right = right, left name = name[1:] - if name in ("and", "or") and dtype == "bool": + if name in ("and", "or", "xor") and dtype == "bool": opname = f"sparse_{name}_uint8" # to make template simple, cast here left_sp_values = left.sp_values.view(np.uint8) @@ -1459,6 +1459,7 @@ def _add_unary_ops(cls): def _add_comparison_ops(cls): cls.__and__ = cls._create_comparison_method(operator.and_) cls.__or__ = cls._create_comparison_method(operator.or_) + cls.__xor__ = cls._create_arithmetic_method(operator.xor) super()._add_comparison_ops() # ---------- diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 76442a63ccb0f..73652da78654f 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -388,6 +388,14 @@ def test_mixed_array_comparison(self, kind): assert b.dtype == SparseDtype(rdtype, fill_value=2) self._check_comparison_ops(a, b, values, rvalues) + def test_xor(self): + s = SparseArray([True, True, False, False]) + t = SparseArray([True, False, True, False]) + result = s ^ t + sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32")) + expected = SparseArray([False, True, True], sparse_index=sp_index) + tm.assert_sp_array_equal(result, expected) + @pytest.mark.parametrize("op", [operator.eq, operator.add]) def test_with_list(op): From fc6eef0cf48f6c872d478fc77475f447a3469e90 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jan 2020 10:37:09 -0600 Subject: [PATCH 04/15] doc --- pandas/core/frame.py | 5 +++++ pandas/core/series.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6dd3a415297db..a743b7f95249d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6560,6 +6560,11 @@ def diff(self, periods=1, axis=0) -> "DataFrame": ------- DataFrame + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + See Also -------- Series.diff: First discrete difference for a Series. diff --git a/pandas/core/series.py b/pandas/core/series.py index 8f90642fed299..ec5c1ffd1fde6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2244,6 +2244,11 @@ def diff(self, periods=1) -> "Series": Series First differences of the Series. + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + See Also -------- Series.pct_change: Percent change over given number of periods. From 84e5e9329c97d66a917e12bc2b845b67deaedb22 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jan 2020 10:39:27 -0600 Subject: [PATCH 05/15] cleanup --- pandas/core/arrays/numpy_.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 4c9fa4931104b..4db3d3010adaf 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -15,7 +15,7 @@ from pandas.core.dtypes.missing import isna from pandas import compat -from pandas.core import algorithms, nanops +from pandas.core import nanops from pandas.core.algorithms import searchsorted, take, unique from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com @@ -164,10 +164,6 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): result = result.copy() return cls(result) - def diff(self, periods: int = 1): - result = algorithms.diff(com.values_from_object(self._ndarray), periods) - return type(self)(result) - @classmethod def _from_factorized(cls, values, original): return cls(values) From 4183b5b877212e3c9bf1b5da870d6cddcd0c8a55 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jan 2020 13:06:32 -0600 Subject: [PATCH 06/15] skip pandasraray --- pandas/core/algorithms.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 98cc01328ad1f..8083e8d9675ee 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1830,6 +1830,7 @@ def diff(arr, n: int, axis: int = 0): ------- shifted """ + from pandas.core.arrays import PandasDtype n = int(n) na = np.nan @@ -1840,6 +1841,11 @@ def diff(arr, n: int, axis: int = 0): else: op = operator.sub + if isinstance(dtype, PandasDtype): + # PandasArray cannot necessarily hold shifted versions of itself. + arr = np.asarray(arr) + dtype = arr.dtype + if is_extension_array_dtype(dtype): return op(arr, arr.shift(n)) From 2f5d55febada2f6553ab1896d8a755e421bacd94 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jan 2020 13:28:08 -0600 Subject: [PATCH 07/15] docstrings --- pandas/core/frame.py | 10 +++++----- pandas/core/series.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a743b7f95249d..ab63034569fc5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6560,11 +6560,6 @@ def diff(self, periods=1, axis=0) -> "DataFrame": ------- DataFrame - Notes - ----- - For boolean dtypes, this uses :meth:`operator.xor` rather than - :meth:`operator.sub`. - See Also -------- Series.diff: First discrete difference for a Series. @@ -6572,6 +6567,11 @@ def diff(self, periods=1, axis=0) -> "DataFrame": DataFrame.shift: Shift index by desired number of periods with an optional time freq. + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + Examples -------- Difference with previous row diff --git a/pandas/core/series.py b/pandas/core/series.py index ec5c1ffd1fde6..75d710ca9fecc 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2244,11 +2244,6 @@ def diff(self, periods=1) -> "Series": Series First differences of the Series. - Notes - ----- - For boolean dtypes, this uses :meth:`operator.xor` rather than - :meth:`operator.sub`. - See Also -------- Series.pct_change: Percent change over given number of periods. @@ -2256,6 +2251,11 @@ def diff(self, periods=1) -> "Series": optional time freq. DataFrame.diff: First discrete difference of object. + Notes + ----- + For boolean dtypes, this uses :meth:`operator.xor` rather than + :meth:`operator.sub`. + Examples -------- Difference with previous row From e0ce8bea7048cd9e73c4cc3d920b8232c21845fa Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jan 2020 14:39:25 -0600 Subject: [PATCH 08/15] skpi --- pandas/tests/extension/test_numpy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 7db38f41d4573..71f1c651d648a 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -248,6 +248,10 @@ def test_repeat(self, data, repeats, as_series, use_numpy): # Fails creating expected super().test_repeat(data, repeats, as_series, use_numpy) + @pytest.mark.skip(reason="algorithms.diff skips PandasArray") + def test_diff(self, data, periods): + return super().test_diff(data, periods) + @skip_nested class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests): From 1c0a9fe6bda3961a64c024e57f82e18f31cf80dc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 21 Jan 2020 08:23:49 -0600 Subject: [PATCH 09/15] block shape --- pandas/core/internals/blocks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d0cf1af33b82a..627790d4fd689 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1976,8 +1976,7 @@ def diff(self, n: int, axis: int = 1) -> List["Block"]: # Block.shape vs. Block.values.shape mismatch # Do the op, get the object-dtype ndarray, and reshape # to put into an ObjectBlock - new_values = algos.diff(self.values, n, axis=axis) - new_values = np.atleast_2d(new_values) + new_values = _block_shape(algos.diff(self.values, n, axis=axis), ndim=self.ndim) return [self.make_block(values=new_values)] def external_values(self): From f3af8f5ee629617d430f31a0dfcf564c900d2b49 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 21 Jan 2020 09:27:04 -0600 Subject: [PATCH 10/15] Updates * deprecate old behavior --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/algorithms.py | 16 ++++++++++++++-- pandas/core/internals/blocks.py | 13 +++++-------- pandas/tests/arrays/categorical/test_algos.py | 15 +++++++++++++++ 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0b4a671233ad1..e2907b04ba22c 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -727,6 +727,7 @@ Deprecations - Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`) - The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`) - The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`) +- :class:`~DataFrame.diff` will raise a ``TypeError`` rather than implicitly losing the dtype of extension types in the future. Convert to the correct dtype before calling ``diff` instead (:issue:`31025`) **Selecting Columns from a Grouped DataFrame** diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8083e8d9675ee..749e1bfbef762 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1813,7 +1813,7 @@ def searchsorted(arr, value, side="left", sorter=None): _diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"} -def diff(arr, n: int, axis: int = 0): +def diff(arr, n: int, axis: int = 0, stacklevel=3): """ difference of n between self, analogous to s-s.shift(n) @@ -1825,6 +1825,8 @@ def diff(arr, n: int, axis: int = 0): number of periods axis : int axis to shift on + stacklevel : int + The stacklevel for the lost dtype warning. Returns ------- @@ -1847,7 +1849,17 @@ def diff(arr, n: int, axis: int = 0): dtype = arr.dtype if is_extension_array_dtype(dtype): - return op(arr, arr.shift(n)) + if hasattr(arr, f"__{op.__name__}__"): + return op(arr, arr.shift(n)) + else: + warn( + "dtype lost in 'algorithms.diff'. In the future this will raise a " + "TypeError. Convert to a suitable dtype prior to calling 'diff'.", + FutureWarning, + stacklevel=stacklevel, + ) + arr = com.values_from_object(arr) + dtype = arr.dtype is_timedelta = False is_bool = False diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 627790d4fd689..6d9c4d212440d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1280,7 +1280,11 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): def diff(self, n: int, axis: int = 1) -> List["Block"]: """ return block for the diff of the values """ - new_values = algos.diff(self.values, n, axis=axis) + if axis and self.values.ndim == 1: + # Handle 1D EAs within a 2D Block + axis = 0 + new_values = algos.diff(self.values, n, axis=axis, stacklevel=6) + new_values = _block_shape(new_values, ndim=self.ndim) return [self.make_block(values=new_values)] def shift(self, periods, axis=0, fill_value=None): @@ -1972,13 +1976,6 @@ class ObjectValuesExtensionBlock(ExtensionBlock): Series[T].values is an ndarray of objects. """ - def diff(self, n: int, axis: int = 1) -> List["Block"]: - # Block.shape vs. Block.values.shape mismatch - # Do the op, get the object-dtype ndarray, and reshape - # to put into an ObjectBlock - new_values = _block_shape(algos.diff(self.values, n, axis=axis), ndim=self.ndim) - return [self.make_block(values=new_values)] - def external_values(self): return self.values.astype(object) diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 5ff0bb8ef0d78..835aa87a7c21b 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -90,6 +90,21 @@ def test_isin_empty(empty): tm.assert_numpy_array_equal(expected, result) +def test_diff(): + s = pd.Series([1, 2, 3], dtype="category") + with tm.assert_produces_warning(FutureWarning): + result = s.diff() + expected = pd.Series([np.nan, 1, 1]) + tm.assert_series_equal(result, expected) + + expected = expected.to_frame(name="A") + df = s.to_frame(name="A") + with tm.assert_produces_warning(FutureWarning): + result = df.diff() + + tm.assert_frame_equal(result, expected) + + class TestTake: # https://github.com/pandas-dev/pandas/issues/20664 From 4d0c5cf6485055d018f3c36b93b11bbd09706081 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 21 Jan 2020 09:32:09 -0600 Subject: [PATCH 11/15] localize --- pandas/core/internals/blocks.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6d9c4d212440d..25619290116fd 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1280,10 +1280,9 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): def diff(self, n: int, axis: int = 1) -> List["Block"]: """ return block for the diff of the values """ - if axis and self.values.ndim == 1: - # Handle 1D EAs within a 2D Block - axis = 0 - new_values = algos.diff(self.values, n, axis=axis, stacklevel=6) + new_values = algos.diff(self.values, n, axis=axis, stacklevel=7) + # We use block_shape for ExtensionBlock subclasses, which may call here + # via a super. new_values = _block_shape(new_values, ndim=self.ndim) return [self.make_block(values=new_values)] @@ -1864,6 +1863,12 @@ def interpolate( placement=self.mgr_locs, ) + def diff(self, n: int, axis: int = 1) -> List["Block"]: + if axis == 1: + # we are by definition 1D. + axis = 0 + return super().diff(n, axis) + def shift( self, periods: int, From 6843e2bb0dfc0f909028ffeb4635bb6d77c773cb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 21 Jan 2020 11:40:40 -0600 Subject: [PATCH 12/15] missing backtick --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e2907b04ba22c..ffff720ed1c06 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -727,7 +727,7 @@ Deprecations - Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`) - The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`) - The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`) -- :class:`~DataFrame.diff` will raise a ``TypeError`` rather than implicitly losing the dtype of extension types in the future. Convert to the correct dtype before calling ``diff` instead (:issue:`31025`) +- :class:`~DataFrame.diff` will raise a ``TypeError`` rather than implicitly losing the dtype of extension types in the future. Convert to the correct dtype before calling ``diff`` instead (:issue:`31025`) **Selecting Columns from a Grouped DataFrame** From bd6c1571efe74b75af38ce9a4ac166ffb29dc902 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 22 Jan 2020 08:22:48 -0600 Subject: [PATCH 13/15] fixup --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 749e1bfbef762..5d80a01378e0d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1853,7 +1853,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): return op(arr, arr.shift(n)) else: warn( - "dtype lost in 'algorithms.diff'. In the future this will raise a " + "dtype lost in 'diff()'. In the future this will raise a " "TypeError. Convert to a suitable dtype prior to calling 'diff'.", FutureWarning, stacklevel=stacklevel, From 8fa2836a8c0c11954eac0576fb8b2c19170624d6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 23 Jan 2020 08:49:40 -0600 Subject: [PATCH 14/15] asarray --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5d80a01378e0d..8af9e2cc9790f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1858,7 +1858,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): FutureWarning, stacklevel=stacklevel, ) - arr = com.values_from_object(arr) + arr = np.asarray(arr) dtype = arr.dtype is_timedelta = False From d34ffe31e8368b46f8cee395c0f4ead3db461633 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 23 Jan 2020 08:53:22 -0600 Subject: [PATCH 15/15] xfails, boolean --- pandas/tests/extension/base/methods.py | 12 ++++++++++-- pandas/tests/extension/test_numpy.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index db972f437825e..4a84a21084de2 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -1,6 +1,10 @@ +import operator + import numpy as np import pytest +from pandas.core.dtypes.common import is_bool_dtype + import pandas as pd import pandas._testing as tm from pandas.core.sorting import nargsort @@ -234,14 +238,18 @@ def test_container_shift(self, data, frame, periods, indices): @pytest.mark.parametrize("periods", [1, -2]) def test_diff(self, data, periods): data = data[:5] + if is_bool_dtype(data.dtype): + op = operator.xor + else: + op = operator.sub try: # does this array implement ops? - data - data + op(data, data) except Exception: pytest.skip(f"{type(data)} does not support diff") s = pd.Series(data) result = s.diff(periods) - expected = pd.Series(data - data.shift(periods)) + expected = pd.Series(op(data, data.shift(periods))) self.assert_series_equal(result, expected) df = pd.DataFrame({"A": data, "B": [1.0] * 5}) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 71f1c651d648a..8a820c8746857 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -248,7 +248,7 @@ def test_repeat(self, data, repeats, as_series, use_numpy): # Fails creating expected super().test_repeat(data, repeats, as_series, use_numpy) - @pytest.mark.skip(reason="algorithms.diff skips PandasArray") + @pytest.mark.xfail(reason="PandasArray.diff may fail on dtype") def test_diff(self, data, periods): return super().test_diff(data, periods)