pandas-dev · jorisvandenbossche · Jan 23, 2020 · Jan 15, 2020 · Jan 15, 2020 · Jan 16, 2020
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -998,6 +998,8 @@ Numeric
 - Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
 - Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`)
 - Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`)
+- Bug in :meth:`DataFrame.diff` losing the dtype for extension types (:issue:`30889`)
+- Bug in :meth:`DataFrame.diff` raising an ``IndexError`` when one of the columns had a nullable integer dtype (:issue:`30967`)
 
 Conversion
 ^^^^^^^^^^
@@ -1138,7 +1140,7 @@ Sparse
 ^^^^^^
 - Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`)
 - Bug in ``DataFrame.sparse`` returning a ``Series`` when there was a column named ``sparse`` rather than the accessor (:issue:`30758`)
--
+- Fixed :func:`operator.xor` with a boolean-dtype ``SparseArray``. Now returns a sparse result, rather than object dtype (:issue:`31025`)
 
 ExtensionArray
 ^^^^^^^^^^^^^^

diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in
@@ -84,7 +84,8 @@ def get_op(tup):
                 'ge': '{0} >= {1}',
 
                 'and': '{0} & {1}',     # logical op
-                'or': '{0} | {1}'}
+                'or': '{0} | {1}',
+                'xor': '{0} ^ {1}'}
 
     return ops_dict[opname].format(lval, rval)
 
@@ -94,7 +95,7 @@ def get_dispatch(dtypes):
     ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv',
                 'floordiv', 'pow',
                 'eq', 'ne', 'lt', 'gt', 'le', 'ge',
-                'and', 'or']
+                'and', 'or', 'xor']
 
     for opname in ops_list:
         for dtype, arith_comp_group, logical_group in dtypes:
@@ -104,13 +105,13 @@ def get_dispatch(dtypes):
             elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
                 # comparison op
                 rdtype = 'uint8'
-            elif opname in ('and', 'or'):
+            elif opname in ('and', 'or', 'xor'):
                 # logical op
                 rdtype = 'uint8'
             else:
                 rdtype = dtype
 
-            if opname in ('and', 'or'):
+            if opname in ('and', 'or', 'xor'):
                 if logical_group:
                     yield opname, dtype, rdtype
             else:

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -2,6 +2,7 @@
 Generic data algorithms. This module is experimental at the moment and not
 intended for public consumption
 """
+import operator
 from textwrap import dedent
 from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
 from warnings import catch_warnings, simplefilter, warn
@@ -1829,11 +1830,25 @@ def diff(arr, n: int, axis: int = 0):
     -------
     shifted
     """
+    from pandas.core.arrays import PandasDtype
 
     n = int(n)
     na = np.nan
     dtype = arr.dtype
 
+    if dtype.kind == "b":
+        op = operator.xor
+    else:
+        op = operator.sub
+
+    if isinstance(dtype, PandasDtype):
+        # PandasArray cannot necessarily hold shifted versions of itself.
+        arr = np.asarray(arr)
+        dtype = arr.dtype
+
+    if is_extension_array_dtype(dtype):
+        return op(arr, arr.shift(n))
+
     is_timedelta = False
     is_bool = False
     if needs_i8_conversion(arr):

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -141,7 +141,7 @@ def _sparse_array_op(
             left, right = right, left
             name = name[1:]
 
-        if name in ("and", "or") and dtype == "bool":
+        if name in ("and", "or", "xor") and dtype == "bool":
             opname = f"sparse_{name}_uint8"
             # to make template simple, cast here
             left_sp_values = left.sp_values.view(np.uint8)
@@ -1459,6 +1459,7 @@ def _add_unary_ops(cls):
     def _add_comparison_ops(cls):
         cls.__and__ = cls._create_comparison_method(operator.and_)
         cls.__or__ = cls._create_comparison_method(operator.or_)
+        cls.__xor__ = cls._create_arithmetic_method(operator.xor)
         super()._add_comparison_ops()
 
     # ----------

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6567,6 +6567,11 @@ def diff(self, periods=1, axis=0) -> "DataFrame":
         DataFrame.shift: Shift index by desired number of periods with an
             optional time freq.
 
+        Notes
+        -----
+        For boolean dtypes, this uses :func:`operator.xor` rather than
+        :func:`operator.sub`.
+
         Examples
         --------
         Difference with previous row

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1972,6 +1972,14 @@ class ObjectValuesExtensionBlock(ExtensionBlock):
     Series[T].values is an ndarray of objects.
     """
 
+    def diff(self, n: int, axis: int = 1) -> List["Block"]:
+        # Block.shape vs. Block.values.shape mismatch
+        # Do the op, get the object-dtype ndarray, and reshape
+        # to put into an ObjectBlock
+        new_values = algos.diff(self.values, n, axis=axis)
+        new_values = np.atleast_2d(new_values)
+        return [self.make_block(values=new_values)]
+
     def external_values(self):
         return self.values.astype(object)
 

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2251,6 +2251,11 @@ def diff(self, periods=1) -> "Series":
             optional time freq.
         DataFrame.diff: First discrete difference of object.
 
+        Notes
+        -----
+        For boolean dtypes, this uses :func:`operator.xor` rather than
+        :func:`operator.sub`.
+
         Examples
         --------
         Difference with previous row
@@ -2287,7 +2292,7 @@ def diff(self, periods=1) -> "Series":
         5    NaN
         dtype: float64
         """
-        result = algorithms.diff(com.values_from_object(self), periods)
+        result = algorithms.diff(self.array, periods)
         return self._constructor(result, index=self.index).__finalize__(self)
 
     def autocorr(self, lag=1) -> float:

diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py
@@ -388,6 +388,14 @@ def test_mixed_array_comparison(self, kind):
         assert b.dtype == SparseDtype(rdtype, fill_value=2)
         self._check_comparison_ops(a, b, values, rvalues)
 
+    def test_xor(self):
+        s = SparseArray([True, True, False, False])
+        t = SparseArray([True, False, True, False])
+        result = s ^ t
+        sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32"))
+        expected = SparseArray([False, True, True], sparse_index=sp_index)
+        tm.assert_sp_array_equal(result, expected)
+
 
 @pytest.mark.parametrize("op", [operator.eq, operator.add])
 def test_with_list(op):

diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py
@@ -879,3 +879,19 @@ def test_value_counts_na():
     result = arr.value_counts(dropna=True)
     expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
     tm.assert_series_equal(result, expected)
+
+
+def test_diff():
+    a = pd.array(
+        [True, True, False, False, True, None, True, None, False], dtype="boolean"
+    )
+    result = pd.core.algorithms.diff(a, 1)
+    expected = pd.array(
+        [None, False, True, False, True, None, None, None, None], dtype="boolean"
+    )
+    tm.assert_extension_array_equal(result, expected)
+
+    s = pd.Series(a)
+    result = s.diff()
+    expected = pd.Series(expected)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -231,6 +231,28 @@ def test_container_shift(self, data, frame, periods, indices):
 
         compare(result, expected)
 
+    @pytest.mark.parametrize("periods", [1, -2])
+    def test_diff(self, data, periods):
+        data = data[:5]
+        try:
+            # does this array implement ops?
+            data - data
+        except Exception:
+            pytest.skip(f"{type(data)} does not support diff")
+        s = pd.Series(data)
+        result = s.diff(periods)
+        expected = pd.Series(data - data.shift(periods))
+        self.assert_series_equal(result, expected)
+
+        df = pd.DataFrame({"A": data, "B": [1.0] * 5})
+        result = df.diff(periods)
+        if periods == 1:
+            b = [np.nan, 0, 0, 0, 0]
+        else:
+            b = [0, 0, 0, np.nan, np.nan]
+        expected = pd.DataFrame({"A": expected, "B": b})
+        self.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize(
         "periods, indices",
         [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],

diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
@@ -248,6 +248,10 @@ def test_repeat(self, data, repeats, as_series, use_numpy):
         # Fails creating expected
         super().test_repeat(data, repeats, as_series, use_numpy)
 
+    @pytest.mark.skip(reason="algorithms.diff skips PandasArray")
+    def test_diff(self, data, periods):
+        return super().test_diff(data, periods)
+
 
 @skip_nested
 class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests):