Skip to content

Commit 01f9742

Browse files
meeseeksmachine authored and WillAyd committed
Backport PR pandas-dev#31025: ENH: Handle extension arrays in algorithms.diff (pandas-dev#31255)
1 parent 0bb2fa1 commit 01f9742

File tree

12 files changed: +133 -9 lines changed

doc/source/whatsnew/v1.0.0.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -727,6 +727,7 @@ Deprecations
727727
- Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`)
728728
- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
729729
- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`)
730+
- :meth:`DataFrame.diff` will raise a ``TypeError`` rather than implicitly losing the dtype of extension types in the future. Convert to the correct dtype before calling ``diff`` instead (:issue:`31025`)
730731

731732
**Selecting Columns from a Grouped DataFrame**
732733

@@ -1018,6 +1019,8 @@ Numeric
10181019
- Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
10191020
- Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`)
10201021
- Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`)
1022+
- Bug in :meth:`DataFrame.diff` losing the dtype for extension types (:issue:`30889`)
1023+
- Bug in :meth:`DataFrame.diff` raising an ``IndexError`` when one of the columns was a nullable integer dtype (:issue:`30967`)
10211024

10221025
Conversion
10231026
^^^^^^^^^^
@@ -1158,7 +1161,7 @@ Sparse
11581161
^^^^^^
11591162
- Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`)
11601163
- Bug in ``DataFrame.sparse`` returning a ``Series`` when there was a column named ``sparse`` rather than the accessor (:issue:`30758`)
1161-
-
1164+
- Fixed :func:`operator.xor` with a boolean-dtype ``SparseArray``. Now returns a sparse result, rather than object dtype (:issue:`31025`)
11621165

11631166
ExtensionArray
11641167
^^^^^^^^^^^^^^

pandas/_libs/sparse_op_helper.pxi.in

+5-4
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,8 @@ def get_op(tup):
8484
'ge': '{0} >= {1}',
8585

8686
'and': '{0} & {1}', # logical op
87-
'or': '{0} | {1}'}
87+
'or': '{0} | {1}',
88+
'xor': '{0} ^ {1}'}
8889

8990
return ops_dict[opname].format(lval, rval)
9091

@@ -94,7 +95,7 @@ def get_dispatch(dtypes):
9495
ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv',
9596
'floordiv', 'pow',
9697
'eq', 'ne', 'lt', 'gt', 'le', 'ge',
97-
'and', 'or']
98+
'and', 'or', 'xor']
9899

99100
for opname in ops_list:
100101
for dtype, arith_comp_group, logical_group in dtypes:
@@ -104,13 +105,13 @@ def get_dispatch(dtypes):
104105
elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
105106
# comparison op
106107
rdtype = 'uint8'
107-
elif opname in ('and', 'or'):
108+
elif opname in ('and', 'or', 'xor'):
108109
# logical op
109110
rdtype = 'uint8'
110111
else:
111112
rdtype = dtype
112113

113-
if opname in ('and', 'or'):
114+
if opname in ('and', 'or', 'xor'):
114115
if logical_group:
115116
yield opname, dtype, rdtype
116117
else:

pandas/core/algorithms.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Generic data algorithms. This module is experimental at the moment and not
33
intended for public consumption
44
"""
5+
import operator
56
from textwrap import dedent
67
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
78
from warnings import catch_warnings, simplefilter, warn
@@ -1812,7 +1813,7 @@ def searchsorted(arr, value, side="left", sorter=None):
18121813
_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}
18131814

18141815

1815-
def diff(arr, n: int, axis: int = 0):
1816+
def diff(arr, n: int, axis: int = 0, stacklevel=3):
18161817
"""
18171818
difference of n between self,
18181819
analogous to s-s.shift(n)
@@ -1824,16 +1825,42 @@ def diff(arr, n: int, axis: int = 0):
18241825
number of periods
18251826
axis : int
18261827
axis to shift on
1828+
stacklevel : int
1829+
The stacklevel for the lost dtype warning.
18271830
18281831
Returns
18291832
-------
18301833
shifted
18311834
"""
1835+
from pandas.core.arrays import PandasDtype
18321836

18331837
n = int(n)
18341838
na = np.nan
18351839
dtype = arr.dtype
18361840

1841+
if dtype.kind == "b":
1842+
op = operator.xor
1843+
else:
1844+
op = operator.sub
1845+
1846+
if isinstance(dtype, PandasDtype):
1847+
# PandasArray cannot necessarily hold shifted versions of itself.
1848+
arr = np.asarray(arr)
1849+
dtype = arr.dtype
1850+
1851+
if is_extension_array_dtype(dtype):
1852+
if hasattr(arr, f"__{op.__name__}__"):
1853+
return op(arr, arr.shift(n))
1854+
else:
1855+
warn(
1856+
"dtype lost in 'diff()'. In the future this will raise a "
1857+
"TypeError. Convert to a suitable dtype prior to calling 'diff'.",
1858+
FutureWarning,
1859+
stacklevel=stacklevel,
1860+
)
1861+
arr = np.asarray(arr)
1862+
dtype = arr.dtype
1863+
18371864
is_timedelta = False
18381865
is_bool = False
18391866
if needs_i8_conversion(arr):

pandas/core/arrays/sparse/array.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def _sparse_array_op(
141141
left, right = right, left
142142
name = name[1:]
143143

144-
if name in ("and", "or") and dtype == "bool":
144+
if name in ("and", "or", "xor") and dtype == "bool":
145145
opname = f"sparse_{name}_uint8"
146146
# to make template simple, cast here
147147
left_sp_values = left.sp_values.view(np.uint8)
@@ -1459,6 +1459,7 @@ def _add_unary_ops(cls):
14591459
def _add_comparison_ops(cls):
14601460
cls.__and__ = cls._create_comparison_method(operator.and_)
14611461
cls.__or__ = cls._create_comparison_method(operator.or_)
1462+
cls.__xor__ = cls._create_arithmetic_method(operator.xor)
14621463
super()._add_comparison_ops()
14631464

14641465
# ----------

pandas/core/frame.py

+5
Original file line numberDiff line numberDiff line change
@@ -6533,6 +6533,11 @@ def diff(self, periods=1, axis=0) -> "DataFrame":
65336533
DataFrame.shift: Shift index by desired number of periods with an
65346534
optional time freq.
65356535
6536+
Notes
6537+
-----
6538+
For boolean dtypes, this uses :func:`operator.xor` rather than
6539+
:func:`operator.sub`.
6540+
65366541
Examples
65376542
--------
65386543
Difference with previous row

pandas/core/internals/blocks.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -1270,7 +1270,10 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
12701270

12711271
def diff(self, n: int, axis: int = 1) -> List["Block"]:
12721272
""" return block for the diff of the values """
1273-
new_values = algos.diff(self.values, n, axis=axis)
1273+
new_values = algos.diff(self.values, n, axis=axis, stacklevel=7)
1274+
# We use block_shape for ExtensionBlock subclasses, which may call here
1275+
# via a super.
1276+
new_values = _block_shape(new_values, ndim=self.ndim)
12741277
return [self.make_block(values=new_values)]
12751278

12761279
def shift(self, periods, axis=0, fill_value=None):
@@ -1850,6 +1853,12 @@ def interpolate(
18501853
placement=self.mgr_locs,
18511854
)
18521855

1856+
def diff(self, n: int, axis: int = 1) -> List["Block"]:
1857+
if axis == 1:
1858+
# we are by definition 1D.
1859+
axis = 0
1860+
return super().diff(n, axis)
1861+
18531862
def shift(
18541863
self,
18551864
periods: int,

pandas/core/series.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -2264,6 +2264,11 @@ def diff(self, periods=1):
22642264
optional time freq.
22652265
DataFrame.diff: First discrete difference of object.
22662266
2267+
Notes
2268+
-----
2269+
For boolean dtypes, this uses :func:`operator.xor` rather than
2270+
:func:`operator.sub`.
2271+
22672272
Examples
22682273
--------
22692274
Difference with previous row
@@ -2300,7 +2305,7 @@ def diff(self, periods=1):
23002305
5 NaN
23012306
dtype: float64
23022307
"""
2303-
result = algorithms.diff(com.values_from_object(self), periods)
2308+
result = algorithms.diff(self.array, periods)
23042309
return self._constructor(result, index=self.index).__finalize__(self)
23052310

23062311
def autocorr(self, lag=1):

pandas/tests/arrays/categorical/test_algos.py

+15
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,21 @@ def test_isin_empty(empty):
9090
tm.assert_numpy_array_equal(expected, result)
9191

9292

93+
def test_diff():
94+
s = pd.Series([1, 2, 3], dtype="category")
95+
with tm.assert_produces_warning(FutureWarning):
96+
result = s.diff()
97+
expected = pd.Series([np.nan, 1, 1])
98+
tm.assert_series_equal(result, expected)
99+
100+
expected = expected.to_frame(name="A")
101+
df = s.to_frame(name="A")
102+
with tm.assert_produces_warning(FutureWarning):
103+
result = df.diff()
104+
105+
tm.assert_frame_equal(result, expected)
106+
107+
93108
class TestTake:
94109
# https://github.com/pandas-dev/pandas/issues/20664
95110

pandas/tests/arrays/sparse/test_arithmetics.py

+8
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,14 @@ def test_mixed_array_comparison(self, kind):
388388
assert b.dtype == SparseDtype(rdtype, fill_value=2)
389389
self._check_comparison_ops(a, b, values, rvalues)
390390

391+
def test_xor(self):
392+
s = SparseArray([True, True, False, False])
393+
t = SparseArray([True, False, True, False])
394+
result = s ^ t
395+
sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32"))
396+
expected = SparseArray([False, True, True], sparse_index=sp_index)
397+
tm.assert_sp_array_equal(result, expected)
398+
391399

392400
@pytest.mark.parametrize("op", [operator.eq, operator.add])
393401
def test_with_list(op):

pandas/tests/arrays/test_boolean.py

+16
Original file line numberDiff line numberDiff line change
@@ -879,3 +879,19 @@ def test_value_counts_na():
879879
result = arr.value_counts(dropna=True)
880880
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
881881
tm.assert_series_equal(result, expected)
882+
883+
884+
def test_diff():
885+
a = pd.array(
886+
[True, True, False, False, True, None, True, None, False], dtype="boolean"
887+
)
888+
result = pd.core.algorithms.diff(a, 1)
889+
expected = pd.array(
890+
[None, False, True, False, True, None, None, None, None], dtype="boolean"
891+
)
892+
tm.assert_extension_array_equal(result, expected)
893+
894+
s = pd.Series(a)
895+
result = s.diff()
896+
expected = pd.Series(expected)
897+
tm.assert_series_equal(result, expected)

pandas/tests/extension/base/methods.py

+30
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
1+
import operator
2+
13
import numpy as np
24
import pytest
35

6+
from pandas.core.dtypes.common import is_bool_dtype
7+
48
import pandas as pd
59
import pandas._testing as tm
610
from pandas.core.sorting import nargsort
@@ -231,6 +235,32 @@ def test_container_shift(self, data, frame, periods, indices):
231235

232236
compare(result, expected)
233237

238+
@pytest.mark.parametrize("periods", [1, -2])
239+
def test_diff(self, data, periods):
240+
data = data[:5]
241+
if is_bool_dtype(data.dtype):
242+
op = operator.xor
243+
else:
244+
op = operator.sub
245+
try:
246+
# does this array implement ops?
247+
op(data, data)
248+
except Exception:
249+
pytest.skip(f"{type(data)} does not support diff")
250+
s = pd.Series(data)
251+
result = s.diff(periods)
252+
expected = pd.Series(op(data, data.shift(periods)))
253+
self.assert_series_equal(result, expected)
254+
255+
df = pd.DataFrame({"A": data, "B": [1.0] * 5})
256+
result = df.diff(periods)
257+
if periods == 1:
258+
b = [np.nan, 0, 0, 0, 0]
259+
else:
260+
b = [0, 0, 0, np.nan, np.nan]
261+
expected = pd.DataFrame({"A": expected, "B": b})
262+
self.assert_frame_equal(result, expected)
263+
234264
@pytest.mark.parametrize(
235265
"periods, indices",
236266
[[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],

pandas/tests/extension/test_numpy.py

+4
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,10 @@ def test_repeat(self, data, repeats, as_series, use_numpy):
248248
# Fails creating expected
249249
super().test_repeat(data, repeats, as_series, use_numpy)
250250

251+
@pytest.mark.xfail(reason="PandasArray.diff may fail on dtype")
252+
def test_diff(self, data, periods):
253+
return super().test_diff(data, periods)
254+
251255

252256
@skip_nested
253257
class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests):

0 commit comments

Comments
 (0)