Skip to content

ENH: Handle extension arrays in algorithms.diff #31025

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Jan 23, 2020
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -998,6 +998,8 @@ Numeric
- Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
- Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`)
- Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`)
- Bug in :class:`~DataFrame.diff` losing the dtype for extension types (:issue:`30889`)
- Bug in :class:`DataFrame.diff` raising an ``IndexError`` when one of the columns was a nullable integer dtype (:issue:`30967`)

Conversion
^^^^^^^^^^
Expand Down Expand Up @@ -1138,7 +1140,7 @@ Sparse
^^^^^^
- Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`)
- Bug in ``DataFrame.sparse`` returning a ``Series`` when there was a column named ``sparse`` rather than the accessor (:issue:`30758`)
-
- Fixed :meth:`operator.xor` with a boolean-dtype ``SparseArray``. Now returns a sparse result, rather than object dtype (:issue:`31025`)

ExtensionArray
^^^^^^^^^^^^^^
Expand Down
9 changes: 5 additions & 4 deletions pandas/_libs/sparse_op_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ def get_op(tup):
'ge': '{0} >= {1}',

'and': '{0} & {1}', # logical op
'or': '{0} | {1}'}
'or': '{0} | {1}',
'xor': '{0} ^ {1}'}

return ops_dict[opname].format(lval, rval)

Expand All @@ -94,7 +95,7 @@ def get_dispatch(dtypes):
ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv',
'floordiv', 'pow',
'eq', 'ne', 'lt', 'gt', 'le', 'ge',
'and', 'or']
'and', 'or', 'xor']

for opname in ops_list:
for dtype, arith_comp_group, logical_group in dtypes:
Expand All @@ -104,13 +105,13 @@ def get_dispatch(dtypes):
elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
# comparison op
rdtype = 'uint8'
elif opname in ('and', 'or'):
elif opname in ('and', 'or', 'xor'):
# logical op
rdtype = 'uint8'
else:
rdtype = dtype

if opname in ('and', 'or'):
if opname in ('and', 'or', 'xor'):
if logical_group:
yield opname, dtype, rdtype
else:
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Generic data algorithms. This module is experimental at the moment and not
intended for public consumption
"""
import operator
from textwrap import dedent
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
from warnings import catch_warnings, simplefilter, warn
Expand Down Expand Up @@ -1829,11 +1830,25 @@ def diff(arr, n: int, axis: int = 0):
-------
shifted
"""
from pandas.core.arrays import PandasDtype

n = int(n)
na = np.nan
dtype = arr.dtype

if dtype.kind == "b":
op = operator.xor
else:
op = operator.sub

if isinstance(dtype, PandasDtype):
# PandasArray cannot necessarily hold shifted versions of itself.
arr = np.asarray(arr)
dtype = arr.dtype
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it be more idiomatic to do extract_array up front?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say not because we don't need to "extract" the array, since arrays are already passed (this doesn't get passed Series or Index objects). You can of course use extract_array to get rid of PandasArrays, but I think the above is more explicit.


if is_extension_array_dtype(dtype):
return op(arr, arr.shift(n))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @jbrockmendel this routine likely can use some TLC when you have a chance.


is_timedelta = False
is_bool = False
if needs_i8_conversion(arr):
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def _sparse_array_op(
left, right = right, left
name = name[1:]

if name in ("and", "or") and dtype == "bool":
if name in ("and", "or", "xor") and dtype == "bool":
opname = f"sparse_{name}_uint8"
# to make template simple, cast here
left_sp_values = left.sp_values.view(np.uint8)
Expand Down Expand Up @@ -1459,6 +1459,7 @@ def _add_unary_ops(cls):
def _add_comparison_ops(cls):
cls.__and__ = cls._create_comparison_method(operator.and_)
cls.__or__ = cls._create_comparison_method(operator.or_)
cls.__xor__ = cls._create_arithmetic_method(operator.xor)
super()._add_comparison_ops()

# ----------
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6567,6 +6567,11 @@ def diff(self, periods=1, axis=0) -> "DataFrame":
DataFrame.shift: Shift index by desired number of periods with an
optional time freq.

Notes
-----
For boolean dtypes, this uses :meth:`operator.xor` rather than
:meth:`operator.sub`.

Examples
--------
Difference with previous row
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1972,6 +1972,14 @@ class ObjectValuesExtensionBlock(ExtensionBlock):
Series[T].values is an ndarray of objects.
"""

def diff(self, n: int, axis: int = 1) -> List["Block"]:
# Block.shape vs. Block.values.shape mismatch
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More technical debt from the 1D arrays inside 2D blocks :(

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, and this is on ObjectValuesExtensionBlock, but is only useful for PeriodArray. IntervalArray is the only other array to use this, and doesn't implement __sub__. Is it worth creating a PeriodBlock just for this (IMO no)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this only needed for ObjectValuesExtensionBlock, and not for ExtensionBlock?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose that in principle, we can hit this from ExtensionBlock.

We hit the problem when going from a NonConsolidatable block type (like period) to a consolidatable one (like object). In that case, the values passed to make_block will be 1d, but the block is expecting them to be 2d.

In practice, I think that for most EAs, ExtensionArray.__sub__ will return another extension array. So while ExtensionBlock.diff isn't 100% correct for all EAs, I think trying to handle this case is not worth it. What do you think?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So while ExtensionBlock.diff isn't 100% correct for all EAs, I think trying to handle this case is not worth it

It certainly seems fine to ignore this corner case for now.

# Do the op, get the object-dtype ndarray, and reshape
# to put into an ObjectBlock
new_values = algos.diff(self.values, n, axis=axis)
new_values = np.atleast_2d(new_values)
return [self.make_block(values=new_values)]

def external_values(self):
return self.values.astype(object)

Expand Down
7 changes: 6 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2251,6 +2251,11 @@ def diff(self, periods=1) -> "Series":
optional time freq.
DataFrame.diff: First discrete difference of object.

Notes
-----
For boolean dtypes, this uses :meth:`operator.xor` rather than
:meth:`operator.sub`.

Examples
--------
Difference with previous row
Expand Down Expand Up @@ -2287,7 +2292,7 @@ def diff(self, periods=1) -> "Series":
5 NaN
dtype: float64
"""
result = algorithms.diff(com.values_from_object(self), periods)
result = algorithms.diff(self.array, periods)
return self._constructor(result, index=self.index).__finalize__(self)

def autocorr(self, lag=1) -> float:
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/arrays/sparse/test_arithmetics.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,14 @@ def test_mixed_array_comparison(self, kind):
assert b.dtype == SparseDtype(rdtype, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)

def test_xor(self):
s = SparseArray([True, True, False, False])
t = SparseArray([True, False, True, False])
result = s ^ t
sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32"))
expected = SparseArray([False, True, True], sparse_index=sp_index)
tm.assert_sp_array_equal(result, expected)


@pytest.mark.parametrize("op", [operator.eq, operator.add])
def test_with_list(op):
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,3 +879,19 @@ def test_value_counts_na():
result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
tm.assert_series_equal(result, expected)


def test_diff():
a = pd.array(
[True, True, False, False, True, None, True, None, False], dtype="boolean"
)
result = pd.core.algorithms.diff(a, 1)
expected = pd.array(
[None, False, True, False, True, None, None, None, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)

s = pd.Series(a)
result = s.diff()
expected = pd.Series(expected)
tm.assert_series_equal(result, expected)
22 changes: 22 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,28 @@ def test_container_shift(self, data, frame, periods, indices):

compare(result, expected)

@pytest.mark.parametrize("periods", [1, -2])
def test_diff(self, data, periods):
data = data[:5]
try:
# does this array implement ops?
data - data
except Exception:
pytest.skip(f"{type(data)} does not support diff")
s = pd.Series(data)
result = s.diff(periods)
expected = pd.Series(data - data.shift(periods))
self.assert_series_equal(result, expected)

df = pd.DataFrame({"A": data, "B": [1.0] * 5})
result = df.diff(periods)
if periods == 1:
b = [np.nan, 0, 0, 0, 0]
else:
b = [0, 0, 0, np.nan, np.nan]
expected = pd.DataFrame({"A": expected, "B": b})
self.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"periods, indices",
[[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,10 @@ def test_repeat(self, data, repeats, as_series, use_numpy):
# Fails creating expected
super().test_repeat(data, repeats, as_series, use_numpy)

@pytest.mark.skip(reason="algorithms.diff skips PandasArray")
def test_diff(self, data, periods):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this skipped?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either a bug or not implemented behavior in PandasArray. PandasArray.shift() doesn't allow changing the dtype from int to float, so diff (which uses shift) would fail.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe xfail it then?

return super().test_diff(data, periods)


@skip_nested
class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests):
Expand Down