Skip to content

REF: implement _should_compare #38105

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 2, 2020
66 changes: 66 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4937,6 +4937,43 @@ def get_indexer_for(self, target, **kwargs):
indexer, _ = self.get_indexer_non_unique(target)
return indexer

def _get_indexer_non_comparable(self, target: "Index", method, unique: bool = True):
"""
Called from get_indexer or get_indexer_non_unique when the target
is of a non-comparable dtype.

For get_indexer lookups with method=None, get_indexer is an _equality_
check, so non-comparable dtypes mean we will always have no matches.

For get_indexer lookups with a method, get_indexer is an _inequality_
check, so non-comparable dtypes mean we will always raise TypeError.

Parameters
----------
target : Index
method : str or None
unique : bool, default True
* True if called from get_indexer.
* False if called from get_indexer_non_unique.

Raises
------
TypeError
If doing an inequality check, i.e. method is not None.
"""
if method is not None:
other = _unpack_nested_dtype(target)
raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")

no_matches = -1 * np.ones(target.shape, dtype=np.intp)
if unique:
# This is for get_indexer
return no_matches
else:
# This is for get_indexer_non_unique
missing = np.arange(len(target), dtype=np.intp)
return no_matches, missing

@property
def _index_as_unique(self):
"""
Expand Down Expand Up @@ -4972,6 +5009,14 @@ def _maybe_promote(self, other: "Index"):

return self, other

def _should_compare(self, other: "Index") -> bool:
"""
Check if `self == other` can ever have non-False entries.
"""
other = _unpack_nested_dtype(other)
dtype = other.dtype
return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)

def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
Can we compare values of the given dtype to our own?
Expand Down Expand Up @@ -6119,3 +6164,24 @@ def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]:
name_sets = [{*ns} for ns in zip_longest(*name_tups)]
names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets)
return names


def _unpack_nested_dtype(other: Index) -> Index:
"""
When checking if our dtype is comparable with another, we need
to unpack CategoricalDtype to look at its categories.dtype.

Parameters
----------
other : Index

Returns
-------
Index
"""
dtype = other.dtype
if is_categorical_dtype(dtype):
# If there is ever a SparseIndex, this could get dispatched
# here too.
return dtype.categories
return other
9 changes: 3 additions & 6 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,13 +452,10 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
target = ensure_index(target)

if isinstance(target, PeriodIndex):
if not self._is_comparable_dtype(target.dtype):
# i.e. target.freq != self.freq
# No matches
no_matches = -1 * np.ones(self.shape, dtype=np.intp)
return no_matches
if not self._should_compare(target):
return self._get_indexer_non_comparable(target, method, unique=True)

if isinstance(target, PeriodIndex):
target = target._get_engine_target() # i.e. target.asi8
self_index = self._int64index
else:
Expand Down
53 changes: 53 additions & 0 deletions pandas/tests/indexes/period/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,28 @@
)
import pandas._testing as tm

dti4 = date_range("2016-01-01", periods=4)
dti = dti4[:-1]
rng = pd.Index(range(3))


@pytest.fixture(
params=[
dti,
dti.tz_localize("UTC"),
dti.to_period("W"),
dti - dti[0],
rng,
pd.Index([1, 2, 3]),
pd.Index([2.0, 3.0, 4.0]),
pd.Index([4, 5, 6], dtype="u8"),
pd.IntervalIndex.from_breaks(dti4),
]
)
def non_comparable_idx(request):
# All have length 3
return request.param


class TestGetItem:
def test_ellipsis(self):
Expand Down Expand Up @@ -438,6 +460,37 @@ def test_get_indexer_mismatched_dtype(self):
result = pi.get_indexer_non_unique(pi2)[0]
tm.assert_numpy_array_equal(result, expected)

def test_get_indexer_mismatched_dtype_different_length(self, non_comparable_idx):
# without method we arent checking inequalities, so get all-missing
# but do not raise
dti = date_range("2016-01-01", periods=3)
pi = dti.to_period("D")

other = non_comparable_idx

res = pi[:-1].get_indexer(other)
expected = -np.ones(other.shape, dtype=np.intp)
tm.assert_numpy_array_equal(res, expected)

@pytest.mark.parametrize("method", ["pad", "backfill", "nearest"])
def test_get_indexer_mismatched_dtype_with_method(self, non_comparable_idx, method):
dti = date_range("2016-01-01", periods=3)
pi = dti.to_period("D")

other = non_comparable_idx

msg = re.escape(f"Cannot compare dtypes {pi.dtype} and {other.dtype}")
with pytest.raises(TypeError, match=msg):
pi.get_indexer(other, method=method)

for dtype in ["object", "category"]:
other2 = other.astype(dtype)
if dtype == "object" and isinstance(other, PeriodIndex):
continue
# For object dtype we are liable to get a different exception message
with pytest.raises(TypeError):
pi.get_indexer(other2, method=method)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback notice in these cases we are currently raising in master bc the scalar comparisons raise

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this new?

get_indexer is not super public but i believe it will never raise

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're right that get_indexer with method=None should never raise (maybe with tzawareness corner cases), but with method="ffill" the following raises on master:

dti = pd.date_range("2016-01-01", periods=3)
rng = pd.Index(range(5))

>>> dti.get_indexer(rng, method="ffill")
TypeError: '<' not supported between instances of 'int' and 'Timestamp'


def test_get_indexer_non_unique(self):
# GH 17717
p1 = Period("2017-09-02")
Expand Down