Skip to content

REF: implement _should_compare #38105

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 2, 2020
35 changes: 35 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4938,6 +4938,25 @@ def get_indexer_for(self, target, **kwargs):
indexer, _ = self.get_indexer_non_unique(target)
return indexer

def _get_indexer_non_comparable(self, target, method, unique: bool = True):
"""
If we have non-comparable dtypes, there will never be any matches.

For get_indexer calls with a method, this is an _inequality_ check
which raises TypeError.
"""
if method is not None:
other = self._get_other_deep(target)
raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")

no_matches = -1 * np.ones(target.shape, dtype=np.intp)
if unique:
# This is for get_indexer
return no_matches
else:
# This is for get_indexer_non_unique
return no_matches, no_matches
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback theres some ambiguity as to what we're supposed to be returning here. ATM in Index.get_indexer_non_unique we are returning two ndarrays of minus-ones, but in IntervalIndex the second one is an np.arange(len(target))

The other inconsistency is that ATM PeriodIndex.get_indexer defines no_matches = -1 * np.ones(self.shape, dtype=np.intp) whereas i expected it to use target.shape

Can you clear this up for me?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looking at this more, i increasingly think the second one should be np.arange(target.shape, dtype=np.intp)


@property
def _index_as_unique(self):
"""
Expand Down Expand Up @@ -4973,6 +4992,22 @@ def _maybe_promote(self, other: "Index"):

return self, other

def _get_other_deep(self, other: "Index") -> "Index":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should just be a function, no? its also a funny name. I think we have other similar things (e.g. this look like what .to_dense() does but for dtypes and not values).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated to _unpack_nested_dtypes function

dtype = other.dtype
if is_categorical_dtype(dtype):
# If there is ever a SparseIndex, this could get dispatched
# here too.
return dtype.categories
return other

def _should_compare(self, other: "Index") -> bool:
"""
Check if `self == other` can ever have non-False entries.
"""
other = self._get_other_deep(other)
dtype = other.dtype
return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)

def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
Can we compare values of the given dtype to our own?
Expand Down
9 changes: 3 additions & 6 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,13 +452,10 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
target = ensure_index(target)

if isinstance(target, PeriodIndex):
if not self._is_comparable_dtype(target.dtype):
# i.e. target.freq != self.freq
# No matches
no_matches = -1 * np.ones(self.shape, dtype=np.intp)
return no_matches
if not self._should_compare(target):
return self._get_indexer_non_comparable(target, method, unique=True)

if isinstance(target, PeriodIndex):
target = target._get_engine_target() # i.e. target.asi8
self_index = self._int64index
else:
Expand Down
53 changes: 53 additions & 0 deletions pandas/tests/indexes/period/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,28 @@
)
import pandas._testing as tm

dti4 = date_range("2016-01-01", periods=4)
dti = dti4[:-1]
rng = pd.Index(range(3))


@pytest.fixture(
params=[
dti,
dti.tz_localize("UTC"),
dti.to_period("W"),
dti - dti[0],
rng,
pd.Index([1, 2, 3]),
pd.Index([2.0, 3.0, 4.0]),
pd.Index([4, 5, 6], dtype="u8"),
pd.IntervalIndex.from_breaks(dti4),
]
)
def non_comparable(request):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

non_comparable_idx

# All have length 3
return request.param


class TestGetItem:
def test_ellipsis(self):
Expand Down Expand Up @@ -438,6 +460,37 @@ def test_get_indexer_mismatched_dtype(self):
result = pi.get_indexer_non_unique(pi2)[0]
tm.assert_numpy_array_equal(result, expected)

def test_get_indexer_mismatched_dtype_different_length(self, non_comparable):
# without method we arent checking inequalities, so get all-missing
# but do not raise
dti = date_range("2016-01-01", periods=3)
pi = dti.to_period("D")

other = non_comparable

res = pi[:-1].get_indexer(other)
expected = -np.ones(other.shape, dtype=np.intp)
tm.assert_numpy_array_equal(res, expected)

@pytest.mark.parametrize("method", ["pad", "backfill", "nearest"])
def test_get_indexer_mismatched_dtype_with_method(self, non_comparable, method):
dti = date_range("2016-01-01", periods=3)
pi = dti.to_period("D")

other = non_comparable

msg = re.escape(f"Cannot compare dtypes {pi.dtype} and {other.dtype}")
with pytest.raises(TypeError, match=msg):
pi.get_indexer(other, method=method)

for dtype in ["object", "category"]:
other2 = other.astype(dtype)
if dtype == "object" and isinstance(other, PeriodIndex):
continue
# For object dtype we are liable to get a different exception message
with pytest.raises(TypeError):
pi.get_indexer(other2, method=method)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback notice in these cases we are currently raising in master bc the scalar comparisons raise

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this new?

get_indexer is not super public but i believe it will never raise

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're right that get_indexer with method=None should never raise (maybe with tzawareness corner cases), but with method="ffill" the following raises on master:

dti = pd.date_range("2016-01-01", periods=3)
rng = pd.Index(range(5))

>>> dti.get_indexer(rng, method="ffill")
TypeError: '<' not supported between instances of 'int' and 'Timestamp'


def test_get_indexer_non_unique(self):
# GH 17717
p1 = Period("2017-09-02")
Expand Down