diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx
index dc3bb09c1b462..b908fa2c65e4d 100644
--- a/pandas/_libs/join.pyx
+++ b/pandas/_libs/join.pyx
@@ -264,6 +264,9 @@ def left_join_indexer_unique(
     ndarray[numeric_object_t] left,
     ndarray[numeric_object_t] right
 ):
+    """
+    Both left and right are strictly monotonic increasing.
+    """
     cdef:
         Py_ssize_t i, j, nleft, nright
         ndarray[intp_t] indexer
@@ -311,6 +314,9 @@ def left_join_indexer_unique(
 def left_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right):
     """
     Two-pass algorithm for monotonic indexes. Handles many-to-one merges.
+
+    Both left and right are monotonic increasing, but at least one of them
+    is non-unique (if both were unique we'd use left_join_indexer_unique).
     """
     cdef:
         Py_ssize_t i, j, k, nright, nleft, count
@@ -321,6 +327,7 @@ def left_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
     nleft = len(left)
     nright = len(right)

+    # First pass is to find the size 'count' of our output indexers.
     i = 0
     j = 0
     count = 0
@@ -334,6 +341,8 @@ def left_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
             rval = right[j]

             if lval == rval:
+                # This block is identical across
+                # left_join_indexer, inner_join_indexer, outer_join_indexer
                 count += 1
                 if i < nleft - 1:
                     if j < nright - 1 and right[j + 1] == rval:
@@ -398,12 +407,14 @@ def left_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
                 # end of the road
                 break
             elif lval < rval:
+                # i.e. lval not in right; we keep for left_join_indexer
                 lindexer[count] = i
                 rindexer[count] = -1
-                result[count] = left[i]
+                result[count] = lval
                 count += 1
                 i += 1
             else:
+                # i.e. rval not in left; we discard for left_join_indexer
                 j += 1

     return result, lindexer, rindexer
@@ -414,6 +425,8 @@ def left_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
 def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right):
     """
     Two-pass algorithm for monotonic indexes. Handles many-to-one merges.
+
+    Both left and right are monotonic increasing but not necessarily unique.
     """
     cdef:
         Py_ssize_t i, j, k, nright, nleft, count
@@ -424,6 +437,7 @@ def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
     nleft = len(left)
     nright = len(right)

+    # First pass is to find the size 'count' of our output indexers.
     i = 0
     j = 0
     count = 0
@@ -453,8 +467,10 @@ def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
                 # end of the road
                 break
             elif lval < rval:
+                # i.e. lval not in right; we discard for inner_indexer
                 i += 1
             else:
+                # i.e. rval not in left; we discard for inner_indexer
                 j += 1

     # do it again now that result size is known
@@ -478,7 +494,7 @@ def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
             if lval == rval:
                 lindexer[count] = i
                 rindexer[count] = j
-                result[count] = rval
+                result[count] = lval
                 count += 1
                 if i < nleft - 1:
                     if j < nright - 1 and right[j + 1] == rval:
@@ -495,8 +511,10 @@ def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
                 # end of the road
                 break
             elif lval < rval:
+                # i.e. lval not in right; we discard for inner_indexer
                 i += 1
             else:
+                # i.e. rval not in left; we discard for inner_indexer
                 j += 1

     return result, lindexer, rindexer
@@ -505,6 +523,9 @@ def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right):
+    """
+    Both left and right are monotonic increasing but not necessarily unique.
+    """
     cdef:
         Py_ssize_t i, j, nright, nleft, count
         numeric_object_t lval, rval
@@ -514,6 +535,9 @@ def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
     nleft = len(left)
     nright = len(right)

+    # First pass is to find the size 'count' of our output indexers.
+    # count will be length of left plus the number of elements of right not in
+    # left (counting duplicates)
     i = 0
     j = 0
     count = 0
@@ -616,12 +640,14 @@ def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
                 # end of the road
                 break
             elif lval < rval:
+                # i.e. lval not in right; we keep for outer_join_indexer
                 lindexer[count] = i
                 rindexer[count] = -1
                 result[count] = lval
                 count += 1
                 i += 1
             else:
+                # i.e. rval not in left; we keep for outer_join_indexer
                 lindexer[count] = -1
                 rindexer[count] = j
                 result[count] = rval
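Illustrative note (not part of the patch): the docstrings and "First pass" comments added above all describe the same two-pass, two-pointer pattern over monotonic inputs. Below is a hedged plain-Python sketch of that idea for an inner join on sorted, unique arrays; the real Cython routines additionally handle duplicate values, and the name inner_join_indexer_sketch is invented for this note, not pandas API.

    import numpy as np

    def inner_join_indexer_sketch(left, right):
        # First pass: walk both sorted arrays to find the output size 'count'.
        i = j = count = 0
        while i < len(left) and j < len(right):
            if left[i] == right[j]:
                count += 1
                i += 1
                j += 1
            elif left[i] < right[j]:
                i += 1          # left[i] not in right; discard for inner join
            else:
                j += 1          # right[j] not in left; discard for inner join

        # Allocate exactly-sized outputs, then "do it again" to fill them.
        lindexer = np.empty(count, dtype=np.intp)
        rindexer = np.empty(count, dtype=np.intp)
        result = np.empty(count, dtype=left.dtype)

        i = j = count = 0
        while i < len(left) and j < len(right):
            if left[i] == right[j]:
                lindexer[count] = i
                rindexer[count] = j
                result[count] = left[i]
                count += 1
                i += 1
                j += 1
            elif left[i] < right[j]:
                i += 1
            else:
                j += 1
        return result, lindexer, rindexer

For example, inner_join_indexer_sketch(np.array([1, 3, 5, 7]), np.array([3, 4, 5])) returns result [3, 5] with lindexer [1, 2] and rindexer [0, 2].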
diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx
index 2c4b420656259..c1915e719f515 100644
--- a/pandas/_libs/tslibs/fields.pyx
+++ b/pandas/_libs/tslibs/fields.pyx
@@ -198,7 +198,7 @@ cdef inline bint _is_on_month(int month, int compare_month, int modby) nogil:
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def get_start_end_field(const int64_t[:] dtindex, str field,
-                        object freqstr=None, int month_kw=12):
+                        str freqstr=None, int month_kw=12):
     """
     Given an int64-based datetime index return array of indicators
     of whether timestamps are at the start/end of the month/quarter/year
diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py
index a8f69497d4019..ac27aaa42d151 100644
--- a/pandas/core/array_algos/putmask.py
+++ b/pandas/core/array_algos/putmask.py
@@ -9,7 +9,10 @@
 import numpy as np

 from pandas._libs import lib
-from pandas._typing import ArrayLike
+from pandas._typing import (
+    ArrayLike,
+    npt,
+)

 from pandas.core.dtypes.cast import (
     convert_scalar_for_putitemlike,
@@ -26,13 +29,14 @@
 from pandas.core.arrays import ExtensionArray


-def putmask_inplace(values: ArrayLike, mask: np.ndarray, value: Any) -> None:
+def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None:
     """
     ExtensionArray-compatible implementation of np.putmask. The main
     difference is we do not handle repeating or truncating like numpy.

     Parameters
     ----------
+    values: np.ndarray or ExtensionArray
     mask : np.ndarray[bool]
         We assume extract_bool_array has already been called.
     value : Any
@@ -51,6 +55,7 @@ def putmask_inplace(values: ArrayLike, mask: np.ndarray, value: Any) -> None:
             )
         ):
             # GH#19266 using np.putmask gives unexpected results with listlike value
+            # along with object dtype
             if is_list_like(value) and len(value) == len(values):
                 values[mask] = value[mask]
             else:
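Illustrative note (not part of the patch): the new `values` docstring entry and the extended GH#19266 comment both concern putmask_inplace's fallback path. As a hedged sketch of that dispatch, simplified to plain ndarrays (the helper name putmask_inplace_sketch is invented here), the aligned-assignment branch behaves roughly like:

    import numpy as np
    from pandas.api.types import is_list_like

    def putmask_inplace_sketch(values: np.ndarray, mask: np.ndarray, value) -> None:
        if is_list_like(value) and len(value) == len(values):
            # Align value with values element-by-element under the mask,
            # rather than letting np.putmask repeat or truncate the value.
            values[mask] = np.asarray(value)[mask]
        else:
            np.putmask(values, mask, value)

    arr = np.array([1, 2, 3, 4], dtype=object)
    putmask_inplace_sketch(arr, np.array([False, True, False, True]), [10, 20, 30, 40])
    # arr is now [1, 20, 3, 40]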
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 6be2e803b5910..21675ca0cdc7c 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -1259,7 +1259,6 @@ def __from_arrow__(
         return IntervalArray._concat_same_type(results)

     def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
-        # NB: this doesn't handle checking for closed match
         if not all(isinstance(x, IntervalDtype) for x in dtypes):
             return None

diff --git a/pandas/core/indexers/utils.py b/pandas/core/indexers/utils.py
index bf901683de602..b1824413512c5 100644
--- a/pandas/core/indexers/utils.py
+++ b/pandas/core/indexers/utils.py
@@ -104,14 +104,14 @@ def is_scalar_indexer(indexer, ndim: int) -> bool:
     return False


-def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool:
+def is_empty_indexer(indexer, arr_value: ArrayLike) -> bool:
     """
     Check if we have an empty indexer.

     Parameters
     ----------
     indexer : object
-    arr_value : np.ndarray
+    arr_value : np.ndarray or ExtensionArray

     Returns
     -------
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 926ab0b544abd..0cbe16c9aaf13 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3123,7 +3123,9 @@ def _union(self, other: Index, sort):
             and not (self.has_duplicates and other.has_duplicates)
             and self._can_use_libjoin
         ):
-            # Both are unique and monotonic, so can use outer join
+            # Both are monotonic and at least one is unique, so can use outer join
+            # (actually don't need either unique, but without this restriction
+            # test_union_same_value_duplicated_in_both fails)
             try:
                 return self._outer_indexer(other)[0]
             except (TypeError, IncompatibleFrequency):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 8327e5f1bb532..751cf41a09f14 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -918,7 +918,7 @@ def setitem(self, indexer, value):
         check_setitem_lengths(indexer, value, values)

         if is_empty_indexer(indexer, arr_value):
-            # GH#8669 empty indexers
+            # GH#8669 empty indexers, test_loc_setitem_boolean_mask_allfalse
             pass

         elif is_scalar_indexer(indexer, self.ndim):
@@ -1698,7 +1698,7 @@ def putmask(self, mask, new) -> list[Block]:
         mask = extract_bool_array(mask)

         if not self._can_hold_element(new):
-            return self.astype(_dtype_obj).putmask(mask, new)
+            return self.coerce_to_target_dtype(new).putmask(mask, new)

         arr = self.values
         arr.T.putmask(mask, new)
@@ -1755,7 +1755,9 @@ def fillna(
             # We support filling a DatetimeTZ with a `value` whose timezone
             # is different by coercing to object.
             # TODO: don't special-case td64
-            return self.astype(_dtype_obj).fillna(value, limit, inplace, downcast)
+            return self.coerce_to_target_dtype(value).fillna(
+                value, limit, inplace, downcast
+            )

         values = self.values
         values = values if inplace else values.copy()
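Illustrative note (not part of the patch): the two blocks.py changes above route the "value cannot be held by this block" fallback through coerce_to_target_dtype instead of an unconditional cast to object. At the user level, the scenario described by the surviving fillna comment looks roughly like the following hedged example; it only illustrates the code path, and the resulting dtype depends on what coerce_to_target_dtype picks (historically object for a mismatched timezone):

    import pandas as pd

    ser = pd.Series(pd.to_datetime(["2021-01-01", None]).tz_localize("US/Eastern"))

    # The fill value's timezone differs from the block's, so the block cannot
    # hold it directly and is coerced to another dtype before filling.
    filled = ser.fillna(pd.Timestamp("2021-01-04", tz="UTC"))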
diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
index 7765c29ee59c8..f0d01f8727d5a 100644
--- a/pandas/tests/arithmetic/test_timedelta64.py
+++ b/pandas/tests/arithmetic/test_timedelta64.py
@@ -2075,7 +2075,7 @@ def test_td64arr_div_numeric_array(
             with pytest.raises(TypeError, match=pattern):
                 vector.astype(object) / tdser

-    def test_td64arr_mul_int_series(self, box_with_array, names, request):
+    def test_td64arr_mul_int_series(self, box_with_array, names):
         # GH#19042 test for correct name attachment
         box = box_with_array
         exname = get_expected_name(box, names)
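Illustrative note (not part of the patch): the test change above only removes the request fixture from the signature; the test itself exercises GH#19042 name propagation. As a hedged Series-level illustration of what "correct name attachment" means (the names "left" and "right" are invented for this note):

    import pandas as pd

    tdser = pd.Series(pd.timedelta_range("1 day", periods=3), name="left")
    intser = pd.Series([1, 2, 3], name="right")

    # The operands' names differ, so the result's name is dropped; if both
    # Series shared the same name, the product would keep it.
    assert (tdser * intser).name is None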