From 4cab96a8ac055dfba84ca78ccf2832530ffcc437 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Mar 2021 16:22:57 -0700 Subject: [PATCH 1/5] CLN libjoin int64->intp --- pandas/_libs/join.pyx | 62 +++++++++--------- pandas/core/reshape/merge.py | 112 +++++++++++++++++++++------------ pandas/tests/libs/test_join.py | 2 +- 3 files changed, 103 insertions(+), 73 deletions(-) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 7888a15a7cb26..b69b89c0de019 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -231,7 +231,7 @@ cdef ndarray[intp_t] _get_result_indexer( return res -def ffill_indexer(const intp_t[:] indexer): +def ffill_indexer(const intp_t[:] indexer) -> np.ndarray: cdef: Py_ssize_t i, n = len(indexer) ndarray[intp_t] result @@ -275,7 +275,7 @@ ctypedef fused join_t: def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): cdef: Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer + ndarray[intp_t] indexer join_t lval, rval i = 0 @@ -283,7 +283,7 @@ def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): nleft = len(left) nright = len(right) - indexer = np.empty(nleft, dtype=np.int64) + indexer = np.empty(nleft, dtype=np.intp) while True: if i == nleft: break @@ -324,7 +324,7 @@ def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): cdef: Py_ssize_t i, j, k, nright, nleft, count join_t lval, rval - ndarray[int64_t] lindexer, rindexer + ndarray[intp_t] lindexer, rindexer ndarray[join_t] result nleft = len(left) @@ -366,8 +366,8 @@ def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): # do it again now that result size is known - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) + lindexer = np.empty(count, dtype=np.intp) + rindexer = np.empty(count, dtype=np.intp) result = np.empty(count, dtype=left.dtype) i = 0 @@ -427,7 +427,7 @@ def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): cdef: Py_ssize_t i, j, k, nright, nleft, count join_t lval, rval - ndarray[int64_t] lindexer, rindexer + ndarray[intp_t] lindexer, rindexer ndarray[join_t] result nleft = len(left) @@ -468,8 +468,8 @@ def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): # do it again now that result size is known - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) + lindexer = np.empty(count, dtype=np.intp) + rindexer = np.empty(count, dtype=np.intp) result = np.empty(count, dtype=left.dtype) i = 0 @@ -517,7 +517,7 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): cdef: Py_ssize_t i, j, nright, nleft, count join_t lval, rval - ndarray[int64_t] lindexer, rindexer + ndarray[intp_t] lindexer, rindexer ndarray[join_t] result nleft = len(left) @@ -564,8 +564,8 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): count += 1 j += 1 - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) + lindexer = np.empty(count, dtype=np.intp) + rindexer = np.empty(count, dtype=np.intp) result = np.empty(count, dtype=left.dtype) # do it again, but populate the indexers / result @@ -673,12 +673,12 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, asof_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, - bint allow_exact_matches=1, + bint allow_exact_matches=True, tolerance=None): cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer + ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 @@ -693,8 +693,8 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, left_size = len(left_values) right_size = len(right_values) - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) + left_indexer = np.empty(left_size, dtype=np.intp) + right_indexer = np.empty(left_size, dtype=np.intp) if by_t is object: hash_table = PyObjectHashTable(right_size) @@ -747,7 +747,7 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos - ndarray[int64_t] left_indexer, right_indexer + ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 @@ -762,8 +762,8 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, left_size = len(left_values) right_size = len(right_values) - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) + left_indexer = np.empty(left_size, dtype=np.intp) + right_indexer = np.empty(left_size, dtype=np.intp) if by_t is object: hash_table = PyObjectHashTable(right_size) @@ -816,14 +816,14 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, cdef: Py_ssize_t left_size, right_size, i - ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri + ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri asof_t bdiff, fdiff left_size = len(left_values) right_size = len(right_values) - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) + left_indexer = np.empty(left_size, dtype=np.intp) + right_indexer = np.empty(left_size, dtype=np.intp) # search both forward and backward bli, bri = asof_join_backward_on_X_by_Y( @@ -867,7 +867,7 @@ def asof_join_backward(asof_t[:] left_values, cdef: Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer + ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 @@ -880,8 +880,8 @@ def asof_join_backward(asof_t[:] left_values, left_size = len(left_values) right_size = len(right_values) - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) + left_indexer = np.empty(left_size, dtype=np.intp) + right_indexer = np.empty(left_size, dtype=np.intp) right_pos = 0 for left_pos in range(left_size): @@ -920,7 +920,7 @@ def asof_join_forward(asof_t[:] left_values, cdef: Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[int64_t] left_indexer, right_indexer + ndarray[intp_t] left_indexer, right_indexer bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 @@ -933,8 +933,8 @@ def asof_join_forward(asof_t[:] left_values, left_size = len(left_values) right_size = len(right_values) - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) + left_indexer = np.empty(left_size, dtype=np.intp) + right_indexer = np.empty(left_size, dtype=np.intp) right_pos = right_size - 1 for left_pos in range(left_size - 1, -1, -1): @@ -974,14 +974,14 @@ def asof_join_nearest(asof_t[:] left_values, cdef: Py_ssize_t left_size, right_size, i - ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri + ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri asof_t bdiff, fdiff left_size = len(left_values) right_size = len(right_values) - left_indexer = np.empty(left_size, dtype=np.int64) - right_indexer = np.empty(left_size, dtype=np.int64) + left_indexer = np.empty(left_size, dtype=np.intp) + right_indexer = np.empty(left_size, dtype=np.intp) # search both forward and backward bli, bri = asof_join_backward(left_values, right_values, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0cb9725b70f44..1bbc7ba2bec9c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -28,6 +28,7 @@ ) from pandas._typing import ( ArrayLike, + DtypeObj, FrameOrSeries, FrameOrSeriesUnion, IndexLabel, @@ -286,7 +287,7 @@ def merge_ordered( 9 e 3 b 3.0 """ - def _merger(x, y): + def _merger(x, y) -> DataFrame: # perform the ordered merge operation op = _OrderedMerge( x, @@ -741,7 +742,9 @@ def get_result(self) -> DataFrame: return result.__finalize__(self, method="merge") - def _maybe_drop_cross_column(self, result: DataFrame, cross_col: Optional[str]): + def _maybe_drop_cross_column( + self, result: DataFrame, cross_col: Optional[str] + ) -> None: if cross_col is not None: result.drop(columns=cross_col, inplace=True) @@ -824,7 +827,15 @@ def _maybe_restore_index_levels(self, result: DataFrame) -> None: if names_to_restore: result.set_index(names_to_restore, inplace=True) - def _maybe_add_join_keys(self, result, left_indexer, right_indexer): + def _maybe_add_join_keys( + self, + result: DataFrame, + left_indexer: Optional[np.ndarray], + right_indexer: Optional[np.ndarray], + ) -> None: + + assert left_indexer is None or left_indexer.dtype == np.intp + assert right_indexer is None or right_indexer.dtype == np.intp left_has_missing = None right_has_missing = None @@ -891,9 +902,14 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): # make sure to just use the right values or vice-versa mask_left = left_indexer == -1 mask_right = right_indexer == -1 - if mask_left.all(): + # error: Item "bool" of "Union[Any, bool]" has no attribute "all" + if mask_left.all(): # type: ignore[union-attr] key_col = Index(rvals) - elif right_indexer is not None and mask_right.all(): + # error: Item "bool" of "Union[Any, bool]" has no attribute "all" + elif ( + right_indexer is not None + and mask_right.all() # type: ignore[union-attr] + ): key_col = Index(lvals) else: key_col = Index(lvals).where(~mask_left, rvals) @@ -916,13 +932,17 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): else: result.insert(i, name or f"key_{i}", key_col) - def _get_join_indexers(self): + def _get_join_indexers(self) -> tuple[np.ndarray, np.ndarray]: """ return the join indexers """ + # Both returned ndarrays are np.intp return get_join_indexers( self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how ) - def _get_join_info(self): + def _get_join_info( + self, + ) -> tuple[Index, np.ndarray | None, np.ndarray | None]: + # Both returned ndarrays are np.intp (if not None) left_ax = self.left.axes[self.axis] right_ax = self.right.axes[self.axis] @@ -952,7 +972,7 @@ def _get_join_info(self): ) else: join_index = self.right.index.take(right_indexer) - left_indexer = np.array([-1] * len(join_index)) + left_indexer = np.array([-1] * len(join_index), dtype=np.intp) elif self.left_index: if len(self.right) > 0: join_index = self._create_join_index( @@ -963,7 +983,7 @@ def _get_join_info(self): ) else: join_index = self.left.index.take(left_indexer) - right_indexer = np.array([-1] * len(join_index)) + right_indexer = np.array([-1] * len(join_index), dtype=np.intp) else: join_index = Index(np.arange(len(left_indexer))) @@ -975,7 +995,7 @@ def _create_join_index( self, index: Index, other_index: Index, - indexer, + indexer: np.ndarray, how: str = "left", ) -> Index: """ @@ -983,14 +1003,15 @@ def _create_join_index( Parameters ---------- - index: Index being rearranged - other_index: Index used to supply values not found in index - indexer: how to rearrange index - how: replacement is only necessary if indexer based on other_index + index : Index being rearranged + other_index : Index used to supply values not found in index + indexer : np.ndarray[np.intp] how to rearrange index + how : str + Replacement is only necessary if indexer based on other_index. Returns ------- - join_index + Index """ if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): # if final index requires values in other_index but not target @@ -1263,8 +1284,8 @@ def _create_cross_configuration( Parameters ---------- - left: DataFrame - right DataFrame + left : DataFrame + right : DataFrame Returns ------- @@ -1419,21 +1440,22 @@ def _validate(self, validate: str) -> None: def get_join_indexers( left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs -): +) -> tuple[np.ndarray, np.ndarray]: """ Parameters ---------- - left_keys: ndarray, Index, Series - right_keys: ndarray, Index, Series - sort: bool, default False - how: string {'inner', 'outer', 'left', 'right'}, default 'inner' + left_keys : ndarray, Index, Series + right_keys : ndarray, Index, Series + sort : bool, default False + how : {'inner', 'outer', 'left', 'right'}, default 'inner' Returns ------- - tuple of (left_indexer, right_indexer) - indexers into the left_keys, right_keys - + np.ndarray[np.intp] + Indexer into the left_keys. + np.ndarray[np.intp] + Indexer into the right_keys. """ assert len(left_keys) == len( right_keys @@ -1499,9 +1521,9 @@ def restore_dropped_levels_multijoin( join_index : Index the index of the join between the common levels of left and right - lindexer : intp array + lindexer : np.ndarray[np.intp] left indexer - rindexer : intp array + rindexer : np.ndarray[np.intp] right indexer Returns @@ -1515,7 +1537,7 @@ def restore_dropped_levels_multijoin( """ - def _convert_to_multiindex(index) -> MultiIndex: + def _convert_to_multiindex(index: Index) -> MultiIndex: if isinstance(index, MultiIndex): return index else: @@ -1649,7 +1671,7 @@ def _asof_by_function(direction: str): } -def _get_cython_type_upcast(dtype) -> str: +def _get_cython_type_upcast(dtype: DtypeObj) -> str: """ Upcast a dtype to 'int64_t', 'double', or 'object' """ if is_integer_dtype(dtype): return "int64_t" @@ -1706,7 +1728,7 @@ def __init__( fill_method=fill_method, ) - def _validate_specification(self): + def _validate_specification(self) -> None: super()._validate_specification() # we only allow on to be a single item for on @@ -1839,7 +1861,8 @@ def _get_merge_keys(self): return left_join_keys, right_join_keys, join_names - def _get_join_indexers(self): + def _get_join_indexers(self) -> tuple[np.ndarray, np.ndarray]: + # Both returned ndarrays are np.intp """ return the join indexers """ def flip(xs) -> np.ndarray: @@ -1929,7 +1952,10 @@ def flip(xs) -> np.ndarray: return func(left_values, right_values, self.allow_exact_matches, tolerance) -def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool): +def _get_multiindex_indexer( + join_keys, index: MultiIndex, sort: bool +) -> tuple[np.ndarray, np.ndarray]: + # Both returned ndarrays are np.intp # left & right join labels and num. of levels at each location mapped = ( @@ -1965,17 +1991,19 @@ def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool): return libjoin.left_outer_join(lkey, rkey, count, sort=sort) -def _get_single_indexer(join_key, index, sort: bool = False): - left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) - - left_indexer, right_indexer = libjoin.left_outer_join( - left_key, right_key, count, sort=sort - ) +def _get_single_indexer( + join_key, index: Index, sort: bool = False +) -> tuple[np.ndarray, np.ndarray]: + # Both returned ndarrays are np.intp + left_key, right_key, count = _factorize_keys(join_key, index._values, sort=sort) - return left_indexer, right_indexer + return libjoin.left_outer_join(left_key, right_key, count, sort=sort) -def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False): +def _left_join_on_index( + left_ax: Index, right_ax: Index, join_keys, sort: bool = False +) -> tuple[Index, np.ndarray | None, np.ndarray]: + # Both returned ndarrays are np.intp (if not None) if len(join_keys) > 1: if not ( isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels @@ -2212,7 +2240,9 @@ def _validate_operand(obj: FrameOrSeries) -> DataFrame: ) -def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Suffixes): +def _items_overlap_with_suffix( + left: Index, right: Index, suffixes: Suffixes +) -> tuple[Index, Index]: """ Suffixes type validation. diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py index eeb66f8941260..fe8a928c61ae7 100644 --- a/pandas/tests/libs/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -148,7 +148,7 @@ def test_left_join_indexer_unique(readonly): b.setflags(write=False) result = libjoin.left_join_indexer_unique(b, a) - expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) + expected = np.array([1, 1, 2, 3, 3], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) From 1ce37255cfa1fbb09d13182d7084a744f76da890 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Mar 2021 17:19:09 -0700 Subject: [PATCH 2/5] update join test expecteds --- pandas/tests/indexes/period/test_join.py | 4 +- pandas/tests/libs/test_join.py | 48 ++++++++++++------------ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pandas/tests/indexes/period/test_join.py b/pandas/tests/indexes/period/test_join.py index aa2393aceee52..77dcd38b239ec 100644 --- a/pandas/tests/indexes/period/test_join.py +++ b/pandas/tests/indexes/period/test_join.py @@ -17,8 +17,8 @@ def test_join_outer_indexer(self): result = pi._outer_indexer(pi._values, pi._values) tm.assert_extension_array_equal(result[0], pi._values) - tm.assert_numpy_array_equal(result[1], np.arange(len(pi), dtype=np.int64)) - tm.assert_numpy_array_equal(result[2], np.arange(len(pi), dtype=np.int64)) + tm.assert_numpy_array_equal(result[1], np.arange(len(pi), dtype=np.intp)) + tm.assert_numpy_array_equal(result[2], np.arange(len(pi), dtype=np.intp)) def test_joins(self, join_type): index = period_range("1/1/2000", "1/20/2000", freq="D") diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py index fe8a928c61ae7..17601d30739e3 100644 --- a/pandas/tests/libs/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -26,23 +26,23 @@ def test_outer_join_indexer(self, dtype): assert isinstance(lindexer, np.ndarray) assert isinstance(rindexer, np.ndarray) tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype)) - exp = np.array([0, 1, 2, -1, -1], dtype=np.int64) + exp = np.array([0, 1, 2, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([-1, -1, 0, 1, 2], dtype=np.int64) + exp = np.array([-1, -1, 0, 1, 2], dtype=np.intp) tm.assert_numpy_array_equal(rindexer, exp) result, lindexer, rindexer = indexer(empty, right) tm.assert_numpy_array_equal(result, right) - exp = np.array([-1, -1, -1], dtype=np.int64) + exp = np.array([-1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([0, 1, 2], dtype=np.int64) + exp = np.array([0, 1, 2], dtype=np.intp) tm.assert_numpy_array_equal(rindexer, exp) result, lindexer, rindexer = indexer(left, empty) tm.assert_numpy_array_equal(result, left) - exp = np.array([0, 1, 2], dtype=np.int64) + exp = np.array([0, 1, 2], dtype=np.intp) tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([-1, -1, -1], dtype=np.int64) + exp = np.array([-1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(rindexer, exp) def test_cython_left_outer_join(self): @@ -283,8 +283,8 @@ def test_inner_join_indexer(): index_exp = np.array([3, 5], dtype=np.int64) tm.assert_almost_equal(index, index_exp) - aexp = np.array([2, 4], dtype=np.int64) - bexp = np.array([1, 2], dtype=np.int64) + aexp = np.array([2, 4], dtype=np.intp) + bexp = np.array([1, 2], dtype=np.intp) tm.assert_almost_equal(ares, aexp) tm.assert_almost_equal(bres, bexp) @@ -293,8 +293,8 @@ def test_inner_join_indexer(): index, ares, bres = libjoin.inner_join_indexer(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp)) def test_outer_join_indexer(): @@ -306,8 +306,8 @@ def test_outer_join_indexer(): index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) tm.assert_almost_equal(index, index_exp) - aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) - bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64) + aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.intp) + bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) tm.assert_almost_equal(ares, aexp) tm.assert_almost_equal(bres, bexp) @@ -316,8 +316,8 @@ def test_outer_join_indexer(): index, ares, bres = libjoin.outer_join_indexer(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp)) def test_left_join_indexer(): @@ -328,8 +328,8 @@ def test_left_join_indexer(): tm.assert_almost_equal(index, a) - aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) - bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) + aexp = np.array([0, 1, 2, 3, 4], dtype=np.intp) + bexp = np.array([-1, -1, 1, -1, 2], dtype=np.intp) tm.assert_almost_equal(ares, aexp) tm.assert_almost_equal(bres, bexp) @@ -338,8 +338,8 @@ def test_left_join_indexer(): index, ares, bres = libjoin.left_join_indexer(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp)) def test_left_join_indexer2(): @@ -351,10 +351,10 @@ def test_left_join_indexer2(): exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) tm.assert_almost_equal(res, exp_res) - exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) tm.assert_almost_equal(lidx, exp_lidx) - exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) tm.assert_almost_equal(ridx, exp_ridx) @@ -367,10 +367,10 @@ def test_outer_join_indexer2(): exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) tm.assert_almost_equal(res, exp_res) - exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) tm.assert_almost_equal(lidx, exp_lidx) - exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) tm.assert_almost_equal(ridx, exp_ridx) @@ -383,8 +383,8 @@ def test_inner_join_indexer2(): exp_res = np.array([1, 1, 2, 5], dtype=np.int64) tm.assert_almost_equal(res, exp_res) - exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) + exp_lidx = np.array([0, 0, 1, 2], dtype=np.intp) tm.assert_almost_equal(lidx, exp_lidx) - exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) + exp_ridx = np.array([0, 1, 2, 3], dtype=np.intp) tm.assert_almost_equal(ridx, exp_ridx) From 9260d0c32a72003b70a16a8279981b7a8713de8f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Mar 2021 07:53:33 -0700 Subject: [PATCH 3/5] troubleshoot windows build --- pandas/core/reshape/merge.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1bbc7ba2bec9c..7c1b9161c34aa 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -834,8 +834,8 @@ def _maybe_add_join_keys( right_indexer: Optional[np.ndarray], ) -> None: - assert left_indexer is None or left_indexer.dtype == np.intp - assert right_indexer is None or right_indexer.dtype == np.intp + _assert_intp_or_none(left_indexer) + _assert_intp_or_none(right_indexer) left_has_missing = None right_has_missing = None @@ -950,17 +950,26 @@ def _get_join_info( join_index, left_indexer, right_indexer = left_ax.join( right_ax, how=self.how, return_indexers=True, sort=self.sort ) + _assert_intp_or_none(left_indexer) + _assert_intp_or_none(right_indexer) + elif self.right_index and self.how == "left": join_index, left_indexer, right_indexer = _left_join_on_index( left_ax, right_ax, self.left_join_keys, sort=self.sort ) + _assert_intp_or_none(left_indexer) + _assert_intp_or_none(right_indexer) elif self.left_index and self.how == "right": join_index, right_indexer, left_indexer = _left_join_on_index( right_ax, left_ax, self.right_join_keys, sort=self.sort ) + _assert_intp_or_none(left_indexer) + _assert_intp_or_none(right_indexer) else: (left_indexer, right_indexer) = self._get_join_indexers() + _assert_intp_or_none(left_indexer) + _assert_intp_or_none(right_indexer) if self.right_index: if len(self.left) > 0: @@ -2293,3 +2302,9 @@ def renamer(x, suffix): rrenamer = partial(renamer, suffix=rsuffix) return (left._transform_index(lrenamer), right._transform_index(rrenamer)) + + +def _assert_intp_or_none(obj: Optional[np.ndarray]) -> None: + if obj is not None: + assert isinstance(obj, np.ndarray) + assert obj.dtype == np.intp, obj.dtype From 0689fa20aa15de8b9fddf9473d1446fb9ec8f37f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Mar 2021 08:40:17 -0700 Subject: [PATCH 4/5] troubleshoot windows builds --- pandas/core/indexes/base.py | 6 ++++-- pandas/core/reshape/merge.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 094f4a67d2e61..0062a6dc24cd0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3925,7 +3925,7 @@ def join( if len(other) == 0 and how in ("left", "outer"): join_index = self._view() if return_indexers: - rindexer = np.repeat(-1, len(join_index)) + rindexer = np.repeat(np.intp(-1), len(join_index)) return join_index, None, rindexer else: return join_index @@ -3933,7 +3933,7 @@ def join( if len(self) == 0 and how in ("right", "outer"): join_index = other._view() if return_indexers: - lindexer = np.repeat(-1, len(join_index)) + lindexer = np.repeat(np.intp(-1), len(join_index)) return join_index, lindexer, None else: return join_index @@ -4006,10 +4006,12 @@ def join( lindexer = None else: lindexer = self.get_indexer(join_index) + assert lindexer.dtype == np.intp, (lindexer.dtype, lindexer) if join_index is other: rindexer = None else: rindexer = other.get_indexer(join_index) + assert rindexer.dtype == np.intp, (rindexer.dtype, rindexer) return join_index, lindexer, rindexer else: return join_index diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7c1b9161c34aa..17fb04f6c2f97 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2307,4 +2307,4 @@ def renamer(x, suffix): def _assert_intp_or_none(obj: Optional[np.ndarray]) -> None: if obj is not None: assert isinstance(obj, np.ndarray) - assert obj.dtype == np.intp, obj.dtype + assert obj.dtype == np.intp, (obj.dtype, obj) From c6db8e594e2d66bb8cdca90b04bab2172e32c1e8 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Mar 2021 09:30:52 -0700 Subject: [PATCH 5/5] strip troubleshooting code --- pandas/core/indexes/base.py | 2 -- pandas/core/reshape/merge.py | 17 ----------------- 2 files changed, 19 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0062a6dc24cd0..af3315dd2ade6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4006,12 +4006,10 @@ def join( lindexer = None else: lindexer = self.get_indexer(join_index) - assert lindexer.dtype == np.intp, (lindexer.dtype, lindexer) if join_index is other: rindexer = None else: rindexer = other.get_indexer(join_index) - assert rindexer.dtype == np.intp, (rindexer.dtype, rindexer) return join_index, lindexer, rindexer else: return join_index diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 17fb04f6c2f97..a9faf0098b6d4 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -834,9 +834,6 @@ def _maybe_add_join_keys( right_indexer: Optional[np.ndarray], ) -> None: - _assert_intp_or_none(left_indexer) - _assert_intp_or_none(right_indexer) - left_has_missing = None right_has_missing = None @@ -950,26 +947,18 @@ def _get_join_info( join_index, left_indexer, right_indexer = left_ax.join( right_ax, how=self.how, return_indexers=True, sort=self.sort ) - _assert_intp_or_none(left_indexer) - _assert_intp_or_none(right_indexer) elif self.right_index and self.how == "left": join_index, left_indexer, right_indexer = _left_join_on_index( left_ax, right_ax, self.left_join_keys, sort=self.sort ) - _assert_intp_or_none(left_indexer) - _assert_intp_or_none(right_indexer) elif self.left_index and self.how == "right": join_index, right_indexer, left_indexer = _left_join_on_index( right_ax, left_ax, self.right_join_keys, sort=self.sort ) - _assert_intp_or_none(left_indexer) - _assert_intp_or_none(right_indexer) else: (left_indexer, right_indexer) = self._get_join_indexers() - _assert_intp_or_none(left_indexer) - _assert_intp_or_none(right_indexer) if self.right_index: if len(self.left) > 0: @@ -2302,9 +2291,3 @@ def renamer(x, suffix): rrenamer = partial(renamer, suffix=rsuffix) return (left._transform_index(lrenamer), right._transform_index(rrenamer)) - - -def _assert_intp_or_none(obj: Optional[np.ndarray]) -> None: - if obj is not None: - assert isinstance(obj, np.ndarray) - assert obj.dtype == np.intp, (obj.dtype, obj)