diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index b74f93942c411..805c5970409fc 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -296,7 +296,7 @@ Other Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) -- Performance improvement in :func:`merge_asof` when ``by`` contains more than one key (:issue:`55580`) +- Performance improvement in :func:`merge_asof` when ``by`` is not ``None`` (:issue:`55580`, :issue:`55678`) - Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`) - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) @@ -411,6 +411,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) +- Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi index c7761cbf8aba9..1d4e8c90bc559 100644 --- a/pandas/_libs/join.pyi +++ b/pandas/_libs/join.pyi @@ -53,8 +53,8 @@ def outer_join_indexer( def asof_join_backward_on_X_by_Y( left_values: np.ndarray, # ndarray[numeric_t] right_values: np.ndarray, # ndarray[numeric_t] - left_by_values: np.ndarray, # ndarray[by_t] - right_by_values: np.ndarray, # ndarray[by_t] + left_by_values: np.ndarray, # const int64_t[:] + right_by_values: np.ndarray, # const int64_t[:] allow_exact_matches: bool = ..., tolerance: np.number | float | None = ..., use_hashtable: bool = ..., @@ -62,8 +62,8 @@ def asof_join_backward_on_X_by_Y( def asof_join_forward_on_X_by_Y( left_values: np.ndarray, # ndarray[numeric_t] right_values: np.ndarray, # ndarray[numeric_t] - left_by_values: np.ndarray, # ndarray[by_t] - right_by_values: np.ndarray, # ndarray[by_t] + left_by_values: np.ndarray, # const int64_t[:] + right_by_values: np.ndarray, # const int64_t[:] allow_exact_matches: bool = ..., tolerance: np.number | float | None = ..., use_hashtable: bool = ..., @@ -71,8 +71,8 @@ def asof_join_forward_on_X_by_Y( def asof_join_nearest_on_X_by_Y( left_values: np.ndarray, # ndarray[numeric_t] right_values: np.ndarray, # ndarray[numeric_t] - left_by_values: np.ndarray, # ndarray[by_t] - right_by_values: np.ndarray, # ndarray[by_t] + left_by_values: np.ndarray, # const int64_t[:] + right_by_values: np.ndarray, # const int64_t[:] allow_exact_matches: bool = ..., tolerance: np.number | float | None = ..., use_hashtable: bool = ..., diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 385a089a8c59d..368abe60d7237 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -7,7 +7,6 @@ from numpy cimport ( int64_t, intp_t, ndarray, - uint64_t, ) cnp.import_array() @@ -679,23 +678,13 @@ def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] # asof_join_by # ---------------------------------------------------------------------- -from pandas._libs.hashtable cimport ( - HashTable, - Int64HashTable, - PyObjectHashTable, - UInt64HashTable, -) - -ctypedef fused by_t: - object - int64_t - uint64_t +from pandas._libs.hashtable cimport Int64HashTable def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values, ndarray[numeric_t] right_values, - ndarray[by_t] left_by_values, - ndarray[by_t] right_by_values, + const int64_t[:] left_by_values, + const int64_t[:] right_by_values, bint allow_exact_matches=True, tolerance=None, bint use_hashtable=True): @@ -706,8 +695,7 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values, bint has_tolerance = False numeric_t tolerance_ = 0 numeric_t diff = 0 - HashTable hash_table - by_t by_value + Int64HashTable hash_table # if we are using tolerance, set our objects if tolerance is not None: @@ -721,12 +709,7 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values, right_indexer = np.empty(left_size, dtype=np.intp) if use_hashtable: - if by_t is object: - hash_table = PyObjectHashTable(right_size) - elif by_t is int64_t: - hash_table = Int64HashTable(right_size) - elif by_t is uint64_t: - hash_table = UInt64HashTable(right_size) + hash_table = Int64HashTable(right_size) right_pos = 0 for left_pos in range(left_size): @@ -771,8 +754,8 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values, def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values, ndarray[numeric_t] right_values, - ndarray[by_t] left_by_values, - ndarray[by_t] right_by_values, + const int64_t[:] left_by_values, + const int64_t[:] right_by_values, bint allow_exact_matches=1, tolerance=None, bint use_hashtable=True): @@ -783,8 +766,7 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values, bint has_tolerance = False numeric_t tolerance_ = 0 numeric_t diff = 0 - HashTable hash_table - by_t by_value + Int64HashTable hash_table # if we are using tolerance, set our objects if tolerance is not None: @@ -798,12 +780,7 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values, right_indexer = np.empty(left_size, dtype=np.intp) if use_hashtable: - if by_t is object: - hash_table = PyObjectHashTable(right_size) - elif by_t is int64_t: - hash_table = Int64HashTable(right_size) - elif by_t is uint64_t: - hash_table = UInt64HashTable(right_size) + hash_table = Int64HashTable(right_size) right_pos = right_size - 1 for left_pos in range(left_size - 1, -1, -1): @@ -849,8 +826,8 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values, def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values, ndarray[numeric_t] right_values, - ndarray[by_t] left_by_values, - ndarray[by_t] right_by_values, + const int64_t[:] left_by_values, + const int64_t[:] right_by_values, bint allow_exact_matches=True, tolerance=None, bint use_hashtable=True): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8d826ff6ae0d8..8bddde9c05dad 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2153,54 +2153,37 @@ def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp] if self.left_by is not None: # remove 'on' parameter from values if one existed if self.left_index and self.right_index: - left_by_values = self.left_join_keys - right_by_values = self.right_join_keys + left_join_keys = self.left_join_keys + right_join_keys = self.right_join_keys else: - left_by_values = self.left_join_keys[0:-1] - right_by_values = self.right_join_keys[0:-1] - - # get tuple representation of values if more than one - if len(left_by_values) == 1: - lbv = left_by_values[0] - rbv = right_by_values[0] - - # TODO: conversions for EAs that can be no-copy. - lbv = np.asarray(lbv) - rbv = np.asarray(rbv) - if needs_i8_conversion(lbv.dtype): - lbv = lbv.view("i8") - if needs_i8_conversion(rbv.dtype): - rbv = rbv.view("i8") + left_join_keys = self.left_join_keys[0:-1] + right_join_keys = self.right_join_keys[0:-1] + + mapped = [ + _factorize_keys( + left_join_keys[n], + right_join_keys[n], + sort=False, + how="left", + ) + for n in range(len(left_join_keys)) + ] + + if len(left_join_keys) == 1: + left_by_values = mapped[0][0] + right_by_values = mapped[0][1] else: - # We get here with non-ndarrays in test_merge_by_col_tz_aware - # and test_merge_groupby_multiple_column_with_categorical_column - mapped = [ - _factorize_keys( - left_by_values[n], - right_by_values[n], - sort=False, - how="left", - ) - for n in range(len(left_by_values)) - ] arrs = [np.concatenate(m[:2]) for m in mapped] shape = tuple(m[2] for m in mapped) group_index = get_group_index( arrs, shape=shape, sort=False, xnull=False ) - left_len = len(left_by_values[0]) - lbv = group_index[:left_len] - rbv = group_index[left_len:] - # error: Incompatible types in assignment (expression has type - # "Union[ndarray[Any, dtype[Any]], ndarray[Any, dtype[object_]]]", - # variable has type "List[Union[Union[ExtensionArray, - # ndarray[Any, Any]], Index, Series]]") - right_by_values = rbv # type: ignore[assignment] - # error: Incompatible types in assignment (expression has type - # "Union[ndarray[Any, dtype[Any]], ndarray[Any, dtype[object_]]]", - # variable has type "List[Union[Union[ExtensionArray, - # ndarray[Any, Any]], Index, Series]]") - left_by_values = lbv # type: ignore[assignment] + left_len = len(left_join_keys[0]) + left_by_values = group_index[:left_len] + right_by_values = group_index[left_len:] + + left_by_values = ensure_int64(left_by_values) + right_by_values = ensure_int64(right_by_values) # choose appropriate function by type func = _asof_by_function(self.direction) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index d7198aa0e3500..50ead8747fa2f 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1345,9 +1345,9 @@ def test_by_mixed_tz_aware(self): expected["value_y"] = np.array([np.nan], dtype=object) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[us]"]) - def test_by_datelike(self, dtype): - # GH 55453 + @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"]) + def test_by_dtype(self, dtype): + # GH 55453, GH 22794 left = pd.DataFrame( { "by_col": np.array([1], dtype=dtype),