REF: merge_asof dont use values_for_argsort (#45475)

jbrockmendel · web-flow · commit 1fd31bdbf170 · 2022-01-25T12:31:06.000-05:00
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -76,6 +76,7 @@
 from pandas.core import groupby
 import pandas.core.algorithms as algos
 from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
 import pandas.core.common as com
 from pandas.core.construction import extract_array
 from pandas.core.frame import _merge_doc
@@ -1906,14 +1907,27 @@ def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]
 
         def flip(xs) -> np.ndarray:
             """unlike np.transpose, this returns an array of tuples"""
-            # error: Item "ndarray" of "Union[Any, Union[ExtensionArray, ndarray]]" has
-            # no attribute "_values_for_argsort"
-            xs = [
-                x
-                if not is_extension_array_dtype(x)
-                else extract_array(x)._values_for_argsort()  # type: ignore[union-attr]
-                for x in xs
-            ]
+
+            def injection(obj):
+                if not is_extension_array_dtype(obj):
+                    # ndarray
+                    return obj
+                obj = extract_array(obj)
+                if isinstance(obj, NDArrayBackedExtensionArray):
+                    # fastpath for e.g. dt64tz, categorical
+                    return obj._ndarray
+                # FIXME: returning obj._values_for_argsort() here doesn't
+                #  break in any existing test cases, but i (@jbrockmendel)
+                #  am pretty sure it should!
+                #  e.g.
+                #  arr = pd.array([0, pd.NA, 255], dtype="UInt8")
+                #  will have values_for_argsort (before GH#45434)
+                #  np.array([0, 255, 255], dtype=np.uint8)
+                #  and the non-injectivity should make a difference somehow
+                #  shouldn't it?
+                return np.asarray(obj)
+
+            xs = [injection(x) for x in xs]
             labels = list(string.ascii_lowercase[: len(xs)])
             dtypes = [x.dtype for x in xs]
             labeled_dtypes = list(zip(labels, dtypes))
@@ -1966,6 +1980,8 @@ def flip(xs) -> np.ndarray:
                 left_by_values = left_by_values[0]
                 right_by_values = right_by_values[0]
             else:
+                # We get here with non-ndarrays in test_merge_by_col_tz_aware
+                #  and test_merge_groupby_multiple_column_with_categorical_column
                 left_by_values = flip(left_by_values)
                 right_by_values = flip(right_by_values)
 
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -1224,6 +1224,50 @@ def test_merge_on_nans(self, func, side):
             else:
                 merge_asof(df, df_null, on="a")
 
+    def test_by_nullable(self, any_numeric_ea_dtype):
+        # Note: this test passes if instead of using pd.array we use
+        #  np.array([np.nan, 1]).  Other than that, I (@jbrockmendel)
+        #  have NO IDEA what the expected behavior is.
+        # TODO(GH#32306): may be relevant to the expected behavior here.
+
+        arr = pd.array([pd.NA, 0, 1], dtype=any_numeric_ea_dtype)
+        if arr.dtype.kind in ["i", "u"]:
+            max_val = np.iinfo(arr.dtype.numpy_dtype).max
+        else:
+            max_val = np.finfo(arr.dtype.numpy_dtype).max
+        # set value s.t. (at least for integer dtypes) arr._values_for_argsort
+        #  is not an injection
+        arr[2] = max_val
+
+        left = pd.DataFrame(
+            {
+                "by_col1": arr,
+                "by_col2": ["HELLO", "To", "You"],
+                "on_col": [2, 4, 6],
+                "value": ["a", "c", "e"],
+            }
+        )
+        right = pd.DataFrame(
+            {
+                "by_col1": arr,
+                "by_col2": ["WORLD", "Wide", "Web"],
+                "on_col": [1, 2, 6],
+                "value": ["b", "d", "f"],
+            }
+        )
+
+        result = merge_asof(left, right, by=["by_col1", "by_col2"], on="on_col")
+        expected = pd.DataFrame(
+            {
+                "by_col1": arr,
+                "by_col2": ["HELLO", "To", "You"],
+                "on_col": [2, 4, 6],
+                "value_x": ["a", "c", "e"],
+            }
+        )
+        expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object)
+        tm.assert_frame_equal(result, expected)
+
     def test_merge_by_col_tz_aware(self):
         # GH 21184
         left = pd.DataFrame(
@@ -1309,6 +1353,7 @@ def test_timedelta_tolerance_nearest(self):
 
         tm.assert_frame_equal(result, expected)
 
+    # TODO: any_int_dtype; causes failures in _get_join_indexers
     def test_int_type_tolerance(self, any_int_numpy_dtype):
         # GH #28870