From 942bdc0c5eb3c19ce32e9fb5acd0298667ab36b0 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 18 Jan 2023 10:32:57 -0800 Subject: [PATCH] BUG: merge with non-nano --- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/reshape/merge.py | 21 ++++++-- pandas/tests/reshape/merge/test_merge_asof.py | 54 ++++++++++++++----- 3 files changed, 59 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index ac4e8934570ce..1aedc3a31e3e7 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -316,7 +316,7 @@ def _unbox_scalar(self, value) -> np.timedelta64: raise ValueError("'value' should be a Timedelta.") self._check_compatible_with(value) if value is NaT: - return np.timedelta64(value.value, "ns") + return np.timedelta64(value.value, self.unit) else: return value.as_unit(self.unit).asm8 diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8d009d25a66ba..7d8d7a37ff7e7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -89,7 +89,10 @@ ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray import pandas.core.common as com -from pandas.core.construction import extract_array +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index from pandas.core.sorting import is_int64_overflow_possible @@ -2109,12 +2112,24 @@ def injection(obj): # initial type conversion as needed if needs_i8_conversion(left_values): - left_values = left_values.view("i8") - right_values = right_values.view("i8") if tolerance is not None: tolerance = Timedelta(tolerance) + + # TODO: we have no test cases with PeriodDtype here; probably + # need to adjust tolerance for that case. + if left_values.dtype.kind in ["m", "M"]: + # Make sure the i8 representation for tolerance + # matches that for left_values/right_values. + lvs = ensure_wrapped_if_datetimelike(left_values) + tolerance = tolerance.as_unit(lvs.unit) + tolerance = tolerance.value + # TODO: require left_values.dtype == right_values.dtype, or at least + # comparable for e.g. dt64tz + left_values = left_values.view("i8") + right_values = right_values.view("i8") + # a "by" parameter requires special handling if self.left_by is not None: # remove 'on' parameter from values if one existed diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 4123f686163d4..3b522eaa075f0 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -16,6 +16,14 @@ from pandas.core.reshape.merge import MergeError +@pytest.fixture(params=["s", "ms", "us", "ns"]) +def unit(request): + """ + Resolution for datetimelike dtypes. + """ + return request.param + + class TestAsOfMerge: def read_data(self, datapath, name, dedupe=False): path = datapath("reshape", "merge", "data", name) @@ -63,8 +71,13 @@ def test_examples1(self): result = merge_asof(left, right, on="a") tm.assert_frame_equal(result, expected) - def test_examples2(self): + def test_examples2(self, unit): """doc-string examples""" + if unit == "s": + pytest.skip( + "This test is invalid for unit='s' because that would " + "round the trades['time']]" + ) trades = pd.DataFrame( { "time": to_datetime( @@ -75,7 +88,7 @@ def test_examples2(self): "20160525 13:30:00.048", "20160525 13:30:00.048", ] - ), + ).astype(f"M8[{unit}]"), "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], "price": [51.95, 51.95, 720.77, 720.92, 98.00], "quantity": [75, 155, 100, 100, 100], @@ -96,7 +109,7 @@ def test_examples2(self): "20160525 13:30:00.072", "20160525 13:30:00.075", ] - ), + ).astype(f"M8[{unit}]"), "ticker": [ "GOOG", "MSFT", @@ -127,7 +140,7 @@ def test_examples2(self): "20160525 13:30:00.048", "20160525 13:30:00.048", ] - ), + ).astype(f"M8[{unit}]"), "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], "price": [51.95, 51.95, 720.77, 720.92, 98.00], "quantity": [75, 155, 100, 100, 100], @@ -639,7 +652,7 @@ def test_tolerance_nearest(self): result = merge_asof(left, right, on="a", direction="nearest", tolerance=1) tm.assert_frame_equal(result, expected) - def test_tolerance_tz(self): + def test_tolerance_tz(self, unit): # GH 14844 left = pd.DataFrame( { @@ -648,6 +661,7 @@ def test_tolerance_tz(self): freq="D", periods=5, tz=pytz.timezone("UTC"), + unit=unit, ), "value1": np.arange(5), } @@ -659,6 +673,7 @@ def test_tolerance_tz(self): freq="D", periods=5, tz=pytz.timezone("UTC"), + unit=unit, ), "value2": list("ABCDE"), } @@ -672,6 +687,7 @@ def test_tolerance_tz(self): freq="D", periods=5, tz=pytz.timezone("UTC"), + unit=unit, ), "value1": np.arange(5), "value2": list("BCDEE"), @@ -1314,22 +1330,27 @@ def test_by_mixed_tz_aware(self): expected["value_y"] = np.array([np.nan], dtype=object) tm.assert_frame_equal(result, expected) - def test_timedelta_tolerance_nearest(self): + def test_timedelta_tolerance_nearest(self, unit): # GH 27642 + if unit == "s": + pytest.skip( + "This test is invalid with unit='s' because that would " + "round left['time']" + ) left = pd.DataFrame( list(zip([0, 5, 10, 15, 20, 25], [0, 1, 2, 3, 4, 5])), columns=["time", "left"], ) - left["time"] = pd.to_timedelta(left["time"], "ms") + left["time"] = pd.to_timedelta(left["time"], "ms").astype(f"m8[{unit}]") right = pd.DataFrame( list(zip([0, 3, 9, 12, 15, 18], [0, 1, 2, 3, 4, 5])), columns=["time", "right"], ) - right["time"] = pd.to_timedelta(right["time"], "ms") + right["time"] = pd.to_timedelta(right["time"], "ms").astype(f"m8[{unit}]") expected = pd.DataFrame( list( @@ -1342,7 +1363,7 @@ def test_timedelta_tolerance_nearest(self): columns=["time", "left", "right"], ) - expected["time"] = pd.to_timedelta(expected["time"], "ms") + expected["time"] = pd.to_timedelta(expected["time"], "ms").astype(f"m8[{unit}]") result = merge_asof( left, right, on="time", tolerance=Timedelta("1ms"), direction="nearest" @@ -1400,12 +1421,17 @@ def test_merge_index_column_tz(self): ) tm.assert_frame_equal(result, expected) - def test_left_index_right_index_tolerance(self): + def test_left_index_right_index_tolerance(self, unit): # https://github.com/pandas-dev/pandas/issues/35558 - dr1 = pd.date_range(start="1/1/2020", end="1/20/2020", freq="2D") + Timedelta( - seconds=0.4 - ) - dr2 = pd.date_range(start="1/1/2020", end="2/1/2020") + if unit == "s": + pytest.skip( + "This test is invalid with unit='s' because that would round dr1" + ) + + dr1 = pd.date_range( + start="1/1/2020", end="1/20/2020", freq="2D", unit=unit + ) + Timedelta(seconds=0.4).as_unit(unit) + dr2 = pd.date_range(start="1/1/2020", end="2/1/2020", unit=unit) df1 = pd.DataFrame({"val1": "foo"}, index=pd.DatetimeIndex(dr1)) df2 = pd.DataFrame({"val2": "bar"}, index=pd.DatetimeIndex(dr2))