From 205f34266e4c400d775d3e7f78465f45dc2a8af4 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 3 Dec 2023 23:03:32 -0500 Subject: [PATCH 1/2] BUG: merge should upcast to highest resolution --- doc/source/whatsnew/v2.1.4.rst | 1 + pandas/core/reshape/merge.py | 10 +--------- pandas/tests/reshape/merge/test_merge.py | 25 ++++++++++++++++++------ 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 77ce303dc1bfe..f434561d5f1c0 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -24,6 +24,7 @@ Bug fixes - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`DataFrame.merge` not being able to join on ``datetime64`` columns or differing resolutions (:issue:`55212`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f8575b1b53908..49c5e61799627 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1073,16 +1073,8 @@ def _maybe_add_join_keys( key_col = Index(lvals) result_dtype = lvals.dtype else: - key_col = Index(lvals).where(~mask_left, rvals) result_dtype = find_common_type([lvals.dtype, rvals.dtype]) - if ( - lvals.dtype.kind == "M" - and rvals.dtype.kind == "M" - and result_dtype.kind == "O" - ): - # TODO(non-nano) Workaround for common_type not dealing - # with different resolutions - result_dtype = key_col.dtype + key_col = Index(lvals).astype(result_dtype).where(~mask_left, rvals) if result._is_label_reference(name): result[name] = result._constructor_sliced( diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7538894bbf1c9..85ebdfe388512 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2818,7 +2818,8 @@ def test_merge_arrow_and_numpy_dtypes(dtype): @pytest.mark.parametrize("how", ["inner", "left", "outer", "right"]) @pytest.mark.parametrize("tz", [None, "America/Chicago"]) -def test_merge_datetime_different_resolution(tz, how): +@pytest.mark.parametrize("unit", ["us", "ms", "s"]) +def test_merge_datetime_different_resolution(tz, how, unit): # https://github.com/pandas-dev/pandas/issues/53200 vals = [ pd.Timestamp(2023, 5, 12, tz=tz), @@ -2828,19 +2829,31 @@ def test_merge_datetime_different_resolution(tz, how): df1 = DataFrame({"t": vals[:2], "a": [1.0, 2.0]}) df1["t"] = df1["t"].dt.as_unit("ns") df2 = DataFrame({"t": vals[1:], "b": [1.0, 2.0]}) - df2["t"] = df2["t"].dt.as_unit("s") + df2["t"] = df2["t"].dt.as_unit(unit) expected = DataFrame({"t": vals, "a": [1.0, 2.0, np.nan], "b": [np.nan, 1.0, 2.0]}) expected["t"] = expected["t"].dt.as_unit("ns") if how == "inner": - expected = expected.iloc[[1]].reset_index(drop=True) + expected1 = expected.iloc[[1]].reset_index(drop=True) + expected2 = expected1 elif how == "left": - expected = expected.iloc[[0, 1]] + expected1 = expected.iloc[[0, 1]] + expected2 = expected.iloc[[1, 2]].reset_index(drop=True) elif how == "right": - expected = expected.iloc[[1, 2]].reset_index(drop=True) + expected1 = expected.iloc[[1, 2]].reset_index(drop=True) + expected2 = expected.iloc[[0, 1]] + else: + expected1 = expected + expected2 = expected result = df1.merge(df2, on="t", how=how) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected1) + + # Check lower resolution to higher resolution also works + # GH55212 + expected2 = expected2[["t", "b", "a"]] + result1 = df2.merge(df1, on="t", how=how) + tm.assert_frame_equal(result1, expected2) def test_merge_multiindex_single_level(): From bbacf84640cbf371af0cba62fad59693f0c42ffb Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 1 Jan 2024 08:08:11 -0800 Subject: [PATCH 2/2] move whatsnew --- doc/source/whatsnew/v2.1.4.rst | 1 - doc/source/whatsnew/v2.2.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 413711d4552a6..57b83a294963b 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -28,7 +28,6 @@ Bug fixes - Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`) - Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- Fixed bug in :meth:`DataFrame.merge` not being able to join on ``datetime64`` columns or differing resolutions (:issue:`55212`) - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 129f5cedb86c2..ed0c5d4169f00 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -761,6 +761,7 @@ Datetimelike - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) - Bug in the results of :func:`to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) +- Fixed bug in :meth:`DataFrame.merge` not being able to join on ``datetime64`` columns of differing resolutions (:issue:`55212`) Timedelta ^^^^^^^^^