From 34d65b609b029ecb704ebda7af87b4eba72c4e18 Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Tue, 13 Jun 2023 14:52:42 +0800 Subject: [PATCH 1/7] BUG: pd.concat dataframes with different datetime64 resolutions --- pandas/core/dtypes/concat.py | 8 +++++++- pandas/tests/reshape/concat/test_concat.py | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index cba7c44a219bf..acd87eda5f31c 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -115,7 +115,13 @@ def concat_compat( # i.e. isinstance(to_concat[0], ExtensionArray) to_concat_eas = cast("Sequence[ExtensionArray]", to_concat) cls = type(to_concat[0]) - return cls._concat_same_type(to_concat_eas) + # GH#53640: eg. for datetime array, axis=1 but 0 is default + # However, class method `_concat_same_type()` for some classes + # may not support the keyword argument `axis` + if axis == 0: + return cls._concat_same_type(to_concat_eas) + else: + return cls._concat_same_type(to_concat_eas, axis=axis) else: to_concat_arrs = cast("Sequence[np.ndarray]", to_concat) result = np.concatenate(to_concat_arrs, axis=axis) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index dc14e6e74302e..94f4f7c9842a1 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -826,3 +826,17 @@ def test_concat_mismatched_keys_length(): concat((x for x in sers), keys=(y for y in keys), axis=1) with tm.assert_produces_warning(FutureWarning, match=msg): concat((x for x in sers), keys=(y for y in keys), axis=0) + + +def test_concat_datetime64_diff_resolution(): + # GH#53640 + df1 = DataFrame({"a": [0, 1], "b": [4, 5]}, dtype="datetime64[s]") + df2 = DataFrame({"a": [2, 3], "b": [6, 7]}, dtype="datetime64[ms]") + result = concat([df1, df2]) + + expected = DataFrame( + data={"a": [0, 1000, 2, 3], "b": [4000, 5000, 6, 7]}, + index=[0, 1, 0, 1], + dtype="datetime64[ms]", + ) + tm.assert_frame_equal(result, expected) From 6b14b402b5e6571324442d3b52ddc0c6a4f17722 Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Tue, 13 Jun 2023 16:02:59 +0800 Subject: [PATCH 2/7] resolve mypy --- pandas/core/dtypes/concat.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index acd87eda5f31c..fd28c06064030 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -117,11 +117,14 @@ def concat_compat( cls = type(to_concat[0]) # GH#53640: eg. for datetime array, axis=1 but 0 is default # However, class method `_concat_same_type()` for some classes - # may not support the keyword argument `axis` - if axis == 0: + # may not support the `axis` keyword + if ea_compat_axis or axis == 0: return cls._concat_same_type(to_concat_eas) else: - return cls._concat_same_type(to_concat_eas, axis=axis) + return cls._concat_same_type( + to_concat_eas, + axis=axis, # type: ignore[call-arg] + ) else: to_concat_arrs = cast("Sequence[np.ndarray]", to_concat) result = np.concatenate(to_concat_arrs, axis=axis) From 4c9c7d1948793bda985693b4587d06b314ae861a Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Tue, 13 Jun 2023 16:20:49 +0800 Subject: [PATCH 3/7] retrigger checks From b16024d39eb210fed36203295c5801bdb51c15cd Mon Sep 17 00:00:00 2001 From: Yao Xiao <108576690+Charlie-XIAO@users.noreply.github.com> Date: Thu, 26 Oct 2023 01:00:06 +0800 Subject: [PATCH 4/7] parametrize test --- pandas/tests/reshape/concat/test_concat.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 9aca9ee26fd07..b5e73842f1d88 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -5,6 +5,7 @@ from collections.abc import Iterator from datetime import datetime from decimal import Decimal +from itertools import permutations import numpy as np import pytest @@ -869,15 +870,17 @@ def test_concat_ea_upcast(): tm.assert_frame_equal(result, expected) -def test_concat_datetime64_diff_resolution(): +@pytest.mark.parametrize( + "dtype1,dtype2", + permutations( + ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], 2 + ), +) +def test_concat_different_datetime_resolution(dtype1, dtype2): # GH#53640 - df1 = DataFrame({"a": [0, 1], "b": [4, 5]}, dtype="datetime64[s]") - df2 = DataFrame({"a": [2, 3], "b": [6, 7]}, dtype="datetime64[ms]") + df1 = DataFrame(np.random.default_rng(2).standard_normal((3, 4)), dtype=dtype1) + df2 = DataFrame(np.random.default_rng(2).standard_normal((4, 4)), dtype=dtype2) result = concat([df1, df2]) - expected = DataFrame( - data={"a": [0, 1000, 2, 3], "b": [4000, 5000, 6, 7]}, - index=[0, 1, 0, 1], - dtype="datetime64[ms]", - ) + expected = DataFrame(np.r_[df1.values, df2.values], index=[0, 1, 2, 0, 1, 2, 3]) tm.assert_frame_equal(result, expected) From 5865480b07723d6f07117969ba80c74006b2ffa0 Mon Sep 17 00:00:00 2001 From: Yao Xiao <108576690+Charlie-XIAO@users.noreply.github.com> Date: Fri, 27 Oct 2023 21:34:05 +0800 Subject: [PATCH 5/7] apply suggestion of jbrockmendel --- pandas/tests/reshape/concat/test_concat.py | 17 ------------- pandas/tests/reshape/concat/test_datetimes.py | 24 +++++++++++++++---- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 7478b8f5280fb..216422bc12dc2 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -5,7 +5,6 @@ from collections.abc import Iterator from datetime import datetime from decimal import Decimal -from itertools import permutations import numpy as np import pytest @@ -882,19 +881,3 @@ def test_concat_none_with_timezone_timestamp(): result = concat([df1, df2], ignore_index=True) expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]}) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype1,dtype2", - permutations( - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], 2 - ), -) -def test_concat_different_datetime_resolution(dtype1, dtype2): - # GH#53640 - df1 = DataFrame(np.random.default_rng(2).standard_normal((3, 4)), dtype=dtype1) - df2 = DataFrame(np.random.default_rng(2).standard_normal((4, 4)), dtype=dtype2) - result = concat([df1, df2]) - - expected = DataFrame(np.r_[df1.values, df2.values], index=[0, 1, 2, 0, 1, 2, 3]) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 71606fb72c0f6..4eb2177d85da4 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -19,6 +19,22 @@ ) import pandas._testing as tm +UNITS = ["s", "ms", "us", "ns"] + + +@pytest.fixture(params=UNITS) +def unit(request): + return request.param + + +unit2 = unit + + +def _get_finer_unit(unit, unit2): + if UNITS.index(unit) >= UNITS.index(unit2): + return unit + return unit2 + class TestDatetimeConcat: def test_concat_datetime64_block(self): @@ -307,17 +323,17 @@ def test_concat_tz_series2(self): result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) - def test_concat_tz_series3(self): + def test_concat_tz_series3(self, unit, unit2): # see gh-12217 and gh-12306 # Concatenating two UTC times - first = DataFrame([[datetime(2016, 1, 1)]]) + first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]") first[0] = first[0].dt.tz_localize("UTC") - second = DataFrame([[datetime(2016, 1, 2)]]) + second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]") second[0] = second[0].dt.tz_localize("UTC") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, UTC]" + assert result[0].dtype == f"datetime64[{_get_finer_unit(unit, unit2)}, UTC]" def test_concat_tz_series4(self): # Concatenating two London times From 555106b514105c1eba1c0ec9c5df3bdce6007a58 Mon Sep 17 00:00:00 2001 From: Yao Xiao <108576690+Charlie-XIAO@users.noreply.github.com> Date: Fri, 27 Oct 2023 22:08:02 +0800 Subject: [PATCH 6/7] add parametrization for test_concat_tz_series* --- pandas/tests/reshape/concat/test_datetimes.py | 57 +++++++++++++------ 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 4eb2177d85da4..22e79dd5000cf 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -307,17 +307,25 @@ def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2): class TestTimezoneConcat: - def test_concat_tz_series(self): + def test_concat_tz_series(self, unit, unit2): # gh-11755: tz and no tz - x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) - y = Series(date_range("2012-01-01", "2012-01-02")) + x = Series( + date_range( + "20151124 08:00", "20151124 09:00", freq="1h", tz="UTC", unit=unit + ) + ) + y = Series(date_range("2012-01-01", "2012-01-02", unit=unit2)) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) - def test_concat_tz_series2(self): + def test_concat_tz_series2(self, unit): # gh-11887: concat tz and object - x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) + x = Series( + date_range( + "20151124 08:00", "20151124 09:00", freq="1h", tz="UTC", unit=unit + ) + ) y = Series(["a", "b"]) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) @@ -335,38 +343,51 @@ def test_concat_tz_series3(self, unit, unit2): result = concat([first, second]) assert result[0].dtype == f"datetime64[{_get_finer_unit(unit, unit2)}, UTC]" - def test_concat_tz_series4(self): + def test_concat_tz_series4(self, unit, unit2): # Concatenating two London times - first = DataFrame([[datetime(2016, 1, 1)]]) + first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]") first[0] = first[0].dt.tz_localize("Europe/London") - second = DataFrame([[datetime(2016, 1, 2)]]) + second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]") second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" + assert ( + result[0].dtype + == f"datetime64[{_get_finer_unit(unit, unit2)}, Europe/London]" + ) - def test_concat_tz_series5(self): + def test_concat_tz_series5(self, unit, unit2): # Concatenating 2+1 London times - first = DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) + first = DataFrame( + [[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]], dtype=f"M8[{unit}]" + ) first[0] = first[0].dt.tz_localize("Europe/London") - second = DataFrame([[datetime(2016, 1, 3)]]) + second = DataFrame([[datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]") second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" + assert ( + result[0].dtype + == f"datetime64[{_get_finer_unit(unit, unit2)}, Europe/London]" + ) - def test_concat_tz_series6(self): - # Concat'ing 1+2 London times - first = DataFrame([[datetime(2016, 1, 1)]]) + def test_concat_tz_series6(self, unit, unit2): + # Concatenaing 1+2 London times + first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]") first[0] = first[0].dt.tz_localize("Europe/London") - second = DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) + second = DataFrame( + [[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]" + ) second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" + assert ( + result[0].dtype + == f"datetime64[{_get_finer_unit(unit, unit2)}, Europe/London]" + ) def test_concat_tz_series_tzlocal(self): # see gh-13583 From 571c23cee7dccbb22b78f50e1bb53a3d55cb0eab Mon Sep 17 00:00:00 2001 From: Yao Xiao <108576690+Charlie-XIAO@users.noreply.github.com> Date: Sat, 28 Oct 2023 11:15:30 +0800 Subject: [PATCH 7/7] apply suggested changes by jbrockmendel --- pandas/tests/reshape/concat/test_datetimes.py | 41 +++++++------------ 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 22e79dd5000cf..fe8e243919d05 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -307,25 +307,17 @@ def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2): class TestTimezoneConcat: - def test_concat_tz_series(self, unit, unit2): + def test_concat_tz_series(self): # gh-11755: tz and no tz - x = Series( - date_range( - "20151124 08:00", "20151124 09:00", freq="1h", tz="UTC", unit=unit - ) - ) - y = Series(date_range("2012-01-01", "2012-01-02", unit=unit2)) + x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) + y = Series(date_range("2012-01-01", "2012-01-02")) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) - def test_concat_tz_series2(self, unit): + def test_concat_tz_series2(self): # gh-11887: concat tz and object - x = Series( - date_range( - "20151124 08:00", "20151124 09:00", freq="1h", tz="UTC", unit=unit - ) - ) + x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) y = Series(["a", "b"]) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) @@ -341,7 +333,8 @@ def test_concat_tz_series3(self, unit, unit2): second[0] = second[0].dt.tz_localize("UTC") result = concat([first, second]) - assert result[0].dtype == f"datetime64[{_get_finer_unit(unit, unit2)}, UTC]" + exp_unit = _get_finer_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, UTC]" def test_concat_tz_series4(self, unit, unit2): # Concatenating two London times @@ -352,10 +345,8 @@ def test_concat_tz_series4(self, unit, unit2): second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert ( - result[0].dtype - == f"datetime64[{_get_finer_unit(unit, unit2)}, Europe/London]" - ) + exp_unit = _get_finer_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]" def test_concat_tz_series5(self, unit, unit2): # Concatenating 2+1 London times @@ -368,13 +359,11 @@ def test_concat_tz_series5(self, unit, unit2): second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert ( - result[0].dtype - == f"datetime64[{_get_finer_unit(unit, unit2)}, Europe/London]" - ) + exp_unit = _get_finer_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]" def test_concat_tz_series6(self, unit, unit2): - # Concatenaing 1+2 London times + # Concatenating 1+2 London times first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]") first[0] = first[0].dt.tz_localize("Europe/London") @@ -384,10 +373,8 @@ def test_concat_tz_series6(self, unit, unit2): second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert ( - result[0].dtype - == f"datetime64[{_get_finer_unit(unit, unit2)}, Europe/London]" - ) + exp_unit = _get_finer_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]" def test_concat_tz_series_tzlocal(self): # see gh-13583