From 7fe4d103389f31651c081efb6cd589d157c2ca38 Mon Sep 17 00:00:00 2001 From: julian048 Date: Tue, 1 Aug 2023 10:59:27 -0400 Subject: [PATCH 1/6] add pyarrow conversion pytests --- pandas/tests/dtypes/test_dtypes.py | 278 +++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 07a4e06db2e74..e90a868aad2d7 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1,6 +1,7 @@ import re import numpy as np +import pyarrow as pa import pytest import pytz @@ -32,9 +33,14 @@ CategoricalIndex, DatetimeIndex, IntervalIndex, + NaT, Series, SparseDtype, + Timedelta, + Timestamp, + concat, date_range, + timedelta_range, ) import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray @@ -1193,3 +1199,275 @@ def test_multi_column_dtype_assignment(): df["b"] = 0 tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize( + "unit", + [ + "s", + "ms", + "us", + "ns", + ], +) +def test_convert_dtypes_timestamp(unit): + series = Series(date_range("2020-01-01", "2020-01-02", freq="1min")) + expected = series.astype(f"timestamp[{unit}][pyarrow]") + + converted = expected.convert_dtypes(dtype_backend="pyarrow") + + tm.assert_series_equal(expected, converted) + + +@pytest.mark.parametrize( + "unit", + [ + "s", + "ms", + "us", + "ns", + ], +) +def test_convert_dtypes_duration(unit): + series = Series(timedelta_range("1s", "10s", freq="1s")) + expected = series.astype(f"duration[{unit}][pyarrow]") + + converted = expected.convert_dtypes(dtype_backend="pyarrow") + + tm.assert_series_equal(expected, converted) + + +@pytest.mark.parametrize( + "timestamp_unit, duration_unit", + [ + ("s", "s"), + ("s", "ms"), + ("s", "us"), + ("s", "ns"), + ("ms", "s"), + ("ms", "ms"), + ("ms", "us"), + ("ms", "ns"), + ("us", "s"), + ("us", "ms"), + ("us", "us"), + ("us", "ns"), + ("ns", "s"), + ("ns", "ms"), + ("ns", "us"), + ("ns", "ns"), + ], +) +def test_convert_dtypes_timestamp_and_duration(timestamp_unit, duration_unit): + timestamp_series = Series( + date_range("2020-01-01", "2020-01-02", freq="1min") + ).astype(f"timestamp[{timestamp_unit}][pyarrow]") + duration_series = Series(timedelta_range("1s", "10s", freq="1s")).astype( + f"duration[{duration_unit}][pyarrow]" + ) + + df = concat([timestamp_series, duration_series], axis=1) + converted = df.convert_dtypes(dtype_backend="pyarrow") + + tm.assert_frame_equal(df, converted) + + +@pytest.mark.parametrize( + "unit", + [ + "s", + "ms", + "us", + "ns", + ], +) +def test_convert_dtypes_datetime(unit): + series = Series(date_range("2020-01-01", "2020-01-02", freq="1min")).astype( + f"datetime64[{unit}]" + ) + + expected = series.astype(f"timestamp[{unit}][pyarrow]") + converted = series.convert_dtypes(dtype_backend="pyarrow") + + tm.assert_series_equal(expected, converted) + + +@pytest.mark.parametrize( + "unit", + [ + "s", + "ms", + "us", + "ns", + ], +) +def test_convert_dtypes_timedelta(unit): + series = Series(timedelta_range("1s", "10s", freq="1s")).astype( + f"timedelta64[{unit}]" + ) + + expected = series.astype(f"duration[{unit}][pyarrow]") + converted = series.convert_dtypes(dtype_backend="pyarrow") + + tm.assert_series_equal(expected, converted) + + +@pytest.mark.parametrize( + "unit", + [ + "s", + "ms", + "us", + "ns", + ], +) +def test_pa_table_to_pandas_datetime(unit): + df = pd.DataFrame(date_range("2020-01-01", "2020-01-02", freq="1min")).astype( + f"datetime64[{unit}]" + ) + df_converted_to_pa = pa.table(df) + df_back_to_pd = df_converted_to_pa.to_pandas() + + tm.assert_frame_equal(df, df_back_to_pd) + + +@pytest.mark.parametrize( + "unit", + [ + "s", + "ms", + "us", + "ns", + ], +) +def test_pa_table_to_pandas_timedelta(unit): + df = pd.DataFrame(timedelta_range("1s", "10s", freq="1s")).astype( + f"timedelta64[{unit}]" + ) + df_converted_to_pa = pa.table(df) + df_back_to_pd = df_converted_to_pa.to_pandas() + + tm.assert_frame_equal(df, df_back_to_pd) + + +@pytest.mark.parametrize( + "datetime_unit, timedelta_unit", + [ + ("s", "s"), + ("s", "ms"), + ("s", "us"), + ("s", "ns"), + ("ms", "s"), + ("ms", "ms"), + ("ms", "us"), + ("ms", "ns"), + ("us", "s"), + ("us", "ms"), + ("us", "us"), + ("us", "ns"), + ("ns", "s"), + ("ns", "ms"), + ("ns", "us"), + ("ns", "ns"), + ], +) +def test_pa_table_and_to_pandas_datetime_and_timedelta(datetime_unit, timedelta_unit): + timestamp_series = Series( + date_range("2020-01-01", "2020-01-02", freq="1min") + ).astype(f"datetime64[{datetime_unit}]") + duration_series = Series(timedelta_range("1s", "10s", freq="1s")).astype( + f"timedelta64[{timedelta_unit}]" + ) + + df = concat([timestamp_series, duration_series], axis=1) + df_converted_to_pa = pa.table(df) + df_back_to_pd = df_converted_to_pa.to_pandas() + + tm.assert_frame_equal(df, df_back_to_pd) + + +@pytest.mark.parametrize( + "unit", + [ + "s", + "ms", + "us", + "ns", + ], +) +def test_pa_table_to_pandas_timestamp(unit): + df = pd.DataFrame(date_range("2020-01-01", "2020-01-02", freq="1min")).astype( + f"timestamp[{unit}][pyarrow]" + ) + df_converted_to_pa = pa.table(df) + df_back_to_pd = df_converted_to_pa.to_pandas() + + tm.assert_frame_equal(df, df_back_to_pd) + + +@pytest.mark.parametrize( + "unit", + [ + "s", + "ms", + "us", + "ns", + ], +) +def test_pa_table_to_pandas_duration(unit): + df = pd.DataFrame(timedelta_range("1s", "10s", freq="1s")).astype( + f"duration[{unit}][pyarrow]" + ) + df_converted_to_pa = pa.table(df) + df_back_to_pd = df_converted_to_pa.to_pandas() + + tm.assert_frame_equal(df, df_back_to_pd) + + +@pytest.mark.parametrize( + "timestamp_unit, duration_unit", + [ + ("s", "s"), + ("s", "ms"), + ("s", "us"), + ("s", "ns"), + ("ms", "s"), + ("ms", "ms"), + ("ms", "us"), + ("ms", "ns"), + ("us", "s"), + ("us", "ms"), + ("us", "us"), + ("us", "ns"), + ("ns", "s"), + ("ns", "ms"), + ("ns", "us"), + ("ns", "ns"), + ], +) +def test_pa_table_and_to_pandas_timestamp_and_duration(timestamp_unit, duration_unit): + timestamp_series = Series( + date_range("2020-01-01", "2020-01-02", freq="1min") + ).astype(f"timestamp[{timestamp_unit}][pyarrow]") + duration_series = Series(timedelta_range("1s", "10s", freq="1s")).astype( + f"duration[{duration_unit}][pyarrow]" + ) + + df = concat([timestamp_series, duration_series], axis=1) + df_converted_to_pa = pa.table(df) + df_back_to_pd = df_converted_to_pa.to_pandas() + + tm.assert_frame_equal(df, df_back_to_pd) + + +def test_conversion_with_missing_values(): + df = pd.DataFrame( + { + "timestamp_col": [Timestamp("2020-01-01"), NaT], + "duration_col": [Timedelta("1s"), NaT], + } + ) + df_coverted_to_pa = pa.table(df) + df_back_to_pd = df_coverted_to_pa.to_pandas() + + tm.assert_frame_equal(df, df_back_to_pd) From eca3b3fd02155a29a78d61823cf1da7b80e1d85d Mon Sep 17 00:00:00 2001 From: julian048 Date: Tue, 1 Aug 2023 17:54:16 -0400 Subject: [PATCH 2/6] make test match GH issue and move test to proper location --- pandas/tests/dtypes/test_dtypes.py | 278 ------------------ .../frame/methods/test_convert_dtypes.py | 15 + 2 files changed, 15 insertions(+), 278 deletions(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index e90a868aad2d7..07a4e06db2e74 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1,7 +1,6 @@ import re import numpy as np -import pyarrow as pa import pytest import pytz @@ -33,14 +32,9 @@ CategoricalIndex, DatetimeIndex, IntervalIndex, - NaT, Series, SparseDtype, - Timedelta, - Timestamp, - concat, date_range, - timedelta_range, ) import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray @@ -1199,275 +1193,3 @@ def test_multi_column_dtype_assignment(): df["b"] = 0 tm.assert_frame_equal(df, expected) - - -@pytest.mark.parametrize( - "unit", - [ - "s", - "ms", - "us", - "ns", - ], -) -def test_convert_dtypes_timestamp(unit): - series = Series(date_range("2020-01-01", "2020-01-02", freq="1min")) - expected = series.astype(f"timestamp[{unit}][pyarrow]") - - converted = expected.convert_dtypes(dtype_backend="pyarrow") - - tm.assert_series_equal(expected, converted) - - -@pytest.mark.parametrize( - "unit", - [ - "s", - "ms", - "us", - "ns", - ], -) -def test_convert_dtypes_duration(unit): - series = Series(timedelta_range("1s", "10s", freq="1s")) - expected = series.astype(f"duration[{unit}][pyarrow]") - - converted = expected.convert_dtypes(dtype_backend="pyarrow") - - tm.assert_series_equal(expected, converted) - - -@pytest.mark.parametrize( - "timestamp_unit, duration_unit", - [ - ("s", "s"), - ("s", "ms"), - ("s", "us"), - ("s", "ns"), - ("ms", "s"), - ("ms", "ms"), - ("ms", "us"), - ("ms", "ns"), - ("us", "s"), - ("us", "ms"), - ("us", "us"), - ("us", "ns"), - ("ns", "s"), - ("ns", "ms"), - ("ns", "us"), - ("ns", "ns"), - ], -) -def test_convert_dtypes_timestamp_and_duration(timestamp_unit, duration_unit): - timestamp_series = Series( - date_range("2020-01-01", "2020-01-02", freq="1min") - ).astype(f"timestamp[{timestamp_unit}][pyarrow]") - duration_series = Series(timedelta_range("1s", "10s", freq="1s")).astype( - f"duration[{duration_unit}][pyarrow]" - ) - - df = concat([timestamp_series, duration_series], axis=1) - converted = df.convert_dtypes(dtype_backend="pyarrow") - - tm.assert_frame_equal(df, converted) - - -@pytest.mark.parametrize( - "unit", - [ - "s", - "ms", - "us", - "ns", - ], -) -def test_convert_dtypes_datetime(unit): - series = Series(date_range("2020-01-01", "2020-01-02", freq="1min")).astype( - f"datetime64[{unit}]" - ) - - expected = series.astype(f"timestamp[{unit}][pyarrow]") - converted = series.convert_dtypes(dtype_backend="pyarrow") - - tm.assert_series_equal(expected, converted) - - -@pytest.mark.parametrize( - "unit", - [ - "s", - "ms", - "us", - "ns", - ], -) -def test_convert_dtypes_timedelta(unit): - series = Series(timedelta_range("1s", "10s", freq="1s")).astype( - f"timedelta64[{unit}]" - ) - - expected = series.astype(f"duration[{unit}][pyarrow]") - converted = series.convert_dtypes(dtype_backend="pyarrow") - - tm.assert_series_equal(expected, converted) - - -@pytest.mark.parametrize( - "unit", - [ - "s", - "ms", - "us", - "ns", - ], -) -def test_pa_table_to_pandas_datetime(unit): - df = pd.DataFrame(date_range("2020-01-01", "2020-01-02", freq="1min")).astype( - f"datetime64[{unit}]" - ) - df_converted_to_pa = pa.table(df) - df_back_to_pd = df_converted_to_pa.to_pandas() - - tm.assert_frame_equal(df, df_back_to_pd) - - -@pytest.mark.parametrize( - "unit", - [ - "s", - "ms", - "us", - "ns", - ], -) -def test_pa_table_to_pandas_timedelta(unit): - df = pd.DataFrame(timedelta_range("1s", "10s", freq="1s")).astype( - f"timedelta64[{unit}]" - ) - df_converted_to_pa = pa.table(df) - df_back_to_pd = df_converted_to_pa.to_pandas() - - tm.assert_frame_equal(df, df_back_to_pd) - - -@pytest.mark.parametrize( - "datetime_unit, timedelta_unit", - [ - ("s", "s"), - ("s", "ms"), - ("s", "us"), - ("s", "ns"), - ("ms", "s"), - ("ms", "ms"), - ("ms", "us"), - ("ms", "ns"), - ("us", "s"), - ("us", "ms"), - ("us", "us"), - ("us", "ns"), - ("ns", "s"), - ("ns", "ms"), - ("ns", "us"), - ("ns", "ns"), - ], -) -def test_pa_table_and_to_pandas_datetime_and_timedelta(datetime_unit, timedelta_unit): - timestamp_series = Series( - date_range("2020-01-01", "2020-01-02", freq="1min") - ).astype(f"datetime64[{datetime_unit}]") - duration_series = Series(timedelta_range("1s", "10s", freq="1s")).astype( - f"timedelta64[{timedelta_unit}]" - ) - - df = concat([timestamp_series, duration_series], axis=1) - df_converted_to_pa = pa.table(df) - df_back_to_pd = df_converted_to_pa.to_pandas() - - tm.assert_frame_equal(df, df_back_to_pd) - - -@pytest.mark.parametrize( - "unit", - [ - "s", - "ms", - "us", - "ns", - ], -) -def test_pa_table_to_pandas_timestamp(unit): - df = pd.DataFrame(date_range("2020-01-01", "2020-01-02", freq="1min")).astype( - f"timestamp[{unit}][pyarrow]" - ) - df_converted_to_pa = pa.table(df) - df_back_to_pd = df_converted_to_pa.to_pandas() - - tm.assert_frame_equal(df, df_back_to_pd) - - -@pytest.mark.parametrize( - "unit", - [ - "s", - "ms", - "us", - "ns", - ], -) -def test_pa_table_to_pandas_duration(unit): - df = pd.DataFrame(timedelta_range("1s", "10s", freq="1s")).astype( - f"duration[{unit}][pyarrow]" - ) - df_converted_to_pa = pa.table(df) - df_back_to_pd = df_converted_to_pa.to_pandas() - - tm.assert_frame_equal(df, df_back_to_pd) - - -@pytest.mark.parametrize( - "timestamp_unit, duration_unit", - [ - ("s", "s"), - ("s", "ms"), - ("s", "us"), - ("s", "ns"), - ("ms", "s"), - ("ms", "ms"), - ("ms", "us"), - ("ms", "ns"), - ("us", "s"), - ("us", "ms"), - ("us", "us"), - ("us", "ns"), - ("ns", "s"), - ("ns", "ms"), - ("ns", "us"), - ("ns", "ns"), - ], -) -def test_pa_table_and_to_pandas_timestamp_and_duration(timestamp_unit, duration_unit): - timestamp_series = Series( - date_range("2020-01-01", "2020-01-02", freq="1min") - ).astype(f"timestamp[{timestamp_unit}][pyarrow]") - duration_series = Series(timedelta_range("1s", "10s", freq="1s")).astype( - f"duration[{duration_unit}][pyarrow]" - ) - - df = concat([timestamp_series, duration_series], axis=1) - df_converted_to_pa = pa.table(df) - df_back_to_pd = df_converted_to_pa.to_pandas() - - tm.assert_frame_equal(df, df_back_to_pd) - - -def test_conversion_with_missing_values(): - df = pd.DataFrame( - { - "timestamp_col": [Timestamp("2020-01-01"), NaT], - "duration_col": [Timedelta("1s"), NaT], - } - ) - df_coverted_to_pa = pa.table(df) - df_back_to_pd = df_coverted_to_pa.to_pandas() - - tm.assert_frame_equal(df, df_back_to_pd) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 082ef025992dd..5cae255f03581 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -167,3 +167,18 @@ def test_convert_dtypes_pyarrow_to_np_nullable(self): result = ser.convert_dtypes(dtype_backend="numpy_nullable") expected = pd.DataFrame(range(2), dtype="Int32") tm.assert_frame_equal(result, expected) + + def test_convert_dtypes_timestamp_and_duration(self): + # GH 54191 + pytest.importorskip("pyarrow") + timestamp_series = pd.Series( + pd.date_range("2020-01-01", "2020-01-02", freq="1min") + ).astype("timestamp[ms][pyarrow]") + duration_series = pd.Series(pd.timedelta_range("1s", "10s", freq="1s")).astype( + "duration[ms][pyarrow]" + ) + + df = pd.concat([timestamp_series, duration_series], axis=1) + converted = df.convert_dtypes(dtype_backend="pyarrow") + + tm.assert_frame_equal(df, converted) From 8a1a5befc1de2fd5c2e2e6b72262108a8e3aa880 Mon Sep 17 00:00:00 2001 From: julian048 Date: Tue, 1 Aug 2023 17:59:19 -0400 Subject: [PATCH 3/6] fix naming of expected and result variables for assertion --- pandas/tests/frame/methods/test_convert_dtypes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 5cae255f03581..c632fb7e085e3 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -178,7 +178,7 @@ def test_convert_dtypes_timestamp_and_duration(self): "duration[ms][pyarrow]" ) - df = pd.concat([timestamp_series, duration_series], axis=1) - converted = df.convert_dtypes(dtype_backend="pyarrow") + expected = pd.concat([timestamp_series, duration_series], axis=1) + result = expected.convert_dtypes(dtype_backend="pyarrow") - tm.assert_frame_equal(df, converted) + tm.assert_frame_equal(expected, result) From eecb330894270cae46714f010dccd696f1ad58be Mon Sep 17 00:00:00 2001 From: julian048 Date: Tue, 1 Aug 2023 18:06:24 -0400 Subject: [PATCH 4/6] optimize test --- pandas/tests/frame/methods/test_convert_dtypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index c632fb7e085e3..2a64af1101bc6 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -172,9 +172,9 @@ def test_convert_dtypes_timestamp_and_duration(self): # GH 54191 pytest.importorskip("pyarrow") timestamp_series = pd.Series( - pd.date_range("2020-01-01", "2020-01-02", freq="1min") + pd.date_range("2020-01-01", "2020-01-02", freq="1440min") ).astype("timestamp[ms][pyarrow]") - duration_series = pd.Series(pd.timedelta_range("1s", "10s", freq="1s")).astype( + duration_series = pd.Series(pd.timedelta_range("1s", "2s", freq="1s")).astype( "duration[ms][pyarrow]" ) From 28e2954e4a2bfd7734111a46d1457ec1f1389638 Mon Sep 17 00:00:00 2001 From: julian048 Date: Mon, 7 Aug 2023 14:14:37 -0400 Subject: [PATCH 5/6] make test mirror GH #54191 --- .../tests/frame/methods/test_convert_dtypes.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 2a64af1101bc6..5a4ac40f2a432 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -168,17 +168,10 @@ def test_convert_dtypes_pyarrow_to_np_nullable(self): expected = pd.DataFrame(range(2), dtype="Int32") tm.assert_frame_equal(result, expected) - def test_convert_dtypes_timestamp_and_duration(self): + def test_convert_dtypes_pyarrow_timestamp(self): # GH 54191 pytest.importorskip("pyarrow") - timestamp_series = pd.Series( - pd.date_range("2020-01-01", "2020-01-02", freq="1440min") - ).astype("timestamp[ms][pyarrow]") - duration_series = pd.Series(pd.timedelta_range("1s", "2s", freq="1s")).astype( - "duration[ms][pyarrow]" - ) - - expected = pd.concat([timestamp_series, duration_series], axis=1) - result = expected.convert_dtypes(dtype_backend="pyarrow") - - tm.assert_frame_equal(expected, result) + ser = pd.Series(pd.date_range("2020-01-01", "2020-01-02", freq="1min")) + expected = ser.astype("timestamp[ms][pyarrow]") + result = ser.convert_dtypes(dtype_backend="pyarrow") + tm.assert_series_equal(result, expected) From 2960eb64c4053990caa7a15345083f28a4ea856f Mon Sep 17 00:00:00 2001 From: julian048 Date: Wed, 9 Aug 2023 10:20:07 -0400 Subject: [PATCH 6/6] fix test logic --- pandas/tests/frame/methods/test_convert_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 5a4ac40f2a432..c2b1016e88402 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -173,5 +173,5 @@ def test_convert_dtypes_pyarrow_timestamp(self): pytest.importorskip("pyarrow") ser = pd.Series(pd.date_range("2020-01-01", "2020-01-02", freq="1min")) expected = ser.astype("timestamp[ms][pyarrow]") - result = ser.convert_dtypes(dtype_backend="pyarrow") + result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected)