From 38308e7de3682784820bb275ae3b4a0ad3c439b7 Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Sat, 3 Aug 2024 13:43:27 +0100 Subject: [PATCH 1/7] Add test_non_nanosecond_timestamps --- pandas/tests/io/test_parquet.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 561c718ea5851..beb384a776dd0 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1131,6 +1131,31 @@ def test_infer_string_large_string_type(self, tmp_path, pa): # assert result["strings"].dtype == "string" # FIXME: don't leave commented-out + def test_non_nanosecond_timestamps(self, tmp_path, pa): + # GH#49236 + # + # pandas 1.x didn't support non-nanosecond datetimes. + # pyarrow.Table.to_pandas supports timestamp_as_object param to solve that issue: + # https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas + # + # This test tests that the current version of pandas supports non-nanosecond (microsecond in this case) datetimes, + # the code example from GH#49236 doesn't fail anymore, and timestamp_as_object is not needed. + import pyarrow as pa + import pyarrow.parquet as pq + + path = tmp_path / "non_nanosecond_timestamp.p" + + arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) + table = pa.table([arr], names=["timestamp"]) + pq.write_table(table, path) + + result = read_parquet(path) + expected = pd.DataFrame( + data={"timestamp": [datetime.datetime(1600, 1, 1)]}, + dtype="datetime64[us]", + ) + tm.assert_frame_equal(result, expected) + class TestParquetFastParquet(Base): @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") From e3bc03e4cd3d3bbef1a468932857fa0d3ee970e6 Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Sat, 3 Aug 2024 13:59:34 +0100 Subject: [PATCH 2/7] Fix lint errors --- pandas/tests/io/test_parquet.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index beb384a776dd0..374612d4cae30 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1135,11 +1135,13 @@ def test_non_nanosecond_timestamps(self, tmp_path, pa): # GH#49236 # # pandas 1.x didn't support non-nanosecond datetimes. - # pyarrow.Table.to_pandas supports timestamp_as_object param to solve that issue: + # pyarrow.Table.to_pandas supports timestamp_as_object param to solve that issue # https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas # - # This test tests that the current version of pandas supports non-nanosecond (microsecond in this case) datetimes, - # the code example from GH#49236 doesn't fail anymore, and timestamp_as_object is not needed. + # This test tests that the current version of pandas + # supports non-nanosecond (microsecond in this case) datetimes, + # the code example from GH#49236 doesn't fail anymore, + # and timestamp_as_object param is not needed. import pyarrow as pa import pyarrow.parquet as pq From 43b05607e33248e897e939191af511358cc08b75 Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Sat, 3 Aug 2024 21:20:52 +0100 Subject: [PATCH 3/7] Remove detailed comment; use temp path helper --- pandas/tests/io/test_parquet.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 374612d4cae30..a03d2b037879d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1131,27 +1131,17 @@ def test_infer_string_large_string_type(self, tmp_path, pa): # assert result["strings"].dtype == "string" # FIXME: don't leave commented-out - def test_non_nanosecond_timestamps(self, tmp_path, pa): + def test_non_nanosecond_timestamps(self): # GH#49236 - # - # pandas 1.x didn't support non-nanosecond datetimes. - # pyarrow.Table.to_pandas supports timestamp_as_object param to solve that issue - # https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas - # - # This test tests that the current version of pandas - # supports non-nanosecond (microsecond in this case) datetimes, - # the code example from GH#49236 doesn't fail anymore, - # and timestamp_as_object param is not needed. import pyarrow as pa import pyarrow.parquet as pq - path = tmp_path / "non_nanosecond_timestamp.p" - - arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) - table = pa.table([arr], names=["timestamp"]) - pq.write_table(table, path) + with tm.ensure_clean() as path: + arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) + table = pa.table([arr], names=["timestamp"]) + pq.write_table(table, path) + result = read_parquet(path) - result = read_parquet(path) expected = pd.DataFrame( data={"timestamp": [datetime.datetime(1600, 1, 1)]}, dtype="datetime64[us]", From cff22d49252fbdb3339d00c6b7d0d3a7b1523ae6 Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Tue, 6 Aug 2024 21:12:54 +0100 Subject: [PATCH 4/7] Use temp_file --- pandas/tests/io/test_parquet.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a03d2b037879d..7dbc97fbcb633 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1131,17 +1131,15 @@ def test_infer_string_large_string_type(self, tmp_path, pa): # assert result["strings"].dtype == "string" # FIXME: don't leave commented-out - def test_non_nanosecond_timestamps(self): + def test_non_nanosecond_timestamps(self, temp_file): # GH#49236 import pyarrow as pa import pyarrow.parquet as pq - with tm.ensure_clean() as path: - arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) - table = pa.table([arr], names=["timestamp"]) - pq.write_table(table, path) - result = read_parquet(path) - + arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) + table = pa.table([arr], names=["timestamp"]) + pq.write_table(table, temp_file) + result = read_parquet(temp_file) expected = pd.DataFrame( data={"timestamp": [datetime.datetime(1600, 1, 1)]}, dtype="datetime64[us]", From 7c70df4ca24a29ea7cabadedfb961ba3654dda4d Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Fri, 9 Aug 2024 14:36:17 +0100 Subject: [PATCH 5/7] Apply suggestions from code review Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/io/test_parquet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 7dbc97fbcb633..085d1c204de73 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1133,8 +1133,9 @@ def test_infer_string_large_string_type(self, tmp_path, pa): def test_non_nanosecond_timestamps(self, temp_file): # GH#49236 - import pyarrow as pa - import pyarrow.parquet as pq + pa = pytest.importorskip("pyarrow") + pq = pytest.importorskip("pyarrow.parquet") + arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) table = pa.table([arr], names=["timestamp"]) From 92561485d782b719220047dd98b5a5ebfe7c3afd Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Fri, 9 Aug 2024 14:36:58 +0100 Subject: [PATCH 6/7] Remove extra empty line --- pandas/tests/io/test_parquet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 085d1c204de73..5eade611ba5cb 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1136,7 +1136,6 @@ def test_non_nanosecond_timestamps(self, temp_file): pa = pytest.importorskip("pyarrow") pq = pytest.importorskip("pyarrow.parquet") - arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) table = pa.table([arr], names=["timestamp"]) pq.write_table(table, temp_file) From 8ca1aaff03dbabd6babfc74ef5981b0a88e6677b Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Mon, 19 Aug 2024 18:53:16 +0100 Subject: [PATCH 7/7] Skip test in minimal version env --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 0c68bf28ca591..f4d64bf84b3f5 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1139,7 +1139,7 @@ def test_infer_string_large_string_type(self, tmp_path, pa): def test_non_nanosecond_timestamps(self, temp_file): # GH#49236 - pa = pytest.importorskip("pyarrow") + pa = pytest.importorskip("pyarrow", "11.0.0") pq = pytest.importorskip("pyarrow.parquet") arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us"))