From f6a2a5df481dfe4e2b7696a97cd21031df471090 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 31 Aug 2024 14:27:29 +0200 Subject: [PATCH 1/2] TST (string dtype): fix and clean up arrow roundtrip tests --- pandas/tests/arrays/masked/test_arrow_compat.py | 11 +++-------- pandas/tests/arrays/string_/test_string.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index c719e19a7c8d1..d99b1118444c9 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -1,17 +1,12 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) pa = pytest.importorskip("pyarrow") diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 91ad01a2fb0eb..d77ddfed445e7 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -522,7 +522,6 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 @@ -541,13 +540,15 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert result["a"].dtype == "object" else: assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") + expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 @@ -569,7 +570,10 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): assert result["a"].dtype == "object" else: assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage}]") + expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) From 30c41f3b36822929bd3b72e7dc822c78b86ea54c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 08:59:25 +0200 Subject: [PATCH 2/2] fix using_infer_string --- pandas/tests/arrays/string_/test_string.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index d77ddfed445e7..dd87dbf8e9a43 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -541,9 +541,10 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): else: assert isinstance(result["a"].dtype, pd.StringDtype) expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) - expected.columns = expected.columns.astype( - pd.StringDtype(string_storage, na_value=np.nan) - ) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is result["a"].dtype.na_value @@ -571,9 +572,10 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): else: assert isinstance(result["a"].dtype, pd.StringDtype) expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value)) - expected.columns = expected.columns.astype( - pd.StringDtype(string_storage, na_value=np.nan) - ) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected)