diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst
index 2e8e2345d4c0a..24aca954b201f 100644
--- a/doc/source/whatsnew/v2.0.1.rst
+++ b/doc/source/whatsnew/v2.0.1.rst
@@ -27,7 +27,8 @@ Bug fixes
 - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`)
 - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`)
 - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`)
-- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on-categorical dtypes (:issue:`49889`)
+- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`)
+- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`)
 - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)
 - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`)
 - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`)
diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 2bbb678516968..998f3bc374942 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -238,8 +238,11 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
     # Retrieve the data buffer containing the UTF-8 code units
     data_buff, protocol_data_dtype = buffers["data"]
     # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
-    assert protocol_data_dtype[1] == 8  # bitwidth == 8
-    assert protocol_data_dtype[2] == ArrowCTypes.STRING  # format_str == utf-8
+    assert protocol_data_dtype[1] == 8
+    assert protocol_data_dtype[2] in (
+        ArrowCTypes.STRING,
+        ArrowCTypes.LARGE_STRING,
+    )  # format_str == utf-8
     # Convert the buffers to NumPy arrays. In order to go from STRING to
     # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
     data_dtype = (
diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py
index 89599818d6814..69c0367238d7a 100644
--- a/pandas/core/interchange/utils.py
+++ b/pandas/core/interchange/utils.py
@@ -39,6 +39,7 @@ class ArrowCTypes:
     FLOAT32 = "f"
     FLOAT64 = "g"
     STRING = "u"  # utf-8
+    LARGE_STRING = "U"  # utf-8
     DATE32 = "tdD"
     DATE64 = "tdm"
     # Resoulution:
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index abdfb4e79cb20..a9835b8641e7d 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -89,6 +89,21 @@ def test_categorical_pyarrow():
     tm.assert_frame_equal(result, expected)
 
 
+def test_large_string_pyarrow():
+    # GH 52795
+    pa = pytest.importorskip("pyarrow", "11.0.0")
+
+    arr = ["Mon", "Tue"]
+    table = pa.table({"weekday": pa.array(arr, "large_string")})
+    exchange_df = table.__dataframe__()
+    result = from_dataframe(exchange_df)
+    expected = pd.DataFrame({"weekday": ["Mon", "Tue"]})
+    tm.assert_frame_equal(result, expected)
+
+    # check round-trip
+    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
+
+
 @pytest.mark.parametrize(
     "data", [int_data, uint_data, float_data, bool_data, datetime_data]
 )