Skip to content

Commit 961f9c4

Browse files
authored
BUG: interchange.from_dataframe doesn't work with large_string (#52800)
1 parent 17345e3 commit 961f9c4

File tree

4 files changed

+23
-3
lines changed

4 files changed

+23
-3
lines changed

doc/source/whatsnew/v2.0.1.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ Bug fixes
2828
- Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`)
2929
- Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`)
3030
- Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`)
31-
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on-categorical dtypes (:issue:`49889`)
31+
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`)
32+
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`)
3233
- Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)
3334
- Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set, causing a performance bottleneck in the process (:issue:`52546`)
3435
- Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with an :class:`ArrowDtype` (:issue:`52425`)

pandas/core/interchange/from_dataframe.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -238,8 +238,11 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
238238
# Retrieve the data buffer containing the UTF-8 code units
239239
data_buff, protocol_data_dtype = buffers["data"]
240240
# We're going to reinterpret the buffer as uint8, so make sure we can do it safely
241-
assert protocol_data_dtype[1] == 8 # bitwidth == 8
242-
assert protocol_data_dtype[2] == ArrowCTypes.STRING # format_str == utf-8
241+
assert protocol_data_dtype[1] == 8
242+
assert protocol_data_dtype[2] in (
243+
ArrowCTypes.STRING,
244+
ArrowCTypes.LARGE_STRING,
245+
) # format_str == utf-8
243246
# Convert the buffers to NumPy arrays. In order to go from STRING to
244247
# an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
245248
data_dtype = (

pandas/core/interchange/utils.py

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class ArrowCTypes:
3939
FLOAT32 = "f"
4040
FLOAT64 = "g"
4141
STRING = "u" # utf-8
42+
LARGE_STRING = "U" # utf-8
4243
DATE32 = "tdD"
4344
DATE64 = "tdm"
4445
# Resolution:

pandas/tests/interchange/test_impl.py

+15
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,21 @@ def test_categorical_pyarrow():
8989
tm.assert_frame_equal(result, expected)
9090

9191

92+
def test_large_string_pyarrow():
93+
# GH 52795
94+
pa = pytest.importorskip("pyarrow", "11.0.0")
95+
96+
arr = ["Mon", "Tue"]
97+
table = pa.table({"weekday": pa.array(arr, "large_string")})
98+
exchange_df = table.__dataframe__()
99+
result = from_dataframe(exchange_df)
100+
expected = pd.DataFrame({"weekday": ["Mon", "Tue"]})
101+
tm.assert_frame_equal(result, expected)
102+
103+
# check round-trip
104+
assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
105+
106+
92107
@pytest.mark.parametrize(
93108
"data", [int_data, uint_data, float_data, bool_data, datetime_data]
94109
)

0 commit comments

Comments
 (0)