Skip to content

Commit b4c4c5b

Browse files
WillAyd authored and meeseeksmachine committed
Backport PR pandas-dev#60333: BUG (string dtype): fix handling of string dtype in interchange protocol
1 parent 7958d6c commit b4c4c5b

File tree

2 files changed

+10
-11
lines changed

2 files changed

+10
-11
lines changed

pandas/core/interchange/from_dataframe.py

+8 -4
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,8 @@
66

77
import numpy as np
88

9+
from pandas._config import using_string_dtype
10+
911
from pandas.compat._optional import import_optional_dependency
1012
from pandas.errors import SettingWithCopyError
1113

@@ -124,8 +126,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
124126
-------
125127
pd.DataFrame
126128
"""
127-
# We need a dict of columns here, with each column being a NumPy array (at
128-
# least for now, deal with non-NumPy dtypes later).
129129
columns: dict[str, Any] = {}
130130
buffers = [] # hold on to buffers, keeps memory alive
131131
for name in df.column_names():
@@ -324,8 +324,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
324324
# Add to our list of strings
325325
str_list[i] = string
326326

327-
# Convert the string list to a NumPy array
328-
return np.asarray(str_list, dtype="object"), buffers
327+
if using_string_dtype():
328+
res = pd.Series(str_list, dtype="str")
329+
else:
330+
res = np.asarray(str_list, dtype="object") # type: ignore[assignment]
331+
332+
return res, buffers # type: ignore[return-value]
329333

330334

331335
def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:

pandas/tests/interchange/test_impl.py

+2 -7
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,6 @@
66
import numpy as np
77
import pytest
88

9-
from pandas._config import using_string_dtype
10-
119
from pandas._libs.tslibs import iNaT
1210
from pandas.compat import (
1311
is_ci_environment,
@@ -412,7 +410,6 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
412410
pd.api.interchange.from_dataframe(df)
413411

414412

415-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
416413
def test_empty_string_column():
417414
# https://github.com/pandas-dev/pandas/issues/56703
418415
df = pd.DataFrame({"a": []}, dtype=str)
@@ -421,13 +418,12 @@ def test_empty_string_column():
421418
tm.assert_frame_equal(df, result)
422419

423420

424-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
425421
def test_large_string():
426422
# GH#56702
427423
pytest.importorskip("pyarrow")
428424
df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
429425
result = pd.api.interchange.from_dataframe(df.__dataframe__())
430-
expected = pd.DataFrame({"a": ["x"]}, dtype="object")
426+
expected = pd.DataFrame({"a": ["x"]}, dtype="str")
431427
tm.assert_frame_equal(result, expected)
432428

433429

@@ -438,7 +434,6 @@ def test_non_str_names():
438434
assert names == ["0"]
439435

440436

441-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
442437
def test_non_str_names_w_duplicates():
443438
# https://github.com/pandas-dev/pandas/issues/56701
444439
df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
@@ -449,7 +444,7 @@ def test_non_str_names_w_duplicates():
449444
"Expected a Series, got a DataFrame. This likely happened because you "
450445
"called __dataframe__ on a DataFrame which, after converting column "
451446
r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
452-
r"dtype='object'\). Please rename these columns before using the "
447+
r"dtype='(str|object)'\). Please rename these columns before using the "
453448
"interchange protocol."
454449
),
455450
):

0 commit comments

Comments
 (0)