Skip to content

Commit 8c5941c

Browse files
BUG: use EA.astype in ExtensionBlock.to_native_types (#28841)
1 parent 5b0bf23 commit 8c5941c

File tree

5 files changed

+185
-2
lines changed

5 files changed

+185
-2
lines changed

doc/source/whatsnew/v0.25.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ I/O
6464

6565
- Fix regression in notebook display where <th> tags not used for :attr:`DataFrame.index` (:issue:`28204`).
6666
- Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`)
67-
-
67+
- Fix :meth:`~DataFrame.to_csv` with ``ExtensionArray`` with list-like values (:issue:`28840`).
6868
-
6969

7070
Plotting

pandas/core/internals/blocks.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,6 @@ def _try_coerce_args(self, other):
687687

688688
def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
689689
""" convert to our native types format, slicing if desired """
690-
691690
values = self.get_values()
692691

693692
if slicer is not None:
@@ -1783,6 +1782,23 @@ def get_values(self, dtype=None):
17831782
def to_dense(self):
    """Return ``self.values`` converted with ``np.asarray``."""
    dense = np.asarray(self.values)
    return dense
17851784

1785+
def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
    """
    Convert the block's values to strings using the array's own ``astype``.

    Parameters
    ----------
    slicer : optional
        When not None, used to index ``self.values`` before converting.
    na_rep : str, default "nan"
        Text stored at every position flagged by ``isna``.
    quoting : optional
        Not used here; forwarded to the parent implementation on fallback.
    **kwargs
        Not used here; forwarded to the parent implementation on fallback.

    Returns
    -------
    ndarray
        The converted values reshaped to ``(1, len(values))``, or whatever
        the parent implementation returns when the conversion or the
        missing-value fill raises.
    """
    values = self.values
    if slicer is not None:
        values = values[slicer]
    mask = isna(values)

    try:
        values = values.astype(str)
        if isinstance(values, np.ndarray) and values.dtype.kind in "US":
            # A fixed-width string array silently truncates anything longer
            # than its item size, which would cut off a long ``na_rep``.
            # Holding the strings as objects keeps ``na_rep`` intact.
            values = values.astype(object)
        values[mask] = na_rep
    except Exception:
        # eg SparseArray does not support setitem, needs to be converted to ndarray
        return super().to_native_types(slicer, na_rep, quoting, **kwargs)

    # we are expected to return a 2-d ndarray
    return values.reshape(1, len(values))
1801+
17861802
def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):
17871803
"""
17881804
Take values according to indexer and return them as a block.
@@ -2265,6 +2281,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
22652281
is_extension = True
22662282

22672283
_can_hold_element = DatetimeBlock._can_hold_element
2284+
to_native_types = DatetimeBlock.to_native_types
22682285
fill_value = np.datetime64("NaT", "ns")
22692286

22702287
@property
+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .array import ListArray, ListDtype, make_data
2+
3+
__all__ = ["ListArray", "ListDtype", "make_data"]

pandas/tests/extension/list/array.py

+133
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
"""
2+
Test extension array for storing nested data in a pandas container.
3+
4+
The ListArray stores an ndarray of lists.
5+
"""
6+
import numbers
7+
import random
8+
import string
9+
10+
import numpy as np
11+
12+
from pandas.core.dtypes.base import ExtensionDtype
13+
14+
import pandas as pd
15+
from pandas.core.arrays import ExtensionArray
16+
17+
18+
class ListDtype(ExtensionDtype):
    """Extension dtype whose scalar type is ``list`` and whose name is "list"."""

    type = list
    name = "list"
    na_value = np.nan

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return ListArray

    @classmethod
    def construct_from_string(cls, string):
        """
        Build an instance of this dtype from its string name.

        Parameters
        ----------
        string : str
            Must equal ``cls.name`` for the construction to succeed.

        Returns
        -------
        ListDtype

        Raises
        ------
        TypeError
            If ``string`` is not a str, or does not equal ``cls.name``.
        """
        if not isinstance(string, str):
            raise TypeError(
                "'construct_from_string' expects a string, got {}".format(type(string))
            )
        if string == cls.name:
            return cls()
        # Report the bare class name; formatting ``cls`` itself would embed
        # the "<class '...'>" repr in the message.
        raise TypeError(
            "Cannot construct a '{}' from '{}'".format(cls.__name__, string)
        )
40+
41+
42+
class ListArray(ExtensionArray):
    """
    Extension array that stores its elements in an ndarray of lists.

    Every element of the backing array must be a ``list`` or a value that
    ``pd.isna`` reports as missing.
    """

    dtype = ListDtype()
    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False):
        """
        Wrap ``values`` after validating its elements.

        Parameters
        ----------
        values : np.ndarray
            Backing array; iterated once for validation.
        dtype : optional
            Accepted for interface compatibility; not used.
        copy : bool, default False
            When True, store a copy of ``values`` instead of ``values`` itself.

        Raises
        ------
        TypeError
            If ``values`` is not an ndarray, or an element is neither a list
            nor missing.
        """
        if not isinstance(values, np.ndarray):
            raise TypeError("Need to pass a numpy array as values")
        for val in values:
            if not isinstance(val, self.dtype.type) and not pd.isna(val):
                raise TypeError("All values must be of type " + str(self.dtype.type))
        self.data = values.copy() if copy else values

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Build a ListArray from a sized sequence of lists / missing values."""
        data = np.empty(len(scalars), dtype=object)
        # Fill slot by slot: a bulk ``data[:] = scalars`` makes NumPy treat
        # equal-length lists as a 2-d array and fail to broadcast it.
        for pos, val in enumerate(scalars):
            data[pos] = val
        return cls(data)

    def __getitem__(self, item):
        """Return one element for an integer ``item``, else a new ListArray."""
        if isinstance(item, numbers.Integral):
            return self.data[item]
        else:
            # slice, list-like, mask
            return type(self)(self.data[item])

    def __len__(self) -> int:
        """Return the number of elements in the backing array."""
        return len(self.data)

    def isna(self):
        """
        Return a boolean ndarray marking the missing elements.

        Lists are never missing; anything else is tested with ``pd.isna`` so
        that every missing value accepted by ``__init__`` is recognised here.
        """
        return np.array(
            [not isinstance(x, list) and bool(pd.isna(x)) for x in self.data],
            dtype=bool,
        )

    def take(self, indexer, allow_fill=False, fill_value=None):
        """
        Select elements by position.

        Parameters
        ----------
        indexer : sequence of int
            Positions to take.
        allow_fill : bool, default False
            When True, ``-1`` in ``indexer`` yields ``fill_value``.
        fill_value : optional
            Defaults to ``self.dtype.na_value`` when ``allow_fill`` is True.

        Returns
        -------
        ListArray

        Raises
        ------
        ValueError
            If ``allow_fill`` is True and ``indexer`` contains a value below -1.
        IndexError
            If a position is out of bounds.
        """
        # re-implement here, since NumPy has trouble setting
        # sized objects like lists into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = (
            "Index is out of bounds or cannot do a "
            "non-empty take from an empty array."
        )

        if allow_fill:
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError(
                    "Invalid value in 'indexer'; with allow_fill=True "
                    "all values must be >= -1"
                )

        try:
            if allow_fill:
                output = [
                    self.data[loc] if loc != -1 else fill_value for loc in indexer
                ]
            else:
                output = [self.data[loc] for loc in indexer]
        except IndexError as err:
            raise IndexError(msg) from err

        return self._from_sequence(output)

    def copy(self):
        """Return a new ListArray backed by a copy of the data array."""
        # ``self.data[:]`` would only be a view sharing the same buffer.
        return type(self)(self.data.copy())

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        Returns this array (or a copy, when ``copy`` is True) if ``dtype`` is
        this array's own dtype. For string dtypes that are not object dtype,
        each element is converted with ``str``. Otherwise the data is handed
        to ``np.array`` with the requested dtype.
        """
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif pd.api.types.is_string_dtype(dtype) and not pd.api.types.is_object_dtype(
            dtype
        ):
            # numpy has problems with astype(str) for nested elements
            return np.array([str(x) for x in self.data], dtype=dtype)
        return np.array(self.data, dtype=dtype, copy=copy)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate the backing arrays of ``to_concat`` into one ListArray."""
        data = np.concatenate([x.data for x in to_concat])
        return cls(data)
124+
125+
126+
def make_data(size=100):
    """
    Build an object-dtype ndarray of random lists of ASCII letters.

    Parameters
    ----------
    size : int, default 100
        Number of lists to generate.

    Returns
    -------
    np.ndarray
        1-d object array of length ``size``; each element is a list of
        0 to 10 single-letter strings.
    """
    data = np.empty(size, dtype=object)
    # Assign element by element: a bulk ``data[:] = [...]`` only works while
    # the lists are ragged, and fails to broadcast if they all share a length.
    for pos in range(size):
        data[pos] = [
            random.choice(string.ascii_letters) for _ in range(random.randint(0, 10))
        ]
    return data
+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import pytest
2+
3+
import pandas as pd
4+
5+
from .array import ListArray, ListDtype, make_data
6+
7+
8+
@pytest.fixture
def dtype():
    """Provide a fresh ``ListDtype`` instance."""
    list_dtype = ListDtype()
    return list_dtype
11+
12+
13+
@pytest.fixture
def data():
    """Length-100 ListArray for semantics test."""
    # Keep drawing fresh data until the first two lists differ in length.
    while True:
        raw = make_data()
        if len(raw[0]) != len(raw[1]):
            return ListArray(raw)
22+
23+
24+
def test_to_csv(data):
    """Check that the text form of the first list appears in the CSV output."""
    # https://github.com/pandas-dev/pandas/issues/28840
    # array with list-likes fail when doing astype(str) on the numpy array
    # which was done in to_native_types
    frame = pd.DataFrame({"a": data})
    csv_text = frame.to_csv()
    first_as_text = str(data[0])
    assert first_as_text in csv_text

0 commit comments

Comments
 (0)