Skip to content

Backport PR #28841: BUG: use EA.astype in ExtensionBlock.to_native_types (#28841) #28985

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ I/O

- Fix regression in notebook display where <th> tags were not used for :attr:`DataFrame.index` (:issue:`28204`).
- Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`).
-
- Fix :meth:`~DataFrame.to_csv` with ``ExtensionArray`` with list-like values (:issue:`28840`).
-

Plotting
Expand Down
19 changes: 18 additions & 1 deletion pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,6 @@ def _try_coerce_and_cast_result(self, result, dtype=None):

def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """

values = self.get_values()

if slicer is not None:
Expand Down Expand Up @@ -1848,6 +1847,23 @@ def get_values(self, dtype=None):
def to_dense(self):
    """Return ``self.values`` converted to a NumPy ndarray via ``np.asarray``."""
    dense = np.asarray(self.values)
    return dense

def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
    """
    Convert the block's values to strings using the values' own ``astype``.

    Overrides the parent implementation so that the string conversion is
    delegated to ``self.values.astype(str)``.

    Parameters
    ----------
    slicer : optional
        When not None, applied as ``values[slicer]`` before converting.
    na_rep : str, default "nan"
        Written into every position that ``isna`` reports as missing.
    quoting : optional
        Not used here; forwarded to the parent implementation on fallback.
    **kwargs
        Not used here; forwarded to the parent implementation on fallback.

    Returns
    -------
    The converted values reshaped to ``(1, len(values))``, or whatever the
    parent implementation returns when the conversion here fails.
    """
    values = self.values
    if slicer is not None:
        values = values[slicer]
    mask = isna(values)

    try:
        values = values.astype(str)
        if (
            isinstance(values, np.ndarray)
            and values.dtype.kind in ("S", "U")
            and mask.any()
        ):
            # A fixed-width string array silently truncates anything longer
            # than its item size, so a long ``na_rep`` would be cut short.
            # Switch to object dtype before writing it.
            values = values.astype(object)
        values[mask] = na_rep
    except Exception:
        # Deliberate best-effort fallback:
        # eg SparseArray does not support setitem, needs to be converted to ndarray
        return super().to_native_types(slicer, na_rep, quoting, **kwargs)

    # we are expected to return a 2-d ndarray
    return values.reshape(1, len(values))

def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):
"""
Take values according to indexer and return them as a block.
Expand Down Expand Up @@ -2374,6 +2390,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
is_extension = True

_can_hold_element = DatetimeBlock._can_hold_element
to_native_types = DatetimeBlock.to_native_types

@property
def _holder(self):
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/extension/list/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .array import ListArray, ListDtype, make_data

__all__ = ["ListArray", "ListDtype", "make_data"]
133 changes: 133 additions & 0 deletions pandas/tests/extension/list/array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""
Test extension array for storing nested data in a pandas container.

The ListArray stores an ndarray of lists.
"""
import numbers
import random
import string

import numpy as np

from pandas.core.dtypes.base import ExtensionDtype

import pandas as pd
from pandas.core.arrays import ExtensionArray


class ListDtype(ExtensionDtype):
    """Extension dtype whose scalar type is the built-in ``list``."""

    type = list
    name = "list"
    na_value = np.nan

    @classmethod
    def construct_array_type(cls):
        """
        Return the array class that goes with this dtype.

        Returns
        -------
        type
        """
        return ListArray

    @classmethod
    def construct_from_string(cls, string):
        """
        Build an instance from the dtype's string name.

        Raises
        ------
        TypeError
            If ``string`` is not equal to ``cls.name``.
        """
        matches = string == cls.name
        if not matches:
            raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string))
        return cls()


class ListArray(ExtensionArray):
    """
    Test extension array that stores one Python list (or a missing value)
    per element.

    The elements live in ``self.data``, a NumPy ndarray that is checked
    element by element on construction.
    """

    dtype = ListDtype()
    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False):
        """
        Wrap ``values`` without copying.

        ``dtype`` and ``copy`` are accepted but not used.

        Raises
        ------
        TypeError
            If ``values`` is not an ndarray, or if any element is neither a
            ``list`` nor a missing value.
        """
        if not isinstance(values, np.ndarray):
            raise TypeError("Need to pass a numpy array as values")
        for val in values:
            if not isinstance(val, self.dtype.type) and not pd.isna(val):
                raise TypeError("All values must be of type " + str(self.dtype.type))
        self.data = values

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Build a ListArray from a sized iterable of lists / missing values."""
        data = np.empty(len(scalars), dtype=object)
        # Fill slot by slot. A bulk ``data[:] = scalars`` lets NumPy treat
        # equal-length lists as a 2-d array, which either fails to broadcast
        # or unwraps the lists instead of storing them as elements.
        for pos, scalar in enumerate(scalars):
            data[pos] = scalar
        return cls(data)

    def __getitem__(self, item):
        """Return the stored element for an integer, else a new ListArray."""
        if isinstance(item, numbers.Integral):
            return self.data[item]
        # slice, list-like, mask
        return type(self)(self.data[item])

    def __len__(self) -> int:
        return len(self.data)

    def isna(self):
        """Return a boolean ndarray marking the missing (non-list) elements."""
        # ``pd.isna`` rather than ``np.isnan``: the constructor accepts any
        # missing-value scalar (e.g. None), and ``np.isnan`` raises on
        # non-float objects.
        return np.array(
            [not isinstance(x, list) and pd.isna(x) for x in self.data], dtype=bool
        )

    def take(self, indexer, allow_fill=False, fill_value=None):
        """
        Select elements by position.

        With ``allow_fill=True``, ``-1`` marks a slot filled with
        ``fill_value`` (``self.dtype.na_value`` when None) and any other
        negative position raises ``ValueError``. Out-of-bounds positions
        raise ``IndexError``.
        """
        # re-implement here, since NumPy has trouble setting
        # sized objects like lists into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = (
            "Index is out of bounds or cannot do a "
            "non-empty take from an empty array."
        )

        if allow_fill:
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError(
                    "Invalid value in 'indexer': with allow_fill=True the only "
                    "negative value allowed is -1"
                )
            try:
                output = [
                    self.data[loc] if loc != -1 else fill_value for loc in indexer
                ]
            except IndexError as err:
                raise IndexError(msg) from err
        else:
            try:
                output = [self.data[loc] for loc in indexer]
            except IndexError as err:
                raise IndexError(msg) from err

        return self._from_sequence(output)

    def copy(self):
        """
        Return a new ListArray backed by its own ndarray.

        The copy is shallow: the list objects themselves are shared.
        """
        # ``self.data[:]`` would only be a view onto the same buffer.
        return type(self)(self.data.copy())

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        Returns a ListArray when ``dtype`` is this array's own dtype,
        otherwise a NumPy ndarray.
        """
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        if pd.api.types.is_string_dtype(dtype) and not pd.api.types.is_object_dtype(
            dtype
        ):
            # numpy has problems with astype(str) for nested elements
            return np.array([str(x) for x in self.data], dtype=dtype)
        if not copy:
            # ``np.array(..., copy=False)`` raises under NumPy 2 whenever a
            # copy is unavoidable; ``np.asarray`` copies only if needed.
            return np.asarray(self.data, dtype=dtype)
        return np.array(self.data, dtype=dtype, copy=True)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate the ``data`` of several ListArrays into a new one."""
        data = np.concatenate([x.data for x in to_concat])
        return cls(data)


def make_data():
    """
    Return a length-100 object ndarray of random lists.

    Each element is a list of between 0 and 10 random ASCII letters.
    """
    # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
    n_rows = 100
    rows = []
    for _ in range(n_rows):
        # Draw the length first, then the letters, one at a time.
        length = random.randint(0, 10)
        rows.append([random.choice(string.ascii_letters) for _ in range(length)])

    out = np.empty(n_rows, dtype=object)
    out[:] = rows
    return out
30 changes: 30 additions & 0 deletions pandas/tests/extension/list/test_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pytest

import pandas as pd

from .array import ListArray, ListDtype, make_data


@pytest.fixture
def dtype():
    """Provide a freshly constructed ``ListDtype`` instance."""
    list_dtype = ListDtype()
    return list_dtype


@pytest.fixture
def data():
    """Length-100 ListArray for semantics test."""
    while True:
        raw = make_data()
        # Keep drawing until the first two elements have different lengths.
        if len(raw[0]) != len(raw[1]):
            return ListArray(raw)


def test_to_csv(data):
    """Check that the string form of the first element appears in the CSV text."""
    # https://github.com/pandas-dev/pandas/issues/28840
    # array with list-likes fail when doing astype(str) on the numpy array
    # which was done in to_native_types
    frame = pd.DataFrame({"a": data})
    csv_text = frame.to_csv()
    first_as_text = str(data[0])
    assert first_as_text in csv_text