Skip to content

Bugfix for constructing empty Series, with index, using ExtensionDtyp… #44615

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 12 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ Sparse
ExtensionArray
^^^^^^^^^^^^^^
- Bug in :meth:`IntegerArray.searchsorted` and :meth:`FloatingArray.searchsorted` returning inconsistent results when acting on ``np.nan`` (:issue:`45255`)
- Bug in :class:`Series` construction with an index and no data when the :class:`ExtensionDtype` has an ``na_value`` of ``None`` (:issue:`44602`)
-

Styler
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
from pandas.core.arrays.sparse import SparseAccessor
import pandas.core.common as com
from pandas.core.construction import (
construct_1d_arraylike_from_scalar,
create_series_with_explicit_dtype,
extract_array,
is_empty_data,
Expand Down Expand Up @@ -495,7 +496,10 @@ def _init_dict(
elif index is not None:
# fastpath for Series(data=None). Just use broadcasting a scalar
# instead of reindexing.
values = na_value_for_dtype(pandas_dtype(dtype), compat=False)
dtype_ = pandas_dtype(dtype)
values = na_value_for_dtype(dtype_, compat=False)
if values is None:
values = construct_1d_arraylike_from_scalar(values, len(index), dtype_)
keys = index
else:
keys, values = (), []
Expand Down
81 changes: 76 additions & 5 deletions pandas/tests/extension/test_common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from collections import abc
import numbers

import numpy as np
import pytest

Expand All @@ -9,30 +12,81 @@
from pandas.core.arrays import ExtensionArray


class DummyDtype(dtypes.ExtensionDtype):
class DummyClass:
    """Empty placeholder class used as the scalar ``type`` of ``DummyDtype``."""


class DummyDtype(dtypes.ExtensionDtype):
    """Minimal ``ExtensionDtype`` for the extension tests.

    Its scalar ``type`` is ``DummyClass``, its ``name`` is ``"dummy"`` and
    its array type is ``DummyArray``.
    """

    name = "dummy"
    type = DummyClass

    @classmethod
    def construct_array_type(cls):
        """Return the array class paired with this dtype."""
        return DummyArray

    @classmethod
    def construct_from_string(cls, string):
        """Return an instance of the dtype when ``string`` equals ``cls.name``.

        Raises
        ------
        TypeError
            If ``string`` is anything other than ``cls.name``.
        """
        if string != cls.name:
            raise TypeError(f"Cannot construct a '{cls}' from '{string}'")
        return cls()


class DummyArray(ExtensionArray):

_dtype = DummyDtype

def __init__(self, data):
self.data = data
self.data = np.array(data)

def __array__(self, dtype):
def __array__(self, dtype=None):
return self.data

@property
def dtype(self):
return DummyDtype()
return DummyArray._dtype()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you need the indirection of `_dtype`? What is wrong with what was here before?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's because I'm subclassing `DummyDtype` and `DummyArray` in order to test several possible `na_value`s.

https://github.com/venaturum/pandas/blob/69117c1ebd1621a69164129169232c83a7439b19/pandas/tests/extension/test_common.py#L138

@pytest.mark.parametrize("na_value", [np.nan, pd.NA, None])
def test_empty_series_construction(na_value):
    class TempDType(DummyDtype):
        @classmethod
        def construct_array_type(cls):
            return TempArray

    TempDType.na_value = na_value

    class TempArray(DummyArray):
        _dtype = TempDType

    result = pd.Series(index=[1, 2, 3], dtype=TempDType())
    expected = pd.Series([na_value] * 3, index=[1, 2, 3], dtype=TempDType())
    tm.assert_series_equal(result, expected)


def astype(self, dtype, copy=True):
# we don't support anything but a single dtype
if isinstance(dtype, DummyDtype):
if isinstance(dtype, self._dtype):
if copy:
return type(self)(self.data)
return self

return np.array(self, dtype=dtype, copy=copy)

def __len__(self):
    """Return the number of elements stored in ``self.data``."""
    n_items = len(self.data)
    return n_items

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
    """Build an array of this class from ``scalars``.

    Parameters
    ----------
    scalars
        Either a sequence of values, or a single instance of the dtype's
        scalar type (``cls._dtype.type``), which is wrapped in a
        one-element list before construction.
    dtype, copy
        Accepted for signature compatibility; not used here.

    Returns
    -------
    An instance of ``cls``.
    """
    if isinstance(scalars, cls._dtype.type):
        scalars = [scalars]
    # Construct through ``cls`` rather than a hard-coded class so that a
    # subclass overriding ``_dtype`` gets an instance of itself back.
    return cls(scalars)

def take(self, indices, allow_fill=False, fill_value=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need all of the new methods this implements?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @jbrockmendel, apologies for the slow reply. These methods, and those in the alternative solution, are the minimum required for the tests to pass.

from pandas.core.algorithms import take

data = self.astype(object)

if allow_fill and fill_value is None:
fill_value = self.dtype.na_value

result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
return self._from_sequence(result, dtype=self.dtype)

def isna(self):
    """Return a boolean ndarray flagging the missing elements of ``self.data``.

    An element counts as missing when it is the dtype's ``na_value``.
    When that ``na_value`` is a float NaN, any float NaN element also
    counts as missing.
    """
    na_value = self.dtype.na_value
    # Identity alone misses NaN: iterating a float ndarray yields fresh
    # scalar objects, none of which *is* the ``na_value`` object. The
    # value check is limited to floats so that ``!=`` is never evaluated
    # on sentinels such as ``pd.NA`` or ``None``.
    na_is_nan = isinstance(na_value, float) and na_value != na_value

    def _is_missing(x):
        if x is na_value:
            return True
        return bool(na_is_nan and isinstance(x, float) and x != x)

    return np.array([_is_missing(x) for x in self.data], dtype="bool")

def __getitem__(self, idx):
    """Return one element, or a new array for a multi-element selection.

    Parameters
    ----------
    idx
        An integer returns the element of ``self.data`` at that position.
        An iterable or a slice returns a new array of the same class
        wrapping ``self.data[idx]``.

    Raises
    ------
    TypeError
        If ``idx`` is neither an integer, an iterable nor a slice.
    """
    if isinstance(idx, numbers.Integral):
        return self.data[idx]
    if isinstance(idx, (abc.Iterable, slice)):
        # ``type(self)`` (as ``astype`` already does) keeps subclasses
        # intact instead of always handing back the base array class.
        return type(self)(self.data[idx])
    raise TypeError("Index type not supported", idx)


class TestExtensionArrayDtype:
@pytest.mark.parametrize(
Expand Down Expand Up @@ -79,3 +133,20 @@ def test_astype_no_copy():
def test_is_extension_array_dtype(dtype):
assert isinstance(dtype, dtypes.ExtensionDtype)
assert is_extension_array_dtype(dtype)


@pytest.mark.parametrize("na_value", [np.nan, pd.NA, None])
def test_empty_series_construction(na_value):
    """Check Series construction from an index alone with an extension dtype.

    A Series built from only an index and the dtype must equal a Series
    built from the same index with ``na_value`` repeated for every label.
    """

    class TempDType(DummyDtype):
        @classmethod
        def construct_array_type(cls):
            # Looked up at call time, so TempArray can be defined below.
            return TempArray

    # Each parametrized run gets its own dtype class carrying its na_value.
    TempDType.na_value = na_value

    class TempArray(DummyArray):
        _dtype = TempDType

    labels = [1, 2, 3]
    expected = pd.Series([na_value] * 3, index=labels, dtype=TempDType())
    result = pd.Series(index=labels, dtype=TempDType())
    tm.assert_series_equal(result, expected)