Skip to content

BUG: Create empty dataframe with string dtype fails #33651

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
5 changes: 4 additions & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,10 @@ def init_dict(data, index, columns, dtype=None):

# no obvious "empty" int column
if missing.any() and not is_integer_dtype(dtype):
if dtype is None or np.issubdtype(dtype, np.flexible):
if is_extension_array_dtype(dtype):
# GH 33623
nan_dtype = dtype
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this will be dtype.na_value

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you update this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I can't figure out how to fix this from "this will be dtype.na_value".

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In [15]: pd.Int32Dtype.na_value
Out[15]: <NA>

nan_dtype = dtype.na_value

Copy link
Contributor Author

@kotamatsuoka kotamatsuoka Apr 26, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I change it to nan_dtype = dtype.na_value, an error occurs:

            if not isinstance(dtype, (np.dtype, type(np.dtype))):
>               dtype = dtype.dtype
E               AttributeError: 'NAType' object has no attribute 'dtype'

pandas/core/dtypes/cast.py:1545: AttributeError

So I updated it like this.

if (
            dtype is None
            or is_extension_array_dtype(dtype)
            or np.issubdtype(dtype, np.flexible)
        ):
    nan_dtype = object

elif dtype is None or np.issubdtype(dtype, np.flexible):
# GH#1783
nan_dtype = object
else:
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/extension/arrow/test_bool.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ def dtype():
return ArrowBoolDtype()


@pytest.fixture
def columns():
values = np.random.randint(0, 2, size=100, dtype=bool)
values[1] = ~values[0]
return ArrowBoolArray.from_scalars(values)


@pytest.fixture
def data():
values = np.random.randint(0, 2, size=100, dtype=bool)
Expand Down Expand Up @@ -55,6 +62,10 @@ def test_from_dtype(self, data):
def test_from_sequence_from_cls(self, data):
super().test_from_sequence_from_cls(data)

@pytest.mark.xfail(reason="bad is-na for empty data")
def test_construct_empty_dataframe(self, columns, dtype):
super().test_construct_empty_dataframe(columns, dtype)


class TestReduce(base.BaseNoReduceTests):
def test_reduce_series_boolean(self):
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/base/constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,7 @@ def test_pandas_array_dtype(self, data):
result = pd.array(data, dtype=np.dtype(object))
expected = pd.arrays.PandasArray(np.asarray(data, dtype=object))
self.assert_equal(result, expected)

def test_construct_empty_dataframe(self, columns, dtype):
# GH 33623
pd.DataFrame(columns=columns, dtype=dtype)
5 changes: 5 additions & 0 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ def dtype():
return DecimalDtype()


@pytest.fixture
def columns():
return DecimalArray(make_data())


@pytest.fixture
def data():
return DecimalArray(make_data())
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/json/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,16 @@ def dtype():
return JSONDtype()


@pytest.fixture
def columns():
data = make_data()

while len(data[0]) == len(data[1]):
data = make_data()

return JSONArray(data)


@pytest.fixture
def data():
"""Length-100 PeriodArray for semantics test."""
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ def dtype():
return BooleanDtype()


@pytest.fixture
def columns(dtype):
return pd.array([True, False], dtype=dtype)


@pytest.fixture
def data(dtype):
return pd.array(make_data(), dtype=dtype)
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ def dtype():
return CategoricalDtype()


@pytest.fixture
def columns():
return Categorical(make_data())


@pytest.fixture
def data():
"""Length-100 array for this type.
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ def dtype(request):
return DatetimeTZDtype(unit="ns", tz=request.param)


@pytest.fixture
def columns(dtype):
return DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype)


@pytest.fixture
def data(dtype):
data = DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype)
Expand Down
9 changes: 8 additions & 1 deletion pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ def dtype(request):
return request.param()


@pytest.fixture
def columns():
return integer_array(make_data())


@pytest.fixture
def data(dtype):
return integer_array(make_data(), dtype=dtype)
Expand Down Expand Up @@ -186,7 +191,9 @@ class TestInterface(base.BaseInterfaceTests):


class TestConstructors(base.BaseConstructorsTests):
pass
@pytest.mark.xfail(reason="bad is-na for empty data")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this xfailed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • coerce_to_array() in core/arrays/integer.py doesn't accept array(nan).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally we would fix this here. What needs to change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We would need to allow values.ndim to be 0 in coerce_to_array().

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we instead not pass a 0-dim array to coerce_to_array? It's not clear to me why we need a 0-d array in the first place.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added handling that converts np.nan to []:

values = [] if values is np.nan else values

def test_construct_empty_dataframe(self, columns, dtype):
super().test_construct_empty_dataframe(columns, dtype)


class TestReshaping(base.BaseReshapingTests):
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ def dtype():
return IntervalDtype()


@pytest.fixture
def columns():
return IntervalArray(make_data())


@pytest.fixture
def data():
"""Length-100 PeriodArray for semantics test."""
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ def allow_in_pandas(monkeypatch):
yield


@pytest.fixture
def columns():
return PandasArray(np.arange(1, 3))


@pytest.fixture
def data(allow_in_pandas, dtype):
if dtype.numpy_dtype == "object":
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ def dtype():
return PeriodDtype(freq="D")


@pytest.fixture
def columns(dtype):
return PeriodArray(np.arange(2020, 2021), freq=dtype.freq)


@pytest.fixture
def data(dtype):
return PeriodArray(np.arange(1970, 2070), freq=dtype.freq)
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ def dtype():
return SparseDtype()


@pytest.fixture
def columns():
return SparseArray(np.ones(10) * 2)


@pytest.fixture(params=[0, np.nan])
def data(request):
"""Length-100 PeriodArray for semantics test."""
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ def dtype():
return StringDtype()


@pytest.fixture
def columns():
strings = np.random.choice(list(string.ascii_letters), size=1)

return StringArray._from_sequence(strings)


@pytest.fixture
def data():
strings = np.random.choice(list(string.ascii_letters), size=100)
Expand Down