Skip to content

BUG: Wrong dtype when resetting a multiindex with missing values. (#1… #27370

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 13 commits into from
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1068,6 +1068,8 @@ MultiIndex
^^^^^^^^^^

- Bug in which an incorrect exception was raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`)
- Bug in :meth:`DataFrame.reset_index` where the dtype was sometimes not preserved for a :class:`MultiIndex` that is empty or has missing values (:issue:`19602`)
- Bug in which :meth:`DataFrame.reset_index` did not work for a :class:`MultiIndex` that has :class:`CategoricalIndex` levels and missing values (:issue:`24206`)
-

I/O
Expand Down
41 changes: 13 additions & 28 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@
from pandas.core import algorithms, common as com, nanops, ops
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import (
Expand Down Expand Up @@ -4610,33 +4609,19 @@ def _maybe_casted_values(index, labels=None):

# if we have the labels, extract the values with a mask
if labels is not None:
mask = labels == -1

# we can have situations where the whole mask is -1,
# meaning there is nothing found in labels, so make all nan's
if mask.all():
values = np.empty(len(mask))
values.fill(np.nan)
if isinstance(values, np.ndarray):
mask = labels == -1
# we can have situations where the whole mask is -1,
# meaning there is nothing found in labels, so make all nan's
if mask.all():
values = np.empty(len(mask), dtype=values.dtype)
values.fill(np.nan)
else:
values = values.take(labels)
if mask.any():
values, _ = maybe_upcast_putmask(values, mask, np.nan)
else:
values = values.take(labels)

# TODO(https://github.com/pandas-dev/pandas/issues/24206)
# Push this into maybe_upcast_putmask?
# We can't pass EAs there right now. Looks a bit
# complicated.
# So we unbox the ndarray_values, op, re-box.
values_type = type(values)
values_dtype = values.dtype

if issubclass(values_type, DatetimeLikeArray):
values = values._data

if mask.any():
values, changed = maybe_upcast_putmask(values, mask, np.nan)

if issubclass(values_type, DatetimeLikeArray):
values = values_type(values, dtype=values_dtype)

values = values.take(labels, allow_fill=True)
return values

new_index = ibase.default_index(len(new_obj))
Expand Down Expand Up @@ -4680,7 +4665,7 @@ def _maybe_casted_values(index, labels=None):
missing = self.columns.nlevels - len(name_lst)
name_lst += [col_fill] * missing
name = tuple(name_lst)
# to ndarray and maybe infer different dtype
# to array-like and maybe infer different dtype
level_values = _maybe_casted_values(lev, lab)
new_obj.insert(0, name, level_values)

Expand Down
40 changes: 40 additions & 0 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
DatetimeIndex,
Index,
Expand Down Expand Up @@ -1186,6 +1187,45 @@ def test_reset_index_multiindex_nan(self):
rs = df.set_index(["A", "B"]).reset_index()
tm.assert_frame_equal(rs, df)

def test_reset_index_multiindex_datetime_all_nan(self):
# GH 19602
df = DataFrame({0: DatetimeIndex([]), 1: []}, columns=[0, 1])
rs = df.set_index([0, 1]).reset_index()
tm.assert_frame_equal(rs, df)

idx = MultiIndex(
levels=[DatetimeIndex([]), DatetimeIndex(["2015-01-01 11:00:00"])],
codes=[[-1, -1], [0, -1]],
names=[0, 1],
)
df = DataFrame(index=idx).reset_index()

xp = DataFrame(
{
0: DatetimeIndex([np.nan, np.nan]),
1: DatetimeIndex(["2015-01-01 11:00:00", np.nan]),
},
columns=[0, 1],
)
tm.assert_frame_equal(df, xp)

def test_reset_index_multiindex_categorical_with_nan(self):
# GH 24206
idx = MultiIndex(
[CategoricalIndex(["A", "B"]), CategoricalIndex(["a", "b"])],
[[0, 0, 1, 1], [0, 1, 0, -1]],
)
df = DataFrame({"col": range(len(idx))}, index=idx).reset_index()
xp = DataFrame(
{
"level_0": CategoricalIndex(["A", "A", "B", "B"]),
"level_1": CategoricalIndex(["a", "b", "a", np.nan]),
"col": [0, 1, 2, 3],
},
columns=["level_0", "level_1", "col"],
)
tm.assert_frame_equal(df, xp)

def test_reset_index_with_datetimeindex_cols(self):
# GH5818
#
Expand Down