Skip to content

BUG: Wrong dtype when resetting a multiindex with missing values. (#1… #27370

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 13 commits into from
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,8 @@ Missing
MultiIndex
^^^^^^^^^^

-
- Bug in :meth:`DataFrame.reset_index` where the dtype was sometimes not preserved for a :class:`MultiIndex` that is empty or has missing values (:issue:`19602`)
- Bug in :meth:`DataFrame.reset_index` which did not work for a :class:`MultiIndex` whose :class:`CategoricalIndex` levels contain missing values (:issue:`24206`)
-

I/O
Expand Down
38 changes: 38 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1391,3 +1391,41 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False):

if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)):
raise ValueError("Trying to coerce float values to integers")


def maybe_casted_values(index, codes=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should mark this as private with a leading underscore.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure? It's not really private, since it's used in frame.py, and @jreback already asked me to remove the underscore when moving it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe wait to hear from @jreback, but .core.* is supposed to be private, even though people still use methods from there. We plan to deprecate it, but until then it's probably best to prefix methods with underscores.

"""
Convert an index, given directly or as a pair (level, codes), to a 1D array.

Parameters
----------
index : Index
codes : sequence of integers (optional)

Returns
-------
ExtensionArray or ndarray
If codes is `None`, the values of `index`.
If codes is passed, an array obtained by taking from `index` the indices
contained in `codes`.
"""
values = index._values
if values.dtype == np.object_:
values = lib.maybe_convert_objects(values)

# if we have the labels, extract the values with a mask
if codes is not None:
if isinstance(values, np.ndarray):
mask = codes == -1
# we can have situations where the whole mask is -1,
# meaning there is nothing found in labels, so make all nan's
if mask.all():
values = np.empty(len(mask), dtype=values.dtype)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to get here with an extension dtype? Like pd.date_range('2000', periods=4, tz="CET")? If so, I suspect that will fail.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't have time to really get into it, but after a quick look I would say that it's possible. I'll have to do something about it when I come back.

values.fill(np.nan)
else:
values = values.take(codes)
if mask.any():
values, _ = maybe_upcast_putmask(values, mask, np.nan)
else:
values = values.take(codes, allow_fill=True)
return values
45 changes: 2 additions & 43 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@
infer_dtype_from_scalar,
invalidate_string_dtypes,
maybe_cast_to_datetime,
maybe_casted_values,
maybe_convert_platform,
maybe_downcast_to_dtype,
maybe_infer_to_datetimelike,
maybe_upcast,
maybe_upcast_putmask,
)
from pandas.core.dtypes.common import (
ensure_float64,
Expand Down Expand Up @@ -83,7 +83,6 @@
from pandas.core import algorithms, common as com, nanops, ops
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import (
Expand All @@ -93,9 +92,7 @@
ensure_index_from_sequences,
)
from pandas.core.indexes import base as ibase
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.multi import maybe_droplevels
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
from pandas.core.internals.construction import (
Expand Down Expand Up @@ -4550,43 +4547,6 @@ class max type
else:
new_obj = self.copy()

def _maybe_casted_values(index, labels=None):
# Return the values of `index` as an array. When `labels` is given, the
# values are taken at those positions and np.nan is put (through
# maybe_upcast_putmask) wherever the label is -1.
values = index._values
# Period/Datetime indexes skip the object-dtype inference below
if not isinstance(index, (PeriodIndex, DatetimeIndex)):
if values.dtype == np.object_:
values = lib.maybe_convert_objects(values)

# if we have the labels, extract the values with a mask
if labels is not None:
# True at every position whose label is the -1 sentinel
mask = labels == -1

# every label can be -1, meaning no position maps to a value of
# `index`, so the result is made all NaN
# NOTE(review): np.empty is called without a dtype, so this path returns
# float64 regardless of the dtype of `index`
# NOTE(review): assuming `labels` is an ndarray, mask.all() is also True
# when `labels` is empty -- confirm
if mask.all():
values = np.empty(len(mask))
values.fill(np.nan)
else:
values = values.take(labels)

# TODO(https://github.com/pandas-dev/pandas/issues/24206)
# Push this into maybe_upcast_putmask?
# We can't pass EAs there right now. Looks a bit
# complicated.
# So we unbox the ndarray_values, op, re-box.
# remember the array type and dtype so the result can be re-boxed below
values_type = type(values)
values_dtype = values.dtype

if issubclass(values_type, DatetimeLikeArray):
values = values._data

# only touch the values when at least one label is missing
if mask.any():
values, changed = maybe_upcast_putmask(values, mask, np.nan)

if issubclass(values_type, DatetimeLikeArray):
values = values_type(values, dtype=values_dtype)

return values

new_index = ibase.default_index(len(new_obj))
if level is not None:
if not isinstance(level, (tuple, list)):
Expand Down Expand Up @@ -4628,8 +4588,7 @@ def _maybe_casted_values(index, labels=None):
missing = self.columns.nlevels - len(name_lst)
name_lst += [col_fill] * missing
name = tuple(name_lst)
# to ndarray and maybe infer different dtype
level_values = _maybe_casted_values(lev, lab)
level_values = maybe_casted_values(lev, lab)
new_obj.insert(0, name, level_values)

new_obj.index = new_index
Expand Down
64 changes: 37 additions & 27 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
DatetimeIndex,
Index,
Expand Down Expand Up @@ -1157,34 +1158,43 @@ def test_reset_index_multiindex_col(self):
)
tm.assert_frame_equal(rs, xp)

def test_reset_index_multiindex_nan(self):
# GH6322, testing reset_index on MultiIndexes
# when we have a nan or all nan
df = DataFrame(
{"A": ["a", "b", "c"], "B": [0, 1, np.nan], "C": np.random.rand(3)}
)
rs = df.set_index(["A", "B"]).reset_index()
tm.assert_frame_equal(rs, df)

df = DataFrame(
{"A": [np.nan, "b", "c"], "B": [0, 1, 2], "C": np.random.rand(3)}
)
rs = df.set_index(["A", "B"]).reset_index()
tm.assert_frame_equal(rs, df)

df = DataFrame({"A": ["a", "b", "c"], "B": [0, 1, 2], "C": [np.nan, 1.1, 2.2]})
rs = df.set_index(["A", "B"]).reset_index()
tm.assert_frame_equal(rs, df)
@pytest.mark.parametrize(
"columns",
[
[["a", "b", "c"], [0, 1, np.nan], np.random.rand(3)],
[[np.nan, "b", "c"], [0, 1, 2], np.random.rand(3)],
[["a", "b", "c"], [0, 1, 2], [np.nan, 1.1, 2.2]],
[["a", "b", "c"], [np.nan, np.nan, np.nan], np.random.rand(3)],
[
DatetimeIndex([np.nan, np.nan]),
DatetimeIndex(["2015-01-01 11:00:00", np.nan]),
np.random.rand(2),
],
[
CategoricalIndex(["A", "A", "B", "B"]),
CategoricalIndex(["a", "b", "a", np.nan]),
np.random.rand(4),
],
[DatetimeIndex([]), [], []],
],
)
def test_reset_index_multiindex_nan(self, columns):
# GH6322, GH19602, GH24206: testing reset_index on MultiIndex
# with some nans or all nans
column_names = ["A", "B", "C"]
columns = dict(zip(column_names, columns))
df = DataFrame(columns, columns=column_names)
result = df.set_index(column_names[:2]).reset_index()
tm.assert_frame_equal(df, result)

df = DataFrame(
{
"A": ["a", "b", "c"],
"B": [np.nan, np.nan, np.nan],
"C": np.random.rand(3),
}
)
rs = df.set_index(["A", "B"]).reset_index()
tm.assert_frame_equal(rs, df)
def test_reset_index_multiindex_empty(self):
# GH19602: preserve dtypes when resetting multiindex of
# empty dataframe
# both levels of the product index are built from Python ints
idx = MultiIndex.from_product([[0, 1], [1, 2]])
# slice off every row so the frame has no data left
empty_df = DataFrame(index=idx)[:0]
types = empty_df.reset_index().dtypes
# the first two entries of `dtypes` are expected to be int64
# (presumably the two former index levels -- confirm)
assert types[0] == np.int64
assert types[1] == np.int64

def test_reset_index_with_datetimeindex_cols(self):
# GH5818
Expand Down