def maybe_casted_values(index, codes=None):
    """
    Convert an index, given directly or as a pair (level, codes), to a 1D array.

    Parameters
    ----------
    index : Index
        The index, or the level of a MultiIndex, that provides the values.
    codes : sequence of integers, optional
        Positions to take from `index`. A code of ``-1`` marks a missing
        entry.

    Returns
    -------
    ExtensionArray or ndarray
        If codes is `None`, the values of `index`.
        If codes is passed, an array obtained by taking from `index` the indices
        contained in `codes`. For ndarray-backed values, entries whose code is
        ``-1`` are set to NaN (upcasting the dtype when needed); for other
        values, ``take(codes, allow_fill=True)`` is used.

    Notes
    -----
    An empty `codes` yields an empty array with the dtype of `index` unchanged.
    When every code is ``-1``, integer and boolean levels produce a float64
    array of NaN, because those dtypes cannot hold NaN.
    """
    values = index._values
    if values.dtype == np.object_:
        values = lib.maybe_convert_objects(values)

    if codes is None:
        return values

    # Accept any integer sequence (list, tuple, ndarray), as documented.
    codes = np.asarray(codes, dtype=np.intp)

    if not isinstance(values, np.ndarray):
        # Extension arrays handle the -1 codes themselves.
        return values.take(codes, allow_fill=True)

    mask = codes == -1

    # ``mask.all()`` is also True for an empty mask; the size check keeps the
    # empty case on the plain ``take`` path so that its dtype is preserved.
    if mask.size > 0 and mask.all():
        # Nothing can be taken from the level (it may even be empty), so
        # build an all-NaN result directly.
        dtype = values.dtype
        if dtype.kind in "iub":
            # Integer and boolean dtypes cannot represent NaN.
            dtype = np.dtype(np.float64)
        values = np.empty(len(mask), dtype=dtype)
        values.fill(np.nan)
        return values

    values = values.take(codes)
    if mask.any():
        values, _ = maybe_upcast_putmask(values, mask, np.nan)
    return values
self.copy() - def _maybe_casted_values(index, labels=None): - values = index._values - if not isinstance(index, (PeriodIndex, DatetimeIndex)): - if values.dtype == np.object_: - values = lib.maybe_convert_objects(values) - - # if we have the labels, extract the values with a mask - if labels is not None: - mask = labels == -1 - - # we can have situations where the whole mask is -1, - # meaning there is nothing found in labels, so make all nan's - if mask.all(): - values = np.empty(len(mask)) - values.fill(np.nan) - else: - values = values.take(labels) - - # TODO(https://github.com/pandas-dev/pandas/issues/24206) - # Push this into maybe_upcast_putmask? - # We can't pass EAs there right now. Looks a bit - # complicated. - # So we unbox the ndarray_values, op, re-box. - values_type = type(values) - values_dtype = values.dtype - - if issubclass(values_type, DatetimeLikeArray): - values = values._data - - if mask.any(): - values, changed = maybe_upcast_putmask(values, mask, np.nan) - - if issubclass(values_type, DatetimeLikeArray): - values = values_type(values, dtype=values_dtype) - - return values - new_index = ibase.default_index(len(new_obj)) if level is not None: if not isinstance(level, (tuple, list)): @@ -4628,8 +4588,7 @@ def _maybe_casted_values(index, labels=None): missing = self.columns.nlevels - len(name_lst) name_lst += [col_fill] * missing name = tuple(name_lst) - # to ndarray and maybe infer different dtype - level_values = _maybe_casted_values(lev, lab) + level_values = maybe_casted_values(lev, lab) new_obj.insert(0, name, level_values) new_obj.index = new_index diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 00b59fd4dc087..7d2be2357feca 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -12,6 +12,7 @@ from pandas import ( Categorical, + CategoricalIndex, DataFrame, DatetimeIndex, Index, @@ -1157,34 +1158,43 @@ def test_reset_index_multiindex_col(self): ) 
@pytest.mark.parametrize(
    "columns",
    [
        [["a", "b", "c"], [0, 1, np.nan], np.random.rand(3)],
        [[np.nan, "b", "c"], [0, 1, 2], np.random.rand(3)],
        [["a", "b", "c"], [0, 1, 2], [np.nan, 1.1, 2.2]],
        [["a", "b", "c"], [np.nan, np.nan, np.nan], np.random.rand(3)],
        [
            DatetimeIndex([np.nan, np.nan]),
            DatetimeIndex(["2015-01-01 11:00:00", np.nan]),
            np.random.rand(2),
        ],
        [
            CategoricalIndex(["A", "A", "B", "B"]),
            CategoricalIndex(["a", "b", "a", np.nan]),
            np.random.rand(4),
        ],
        [DatetimeIndex([]), [], []],
    ],
)
def test_reset_index_multiindex_nan(self, columns):
    # GH6322, GH19602, GH24206: set_index followed by reset_index must
    # round-trip a frame whose MultiIndex levels contain some or only nans
    names = ["A", "B", "C"]
    data = {name: values for name, values in zip(names, columns)}
    expected = DataFrame(data, columns=names)

    index_cols = names[:2]
    result = expected.set_index(index_cols).reset_index()

    tm.assert_frame_equal(expected, result)


def test_reset_index_multiindex_empty(self):
    # GH19602: the level dtypes must survive reset_index on an empty frame
    mi = MultiIndex.from_product([[0, 1], [1, 2]])
    no_rows = DataFrame(index=mi)[:0]

    dtypes = no_rows.reset_index().dtypes

    for position in (0, 1):
        assert dtypes[position] == np.int64