Skip to content

BUG: fix reindexing MultiIndex with categorical datetime-like level #21657

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ Fixed Regressions
- Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
- Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`).
- Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`)
- Fixed regression in :meth:`~DataFrame.reindex` and :meth:`~DataFrame.groupby`
with a MultiIndex or multiple keys that contains categorical datetime-like values (:issue:`21390`).
- Fixed regression in unary negative operations with object dtype (:issue:`21380`)
- Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`)

Expand Down
26 changes: 12 additions & 14 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from pandas.compat.numpy import function as nv
from pandas import compat

from pandas.core.dtypes.dtypes import (
ExtensionDtype, PandasExtensionDtype)
from pandas.core.dtypes.common import (
_ensure_int64,
_ensure_platform_int,
Expand Down Expand Up @@ -807,20 +809,16 @@ def values(self):
return self._tuples

values = []
for lev, lab in zip(self.levels, self.labels):
# Need to box timestamps, etc.
box = hasattr(lev, '_box_values')
# Try to minimize boxing.
if box and len(lev) > len(lab):
taken = lev._box_values(algos.take_1d(lev._ndarray_values,
lab))
elif box:
taken = algos.take_1d(lev._box_values(lev._ndarray_values),
lab,
fill_value=lev._na_value)
else:
taken = algos.take_1d(np.asarray(lev._values), lab)
values.append(taken)

for i in range(self.nlevels):
vals = self._get_level_values(i)
if is_categorical_dtype(vals):
vals = vals.get_values()
if (isinstance(vals.dtype, (PandasExtensionDtype, ExtensionDtype))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

elif here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I need the 'if' here because the result of categorical.get_values() can still be an Index with extension dtype / datetime dtype.

I would like to explore a bit more how to streamline the path from series/index/array object -> numpy array that is boxed if needed (currenlty that doesn't seem to easy, and is handled again in many different places), but that is for another PR

or hasattr(vals, '_box_values')):
vals = vals.astype(object)
vals = np.array(vals, copy=False)
values.append(vals)

self._tuples = lib.fast_zip(values)
return self._tuples
Expand Down
15 changes: 14 additions & 1 deletion pandas/tests/frame/test_axis_select_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import numpy as np

from pandas.compat import lrange, lzip, u
from pandas import (compat, DataFrame, Series, Index, MultiIndex,
from pandas import (compat, DataFrame, Series, Index, MultiIndex, Categorical,
date_range, isna)
import pandas as pd

Expand Down Expand Up @@ -1129,6 +1129,19 @@ def test_reindex_multi(self):

assert_frame_equal(result, expected)

def test_reindex_multi_categorical_time(self):
# https://github.com/pandas-dev/pandas/issues/21390
midx = pd.MultiIndex.from_product(
[Categorical(['a', 'b', 'c']),
Categorical(date_range("2012-01-01", periods=3, freq='H'))])
df = pd.DataFrame({'a': range(len(midx))}, index=midx)
df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]]

result = df2.reindex(midx)
expected = pd.DataFrame(
{'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx)
assert_frame_equal(result, expected)

data = [[1, 2, 3], [1, 2, 3]]

@pytest.mark.parametrize('actual', [
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -850,3 +850,23 @@ def test_empty_prod():
result = df.groupby("A", observed=False).B.prod(min_count=1)
expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
tm.assert_series_equal(result, expected)


def test_groupby_multiindex_categorical_datetime():
# https://github.com/pandas-dev/pandas/issues/21390

df = pd.DataFrame({
'key1': pd.Categorical(list('abcbabcba')),
'key2': pd.Categorical(
list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3),
'values': np.arange(9),
})
result = df.groupby(['key1', 'key2']).mean()

idx = pd.MultiIndex.from_product(
[pd.Categorical(['a', 'b', 'c']),
pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))],
names=['key1', 'key2'])
expected = pd.DataFrame(
{'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
assert_frame_equal(result, expected)
12 changes: 10 additions & 2 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

import pandas as pd

from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex,
compat, date_range, period_range)
from pandas import (CategoricalIndex, Categorical, DataFrame, Index,
MultiIndex, compat, date_range, period_range)
from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY
from pandas.errors import PerformanceWarning, UnsortedIndexError
from pandas.core.dtypes.dtypes import CategoricalDtype
Expand Down Expand Up @@ -1596,6 +1596,14 @@ def test_get_indexer_nearest(self):
with pytest.raises(NotImplementedError):
midx.get_indexer(['a'], method='pad', tolerance=2)

def test_get_indexer_categorical_time(self):
# https://github.com/pandas-dev/pandas/issues/21390
midx = MultiIndex.from_product(
[Categorical(['a', 'b', 'c']),
Categorical(date_range("2012-01-01", periods=3, freq='H'))])
result = midx.get_indexer(midx)
tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp))

def test_hash_collisions(self):
# non-smoke test that we don't get hash collisions

Expand Down