Skip to content

BUG: fix reindexing MultiIndex with categorical datetime-like level #21657

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ Fixed Regressions

- Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
- Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`)
-
- Fixed regression in :meth:`~DataFrame.reindex` and :meth:`~DataFrame.groupby`
with a MultiIndex or multiple keys that contains categorical datetime-like values (:issue:`21390`).

.. _whatsnew_0232.performance:

Expand Down
10 changes: 8 additions & 2 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,11 +819,17 @@ def values(self):
return self._tuples

values = []
for lev, lab in zip(self.levels, self.labels):
for i, (lev, lab) in enumerate(zip(self.levels, self.labels)):
# Need to box timestamps, etc.
box = hasattr(lev, '_box_values')
if is_categorical_dtype(lev):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need and len(lev) > len(lab) here? I'm not sure what it's for, but the other condition had it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. Below it is too switch between "first boxing, then take" and "first take, then boxing", as far as I understand "Try to minimize boxing." for performance reasons? (both should be the same).
Anyhow, I don't think it is relevant here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a gigantic hack, I would really not do this. Simply push to 0.23.3

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have ideas where to look for a better solution?

# TODO GH-21390
taken = self.get_level_values(i)._values.get_values()
if not isinstance(taken, np.ndarray):
# Datetime/Period index
taken = np.array(taken.astype(object))
# Try to minimize boxing.
if box and len(lev) > len(lab):
elif box and len(lev) > len(lab):
taken = lev._box_values(algos.take_1d(lev._ndarray_values,
lab))
elif box:
Expand Down
15 changes: 14 additions & 1 deletion pandas/tests/frame/test_axis_select_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import numpy as np

from pandas.compat import lrange, lzip, u
from pandas import (compat, DataFrame, Series, Index, MultiIndex,
from pandas import (compat, DataFrame, Series, Index, MultiIndex, Categorical,
date_range, isna)
import pandas as pd

Expand Down Expand Up @@ -1129,6 +1129,19 @@ def test_reindex_multi(self):

assert_frame_equal(result, expected)

def test_reindex_multi_categorical_time(self):
# https://github.com/pandas-dev/pandas/issues/21390
midx = pd.MultiIndex.from_product(
[Categorical(['a', 'b', 'c']),
Categorical(date_range("2012-01-01", periods=3, freq='H'))])
df = pd.DataFrame({'a': range(len(midx))}, index=midx)
df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]]

result = df2.reindex(midx)
expected = pd.DataFrame(
{'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx)
assert_frame_equal(result, expected)

data = [[1, 2, 3], [1, 2, 3]]

@pytest.mark.parametrize('actual', [
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -854,3 +854,23 @@ def test_empty_prod():
result = df.groupby("A", observed=False).B.prod(min_count=1)
expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
tm.assert_series_equal(result, expected)


def test_groupby_multiindex_categorical_datetime():
# https://github.com/pandas-dev/pandas/issues/21390

df = pd.DataFrame({
'key1': pd.Categorical(list('abcbabcba')),
'key2': pd.Categorical(
list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3),
'values': np.arange(9),
})
result = df.groupby(['key1', 'key2']).mean()

idx = pd.MultiIndex.from_product(
[pd.Categorical(['a', 'b', 'c']),
pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))],
names=['key1', 'key2'])
expected = pd.DataFrame(
{'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
assert_frame_equal(result, expected)
12 changes: 10 additions & 2 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

import pandas as pd

from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex,
compat, date_range, period_range)
from pandas import (CategoricalIndex, Categorical, DataFrame, Index,
MultiIndex, compat, date_range, period_range)
from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY
from pandas.errors import PerformanceWarning, UnsortedIndexError
from pandas.core.dtypes.dtypes import CategoricalDtype
Expand Down Expand Up @@ -1591,6 +1591,14 @@ def test_get_indexer_nearest(self):
with pytest.raises(NotImplementedError):
midx.get_indexer(['a'], method='pad', tolerance=2)

def test_get_indexer_categorical_time(self):
# https://github.com/pandas-dev/pandas/issues/21390
midx = MultiIndex.from_product(
[Categorical(['a', 'b', 'c']),
Categorical(date_range("2012-01-01", periods=3, freq='H'))])
result = midx.get_indexer(midx)
tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp))

def test_hash_collisions(self):
# non-smoke test that we don't get hash collisions

Expand Down