Skip to content

Commit 7222318

Browse files
authored
BUG: Fix reindexing with multi-indexed DataFrames (#30766)
1 parent 717d805 commit 7222318

File tree

6 files changed

+468
-17
lines changed

6 files changed

+468
-17
lines changed

asv_bench/benchmarks/multiindex_object.py

+28
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,38 @@ def setup(self):
7474
],
7575
dtype=object,
7676
)
77+
self.other_mi_many_mismatches = MultiIndex.from_tuples(
78+
[
79+
(-7, 41),
80+
(-2, 3),
81+
(-0.7, 5),
82+
(0, 0),
83+
(0, 1.5),
84+
(0, 340),
85+
(0, 1001),
86+
(1, -4),
87+
(1, 20),
88+
(1, 1040),
89+
(432, -5),
90+
(432, 17),
91+
(439, 165.5),
92+
(998, -4),
93+
(998, 24065),
94+
(999, 865.2),
95+
(999, 1000),
96+
(1045, -843),
97+
]
98+
)
7799

78100
def time_get_indexer(self):
79101
self.mi_int.get_indexer(self.obj_index)
80102

103+
def time_get_indexer_and_backfill(self):
104+
self.mi_int.get_indexer(self.other_mi_many_mismatches, method="backfill")
105+
106+
def time_get_indexer_and_pad(self):
107+
self.mi_int.get_indexer(self.other_mi_many_mismatches, method="pad")
108+
81109
def time_is_monotonic(self):
82110
self.mi_int.is_monotonic
83111

doc/source/whatsnew/v1.1.0.rst

+61
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,67 @@ Backwards incompatible API changes
121121
Previously an ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`)
122122
- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
123123
- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`)
124+
125+
``MultiIndex.get_indexer`` interprets ``method`` argument differently
126+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
127+
128+
This restores the behavior of :meth:`MultiIndex.get_indexer` with ``method='backfill'`` or ``method='pad'`` to what it was before pandas 0.23.0. In particular, MultiIndexes are treated as a list of tuples, and padding or backfilling is done with respect to the ordering of these lists of tuples (:issue:`29896`).
129+
130+
As an example of this, given:
131+
132+
.. ipython:: python
133+
134+
df = pd.DataFrame({
135+
'a': [0, 0, 0, 0],
136+
'b': [0, 2, 3, 4],
137+
'c': ['A', 'B', 'C', 'D'],
138+
}).set_index(['a', 'b'])
139+
mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]])
140+
141+
The differences in reindexing ``df`` with ``mi_2`` and using ``method='backfill'`` can be seen here:
142+
143+
*pandas >= 0.23, < 1.1.0*
144+
145+
.. code-block:: ipython
146+
147+
In [1]: df.reindex(mi_2, method='backfill')
148+
Out[1]:
149+
c
150+
0 -1 A
151+
0 A
152+
1 D
153+
3 A
154+
4 A
155+
5 C
156+
157+
*pandas < 0.23 or >= 1.1.0*
158+
159+
.. ipython:: python
160+
161+
df.reindex(mi_2, method='backfill')
162+
163+
And the differences in reindexing ``df`` with ``mi_2`` and using ``method='pad'`` can be seen here:
164+
165+
*pandas >= 0.23, < 1.1.0*
166+
167+
.. code-block:: ipython
168+
169+
In [1]: df.reindex(mi_2, method='pad')
170+
Out[1]:
171+
c
172+
0 -1 NaN
173+
0 NaN
174+
1 D
175+
3 NaN
176+
4 A
177+
5 C
178+
179+
*pandas < 0.23 or >= 1.1.0*
180+
181+
.. ipython:: python
182+
183+
df.reindex(mi_2, method='pad')
184+
124185
-
125186

126187
.. _whatsnew_110.api_breaking.indexing_raises_key_errors:

pandas/_libs/index.pyx

+104-16
Original file line numberDiff line numberDiff line change
@@ -612,25 +612,113 @@ cdef class BaseMultiIndexCodesEngine:
612612
in zip(self.levels, zip(*target))]
613613
return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
614614

615-
def get_indexer(self, object target, object method=None,
616-
object limit=None):
615+
def get_indexer_no_fill(self, object target) -> np.ndarray:
616+
"""
617+
Returns an array giving the positions of each value of `target` in
618+
`self.values`, where -1 represents a value in `target` which does not
619+
appear in `self.values`
620+
621+
Parameters
622+
----------
623+
target : list-like of keys
624+
Each key is a tuple, with a label for each level of the index
625+
626+
Returns
627+
-------
628+
np.ndarray[int64_t, ndim=1] of the indexer of `target` into
629+
`self.values`
630+
"""
617631
lab_ints = self._extract_level_codes(target)
632+
return self._base.get_indexer(self, lab_ints)
618633

619-
# All methods (exact, backfill, pad) directly map to the respective
620-
# methods of the underlying (integers) index...
621-
if method is not None:
622-
# but underlying backfill and pad methods require index and keys
623-
# to be sorted. The index already is (checked in
624-
# Index._get_fill_indexer), sort (integer representations of) keys:
625-
order = np.argsort(lab_ints)
626-
lab_ints = lab_ints[order]
627-
indexer = (getattr(self._base, f'get_{method}_indexer')
628-
(self, lab_ints, limit=limit))
629-
indexer = indexer[order]
630-
else:
631-
indexer = self._base.get_indexer(self, lab_ints)
634+
def get_indexer(self, object target, object values = None,
635+
object method = None, object limit = None) -> np.ndarray:
636+
"""
637+
Returns an array giving the positions of each value of `target` in
638+
`values`, where -1 represents a value in `target` which does not
639+
appear in `values`
632640

633-
return indexer
641+
If `method` is "backfill" then the position for a value in `target`
642+
which does not appear in `values` is that of the next greater value
643+
in `values` (if one exists), and -1 if there is no such value.
644+
645+
Similarly, if the method is "pad" then the position for a value in
646+
`target` which does not appear in `values` is that of the next smaller
647+
value in `values` (if one exists), and -1 if there is no such value.
648+
649+
Parameters
650+
----------
651+
target : list-like of tuples
652+
need not be sorted, but all must have the same length, which must be
653+
the same as the length of all tuples in `values`
654+
values : list-like of tuples
655+
must be sorted and all have the same length. Should be the set of
656+
the MultiIndex's values. Needed only if `method` is not None
657+
method : str, optional
658+
"backfill" or "pad"
659+
limit : int, optional
660+
if provided, limit the number of fills to this value
661+
662+
Returns
663+
-------
664+
np.ndarray[int64_t, ndim=1] of the indexer of `target` into `values`,
665+
filled with the `method` (and optionally `limit`) specified
666+
"""
667+
if method is None:
668+
return self.get_indexer_no_fill(target)
669+
670+
assert method in ("backfill", "pad")
671+
cdef:
672+
int64_t i, j, next_code
673+
int64_t num_values, num_target_values
674+
ndarray[int64_t, ndim=1] target_order
675+
ndarray[object, ndim=1] target_values
676+
ndarray[int64_t, ndim=1] new_codes, new_target_codes
677+
ndarray[int64_t, ndim=1] sorted_indexer
678+
679+
target_order = np.argsort(target.values).astype('int64')
680+
target_values = target.values[target_order]
681+
num_values, num_target_values = len(values), len(target_values)
682+
new_codes, new_target_codes = (
683+
np.empty((num_values,)).astype('int64'),
684+
np.empty((num_target_values,)).astype('int64'),
685+
)
686+
687+
# `values` and `target_values` are both sorted, so we walk through them
688+
# and memoize the (ordered) set of indices in the (implicit) merged-and
689+
# sorted list of the two which belong to each of them
690+
# the effect of this is to create a factorization for the (sorted)
691+
# merger of the index values, where `new_codes` and `new_target_codes`
692+
# are the subset of the factors which appear in `values` and `target`,
693+
# respectively
694+
i, j, next_code = 0, 0, 0
695+
while i < num_values and j < num_target_values:
696+
val, target_val = values[i], target_values[j]
697+
if val <= target_val:
698+
new_codes[i] = next_code
699+
i += 1
700+
if target_val <= val:
701+
new_target_codes[j] = next_code
702+
j += 1
703+
next_code += 1
704+
705+
# at this point, at least one should have reached the end
706+
# the remaining values of the other should be added to the end
707+
assert i == num_values or j == num_target_values
708+
while i < num_values:
709+
new_codes[i] = next_code
710+
i += 1
711+
next_code += 1
712+
while j < num_target_values:
713+
new_target_codes[j] = next_code
714+
j += 1
715+
next_code += 1
716+
717+
# get the indexer, and undo the sorting of `target.values`
718+
sorted_indexer = (
719+
algos.backfill if method == "backfill" else algos.pad
720+
)(new_codes, new_target_codes, limit=limit).astype('int64')
721+
return sorted_indexer[np.argsort(target_order)]
634722

635723
def get_loc(self, object key):
636724
if is_definitely_invalid_key(key):

pandas/core/indexes/multi.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -2455,7 +2455,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
24552455
raise NotImplementedError(
24562456
"tolerance not implemented yet for MultiIndex"
24572457
)
2458-
indexer = self._engine.get_indexer(target, method, limit)
2458+
indexer = self._engine.get_indexer(
2459+
values=self.values, target=target, method=method, limit=limit
2460+
)
24592461
elif method == "nearest":
24602462
raise NotImplementedError(
24612463
"method='nearest' not implemented yet "

pandas/tests/frame/indexing/test_indexing.py

+75
Original file line numberDiff line numberDiff line change
@@ -1432,6 +1432,81 @@ def test_set_value_resize(self, float_frame):
14321432
with pytest.raises(ValueError, match=msg):
14331433
res._set_value("foobar", "baz", "sam")
14341434

1435+
def test_reindex_with_multi_index(self):
1436+
# https://github.com/pandas-dev/pandas/issues/29896
1437+
# tests for reindexing a multi-indexed DataFrame with a new MultiIndex
1438+
#
1439+
# confirms that we can reindex a multi-indexed DataFrame with a new
1440+
# MultiIndex object correctly when using no filling, backfilling, and
1441+
# padding
1442+
#
1443+
# The DataFrame, `df`, used in this test is:
1444+
# c
1445+
# a b
1446+
# -1 0 A
1447+
# 1 B
1448+
# 2 C
1449+
# 3 D
1450+
# 4 E
1451+
# 5 F
1452+
# 6 G
1453+
# 0 0 A
1454+
# 1 B
1455+
# 2 C
1456+
# 3 D
1457+
# 4 E
1458+
# 5 F
1459+
# 6 G
1460+
# 1 0 A
1461+
# 1 B
1462+
# 2 C
1463+
# 3 D
1464+
# 4 E
1465+
# 5 F
1466+
# 6 G
1467+
#
1468+
# and the other MultiIndex, `new_multi_index`, is:
1469+
# 0: 0 0.5
1470+
# 1: 2.0
1471+
# 2: 5.0
1472+
# 3: 5.8
1473+
df = pd.DataFrame(
1474+
{
1475+
"a": [-1] * 7 + [0] * 7 + [1] * 7,
1476+
"b": list(range(7)) * 3,
1477+
"c": ["A", "B", "C", "D", "E", "F", "G"] * 3,
1478+
}
1479+
).set_index(["a", "b"])
1480+
new_index = [0.5, 2.0, 5.0, 5.8]
1481+
new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"])
1482+
1483+
# reindexing w/o a `method` value
1484+
reindexed = df.reindex(new_multi_index)
1485+
expected = pd.DataFrame(
1486+
{"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]}
1487+
).set_index(["a", "b"])
1488+
tm.assert_frame_equal(expected, reindexed)
1489+
1490+
# reindexing with backfilling
1491+
expected = pd.DataFrame(
1492+
{"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]}
1493+
).set_index(["a", "b"])
1494+
reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill")
1495+
tm.assert_frame_equal(expected, reindexed_with_backfilling)
1496+
1497+
reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill")
1498+
tm.assert_frame_equal(expected, reindexed_with_backfilling)
1499+
1500+
# reindexing with padding
1501+
expected = pd.DataFrame(
1502+
{"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]}
1503+
).set_index(["a", "b"])
1504+
reindexed_with_padding = df.reindex(new_multi_index, method="pad")
1505+
tm.assert_frame_equal(expected, reindexed_with_padding)
1506+
1507+
reindexed_with_padding = df.reindex(new_multi_index, method="ffill")
1508+
tm.assert_frame_equal(expected, reindexed_with_padding)
1509+
14351510
def test_set_value_with_index_dtype_change(self):
14361511
df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC"))
14371512

0 commit comments

Comments
 (0)