Skip to content

Commit 5d7c05f

Browse files
BUG: Fix aligning a DataFrame with a Series with MultiIndex
When aligning a DataFrame to a Series, we use Series.reindex() to broadcast the Series data to the new index. That introduces NaNs whenever the rows of the new index are not identical to the existing ones, which is exactly what happens when a new MultiIndex level is introduced. In this patch we use the same approach as for aligning a Series to another Series. Because DataFrame does not have a _reindex_indexer method, part of Series._reindex_indexer has to be replicated for the DataFrame case.
1 parent 8e0baa2 commit 5d7c05f

File tree

4 files changed

+190
-17
lines changed

4 files changed

+190
-17
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,7 @@ Reshaping
437437
^^^^^^^^^
438438
- Bug in :func:`concat` between a :class:`Series` with integer dtype and another with :class:`CategoricalDtype` with integer categories and containing ``NaN`` values casting to object dtype instead of ``float64`` (:issue:`45359`)
439439
- Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`)
440+
- Bug in :meth:`DataFrame.align` when aligning a :class:`DataFrame` with a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`)
440441
-
441442

442443
Sparse

pandas/core/generic.py

+38-17
Original file line numberDiff line numberDiff line change
@@ -8975,10 +8975,14 @@ def _align_series(
89758975

89768976
is_series = isinstance(self, ABCSeries)
89778977

8978+
if (not is_series and axis is None) or axis not in [None, 0, 1]:
8979+
raise ValueError("Must specify axis=0 or 1")
8980+
8981+
if is_series and axis == 1:
8982+
raise ValueError("cannot align series to a series other than axis 0")
8983+
89788984
# series/series compat, other must always be a Series
8979-
if is_series:
8980-
if axis:
8981-
raise ValueError("cannot align series to a series other than axis 0")
8985+
if not axis:
89828986

89838987
# equal
89848988
if self.index.equals(other.index):
@@ -8988,26 +8992,43 @@ def _align_series(
89888992
other.index, how=join, level=level, return_indexers=True
89898993
)
89908994

8991-
left = self._reindex_indexer(join_index, lidx, copy)
8995+
if is_series:
8996+
left = self._reindex_indexer(join_index, lidx, copy)
8997+
elif join_index is None:
8998+
left = self.copy() if copy else self
8999+
else:
9000+
data = np.array(
9001+
[
9002+
algos.take_nd(
9003+
values,
9004+
lidx,
9005+
allow_fill=True,
9006+
fill_value=None,
9007+
)
9008+
for values in self._values.T
9009+
]
9010+
).T.tolist()
9011+
9012+
left = self._constructor(
9013+
data=data, columns=self.columns, index=join_index
9014+
)
9015+
89929016
right = other._reindex_indexer(join_index, ridx, copy)
89939017

89949018
else:
9019+
89959020
# one has > 1 ndim
89969021
fdata = self._mgr
8997-
if axis in [0, 1]:
8998-
join_index = self.axes[axis]
8999-
lidx, ridx = None, None
9000-
if not join_index.equals(other.index):
9001-
join_index, lidx, ridx = join_index.join(
9002-
other.index, how=join, level=level, return_indexers=True
9003-
)
9004-
9005-
if lidx is not None:
9006-
bm_axis = self._get_block_manager_axis(axis)
9007-
fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
9022+
join_index = self.axes[1]
9023+
lidx, ridx = None, None
9024+
if not join_index.equals(other.index):
9025+
join_index, lidx, ridx = join_index.join(
9026+
other.index, how=join, level=level, return_indexers=True
9027+
)
90089028

9009-
else:
9010-
raise ValueError("Must specify axis=0 or 1")
9029+
if lidx is not None:
9030+
bm_axis = self._get_block_manager_axis(1)
9031+
fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
90119032

90129033
if copy and fdata is self._mgr:
90139034
fdata = fdata.copy()

pandas/tests/frame/methods/test_align.py

+41
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,47 @@ def test_align_series_combinations(self):
243243
tm.assert_series_equal(res1, exp2)
244244
tm.assert_frame_equal(res2, exp1)
245245

246+
def test_multiindex_align_to_series_with_common_index_level(self):
247+
# GH-46001
248+
foo_index = Index([1, 2, 3], name="foo")
249+
bar_index = Index([1, 2], name="bar")
250+
251+
series = Series([1, 2], index=bar_index, name="foo_series")
252+
df = DataFrame(
253+
{"col": np.arange(6)},
254+
index=pd.MultiIndex.from_product([foo_index, bar_index]),
255+
)
256+
257+
expected = Series([1, 2] * 3, index=df.index, name="foo_series")
258+
_, result = df.align(series, axis=0)
259+
260+
tm.assert_series_equal(result, expected)
261+
262+
def test_multiindex_align_to_series_with_common_index_level_non_unique_cols(self):
263+
# GH-46001
264+
foo_index = Index([1, 2, 3], name="foo")
265+
bar_index = Index([1, 2], name="bar")
266+
267+
series = Series([1, 2], index=bar_index, name="foo_series")
268+
df = DataFrame(
269+
np.arange(18).reshape(6, 3),
270+
index=pd.MultiIndex.from_product([foo_index, bar_index]),
271+
)
272+
df.columns = ["cfoo", "cbar", "cfoo"]
273+
274+
expected = Series([1, 2] * 3, index=df.index, name="foo_series")
275+
result_left, result_right = df.align(series, axis=0)
276+
277+
tm.assert_series_equal(result_right, expected)
278+
tm.assert_index_equal(result_left.columns, df.columns)
279+
280+
def test_missing_axis_specification_exception(self):
281+
df = DataFrame(np.arange(50).reshape((10, 5)))
282+
series = Series(np.arange(5))
283+
284+
with pytest.raises(ValueError, match=r"axis=0 or 1"):
285+
df.align(series)
286+
246287
def _check_align(self, a, b, axis, fill_axis, how, method, limit=None):
247288
aa, ab = a.align(
248289
b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis

pandas/tests/frame/test_arithmetic.py

+110
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,116 @@ def test_broadcast_multiindex(self, level):
722722

723723
tm.assert_frame_equal(result, expected)
724724

725+
def test_frame_multiindex_operations(self):
726+
# GH 43321
727+
df = DataFrame(
728+
{2010: [1, 2, 3], 2020: [3, 4, 5]},
729+
index=MultiIndex.from_product(
730+
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
731+
),
732+
)
733+
734+
series = Series(
735+
[0.4],
736+
index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]),
737+
)
738+
739+
expected = DataFrame(
740+
{2010: [1.4, 2.4, 3.4], 2020: [3.4, 4.4, 5.4]},
741+
index=MultiIndex.from_product(
742+
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
743+
),
744+
)
745+
result = df.add(series, axis=0)
746+
747+
tm.assert_frame_equal(result, expected)
748+
749+
def test_frame_multiindex_operations_series_index_to_frame_index(self):
750+
# GH 43321
751+
df = DataFrame(
752+
{2010: [1], 2020: [3]},
753+
index=MultiIndex.from_product([["a"], ["b"]], names=["scen", "mod"]),
754+
)
755+
756+
series = Series(
757+
[10.0, 20.0, 30.0],
758+
index=MultiIndex.from_product(
759+
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
760+
),
761+
)
762+
763+
expected = DataFrame(
764+
{2010: [11.0, 21, 31.0], 2020: [13.0, 23.0, 33.0]},
765+
index=MultiIndex.from_product(
766+
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
767+
),
768+
)
769+
result = df.add(series, axis=0)
770+
771+
tm.assert_frame_equal(result, expected)
772+
773+
def test_frame_multiindex_operations_no_align(self):
774+
df = DataFrame(
775+
{2010: [1, 2, 3], 2020: [3, 4, 5]},
776+
index=MultiIndex.from_product(
777+
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
778+
),
779+
)
780+
781+
series = Series(
782+
[0.4],
783+
index=MultiIndex.from_product([["c"], ["a"]], names=["mod", "scen"]),
784+
)
785+
786+
expected = DataFrame(
787+
{2010: np.nan, 2020: np.nan},
788+
index=MultiIndex.from_tuples(
789+
[
790+
("a", "b", 0),
791+
("a", "b", 1),
792+
("a", "b", 2),
793+
("a", "c", np.nan),
794+
],
795+
names=["scen", "mod", "id"],
796+
),
797+
)
798+
result = df.add(series, axis=0)
799+
800+
tm.assert_frame_equal(result, expected)
801+
802+
def test_frame_multiindex_operations_part_align(self):
803+
df = DataFrame(
804+
{2010: [1, 2, 3], 2020: [3, 4, 5]},
805+
index=MultiIndex.from_tuples(
806+
[
807+
("a", "b", 0),
808+
("a", "b", 1),
809+
("a", "c", 2),
810+
],
811+
names=["scen", "mod", "id"],
812+
),
813+
)
814+
815+
series = Series(
816+
[0.4],
817+
index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]),
818+
)
819+
820+
expected = DataFrame(
821+
{2010: [1.4, 2.4, np.nan], 2020: [3.4, 4.4, np.nan]},
822+
index=MultiIndex.from_tuples(
823+
[
824+
("a", "b", 0),
825+
("a", "b", 1),
826+
("a", "c", 2),
827+
],
828+
names=["scen", "mod", "id"],
829+
),
830+
)
831+
result = df.add(series, axis=0)
832+
833+
tm.assert_frame_equal(result, expected)
834+
725835

726836
class TestFrameArithmetic:
727837
def test_td64_op_nat_casting(self):

0 commit comments

Comments
 (0)