Skip to content

Commit 51a287a

Browse files
BUG: Fix aligning a DataFrame with a Series with MultiIndex
When aligning a DataFrame to a Series, we have been using Series.reindex() to broadcast the Series data to the new index. That introduces NaNs whenever the new index rows are not identical to the existing ones, and they are never identical when the join introduces a new MultiIndex level. In this patch we instead use the same approach as for aligning a Series to another Series. This requires replicating part of Series._reindex_indexer, because DataFrame does not have that method.
1 parent e4162cd commit 51a287a

File tree

4 files changed

+191
-17
lines changed

4 files changed

+191
-17
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,7 @@ Reshaping
433433
^^^^^^^^^
434434
- Bug in :func:`concat` between a :class:`Series` with integer dtype and another with :class:`CategoricalDtype` with integer categories and containing ``NaN`` values casting to object dtype instead of ``float64`` (:issue:`45359`)
435435
- Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`)
436+
- Bug in :meth:`DataFrame.align` when aligning a :class:`DataFrame` with a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`)
436437
-
437438

438439
Sparse

pandas/core/generic.py

+39-17
Original file line numberDiff line numberDiff line change
@@ -8975,10 +8975,14 @@ def _align_series(
89758975

89768976
is_series = isinstance(self, ABCSeries)
89778977

8978+
if (not is_series and axis is None) or axis not in [None, 0, 1]:
8979+
raise ValueError("Must specify axis=0 or 1")
8980+
8981+
if is_series and axis == 1:
8982+
raise ValueError("cannot align series to a series other than axis 0")
8983+
89788984
# series/series compat, other must always be a Series
8979-
if is_series:
8980-
if axis:
8981-
raise ValueError("cannot align series to a series other than axis 0")
8985+
if not axis:
89828986

89838987
# equal
89848988
if self.index.equals(other.index):
@@ -8988,26 +8992,44 @@ def _align_series(
89888992
other.index, how=join, level=level, return_indexers=True
89898993
)
89908994

8991-
left = self._reindex_indexer(join_index, lidx, copy)
8995+
if is_series:
8996+
left = self._reindex_indexer(join_index, lidx, copy)
8997+
elif join_index is None:
8998+
left = self.copy() if copy else self
8999+
else:
9000+
data = np.array(
9001+
[
9002+
algos.take_nd(
9003+
values,
9004+
lidx,
9005+
allow_fill=True,
9006+
fill_value=None,
9007+
)
9008+
for values in self._values.T
9009+
]
9010+
).T
9011+
data = data if data.size > 0 else {}
9012+
9013+
left = self._constructor(
9014+
data=data, columns=self.columns, index=join_index
9015+
)
9016+
89929017
right = other._reindex_indexer(join_index, ridx, copy)
89939018

89949019
else:
9020+
89959021
# one has > 1 ndim
89969022
fdata = self._mgr
8997-
if axis in [0, 1]:
8998-
join_index = self.axes[axis]
8999-
lidx, ridx = None, None
9000-
if not join_index.equals(other.index):
9001-
join_index, lidx, ridx = join_index.join(
9002-
other.index, how=join, level=level, return_indexers=True
9003-
)
9004-
9005-
if lidx is not None:
9006-
bm_axis = self._get_block_manager_axis(axis)
9007-
fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
9023+
join_index = self.axes[1]
9024+
lidx, ridx = None, None
9025+
if not join_index.equals(other.index):
9026+
join_index, lidx, ridx = join_index.join(
9027+
other.index, how=join, level=level, return_indexers=True
9028+
)
90089029

9009-
else:
9010-
raise ValueError("Must specify axis=0 or 1")
9030+
if lidx is not None:
9031+
bm_axis = self._get_block_manager_axis(1)
9032+
fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
90119033

90129034
if copy and fdata is self._mgr:
90139035
fdata = fdata.copy()

pandas/tests/frame/methods/test_align.py

+41
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,47 @@ def test_align_series_combinations(self):
243243
tm.assert_series_equal(res1, exp2)
244244
tm.assert_frame_equal(res2, exp1)
245245

246+
def test_multiindex_align_to_series_with_common_index_level(self):
247+
# GH-46001
248+
foo_index = Index([1, 2, 3], name="foo")
249+
bar_index = Index([1, 2], name="bar")
250+
251+
series = Series([1, 2], index=bar_index, name="foo_series")
252+
df = DataFrame(
253+
{"col": np.arange(6)},
254+
index=pd.MultiIndex.from_product([foo_index, bar_index]),
255+
)
256+
257+
expected = Series([1, 2] * 3, index=df.index, name="foo_series")
258+
_, result = df.align(series, axis=0)
259+
260+
tm.assert_series_equal(result, expected)
261+
262+
def test_multiindex_align_to_series_with_common_index_level_non_unique_cols(self):
263+
# GH-46001
264+
foo_index = Index([1, 2, 3], name="foo")
265+
bar_index = Index([1, 2], name="bar")
266+
267+
series = Series([1, 2], index=bar_index, name="foo_series")
268+
df = DataFrame(
269+
np.arange(18).reshape(6, 3),
270+
index=pd.MultiIndex.from_product([foo_index, bar_index]),
271+
)
272+
df.columns = ["cfoo", "cbar", "cfoo"]
273+
274+
expected = Series([1, 2] * 3, index=df.index, name="foo_series")
275+
result_left, result_right = df.align(series, axis=0)
276+
277+
tm.assert_series_equal(result_right, expected)
278+
tm.assert_index_equal(result_left.columns, df.columns)
279+
280+
def test_missing_axis_specification_exception(self):
281+
df = DataFrame(np.arange(50).reshape((10, 5)))
282+
series = Series(np.arange(5))
283+
284+
with pytest.raises(ValueError, match=r"axis=0 or 1"):
285+
df.align(series)
286+
246287
def _check_align(self, a, b, axis, fill_axis, how, method, limit=None):
247288
aa, ab = a.align(
248289
b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis

pandas/tests/frame/test_arithmetic.py

+110
Original file line numberDiff line numberDiff line change
@@ -722,6 +722,116 @@ def test_broadcast_multiindex(self, level):
722722

723723
tm.assert_frame_equal(result, expected)
724724

725+
def test_frame_multiindex_operations(self):
726+
# GH 43321
727+
df = DataFrame(
728+
{2010: [1, 2, 3], 2020: [3, 4, 5]},
729+
index=MultiIndex.from_product(
730+
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
731+
),
732+
)
733+
734+
series = Series(
735+
[0.4],
736+
index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]),
737+
)
738+
739+
expected = DataFrame(
740+
{2010: [1.4, 2.4, 3.4], 2020: [3.4, 4.4, 5.4]},
741+
index=MultiIndex.from_product(
742+
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
743+
),
744+
)
745+
result = df.add(series, axis=0)
746+
747+
tm.assert_frame_equal(result, expected)
748+
749+
def test_frame_multiindex_operations_series_index_to_frame_index(self):
750+
# GH 43321
751+
df = DataFrame(
752+
{2010: [1], 2020: [3]},
753+
index=MultiIndex.from_product([["a"], ["b"]], names=["scen", "mod"]),
754+
)
755+
756+
series = Series(
757+
[10.0, 20.0, 30.0],
758+
index=MultiIndex.from_product(
759+
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
760+
),
761+
)
762+
763+
expected = DataFrame(
764+
{2010: [11.0, 21, 31.0], 2020: [13.0, 23.0, 33.0]},
765+
index=MultiIndex.from_product(
766+
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
767+
),
768+
)
769+
result = df.add(series, axis=0)
770+
771+
tm.assert_frame_equal(result, expected)
772+
773+
def test_frame_multiindex_operations_no_align(self):
774+
df = DataFrame(
775+
{2010: [1, 2, 3], 2020: [3, 4, 5]},
776+
index=MultiIndex.from_product(
777+
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
778+
),
779+
)
780+
781+
series = Series(
782+
[0.4],
783+
index=MultiIndex.from_product([["c"], ["a"]], names=["mod", "scen"]),
784+
)
785+
786+
expected = DataFrame(
787+
{2010: np.nan, 2020: np.nan},
788+
index=MultiIndex.from_tuples(
789+
[
790+
("a", "b", 0),
791+
("a", "b", 1),
792+
("a", "b", 2),
793+
("a", "c", np.nan),
794+
],
795+
names=["scen", "mod", "id"],
796+
),
797+
)
798+
result = df.add(series, axis=0)
799+
800+
tm.assert_frame_equal(result, expected)
801+
802+
def test_frame_multiindex_operations_part_align(self):
803+
df = DataFrame(
804+
{2010: [1, 2, 3], 2020: [3, 4, 5]},
805+
index=MultiIndex.from_tuples(
806+
[
807+
("a", "b", 0),
808+
("a", "b", 1),
809+
("a", "c", 2),
810+
],
811+
names=["scen", "mod", "id"],
812+
),
813+
)
814+
815+
series = Series(
816+
[0.4],
817+
index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]),
818+
)
819+
820+
expected = DataFrame(
821+
{2010: [1.4, 2.4, np.nan], 2020: [3.4, 4.4, np.nan]},
822+
index=MultiIndex.from_tuples(
823+
[
824+
("a", "b", 0),
825+
("a", "b", 1),
826+
("a", "c", 2),
827+
],
828+
names=["scen", "mod", "id"],
829+
),
830+
)
831+
result = df.add(series, axis=0)
832+
833+
tm.assert_frame_equal(result, expected)
834+
725835

726836
class TestFrameArithmetic:
727837
def test_td64_op_nat_casting(self):

0 commit comments

Comments
 (0)