Skip to content

BUG: Fix aligning a DataFrame with a Series with MultiIndex #46001 #46058

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ Reshaping
^^^^^^^^^
- Bug in :func:`concat` between a :class:`Series` with integer dtype and another with :class:`CategoricalDtype` with integer categories and containing ``NaN`` values casting to object dtype instead of ``float64`` (:issue:`45359`)
- Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`)
- Bug in :meth:`DataFrame.align` when aligning a :class:`DataFrame` with a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`)
-

Sparse
Expand Down
50 changes: 33 additions & 17 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8975,10 +8975,14 @@ def _align_series(

is_series = isinstance(self, ABCSeries)

if (not is_series and axis is None) or axis not in [None, 0, 1]:
raise ValueError("Must specify axis=0 or 1")

if is_series and axis == 1:
raise ValueError("cannot align series to a series other than axis 0")

# series/series compat, other must always be a Series
if is_series:
if axis:
raise ValueError("cannot align series to a series other than axis 0")
if not axis:

# equal
if self.index.equals(other.index):
Expand All @@ -8988,26 +8992,38 @@ def _align_series(
other.index, how=join, level=level, return_indexers=True
)

left = self._reindex_indexer(join_index, lidx, copy)
if is_series:
left = self._reindex_indexer(join_index, lidx, copy)
elif lidx is None:
left = self.copy() if copy else self
else:
data = algos.take_nd(
self.values,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.values can be expensive (and lossy) if the dataframe is not a single block.

lidx,
allow_fill=True,
fill_value=None,
)

left = self._constructor(
data=data, columns=self.columns, index=join_index
)

right = other._reindex_indexer(join_index, ridx, copy)

else:

# one has > 1 ndim
fdata = self._mgr
if axis in [0, 1]:
join_index = self.axes[axis]
lidx, ridx = None, None
if not join_index.equals(other.index):
join_index, lidx, ridx = join_index.join(
other.index, how=join, level=level, return_indexers=True
)

if lidx is not None:
bm_axis = self._get_block_manager_axis(axis)
fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
join_index = self.axes[1]
lidx, ridx = None, None
if not join_index.equals(other.index):
join_index, lidx, ridx = join_index.join(
other.index, how=join, level=level, return_indexers=True
)

else:
raise ValueError("Must specify axis=0 or 1")
if lidx is not None:
bm_axis = self._get_block_manager_axis(1)
fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)

if copy and fdata is self._mgr:
fdata = fdata.copy()
Expand Down
99 changes: 99 additions & 0 deletions pandas/tests/frame/methods/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,105 @@ def test_align_series_combinations(self):
tm.assert_series_equal(res1, exp2)
tm.assert_frame_equal(res2, exp1)

def test_multiindex_align_to_series_with_common_index_level(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2], name="bar")

series = Series([1, 2], index=bar_index, name="foo_series")
df = DataFrame(
{"col": np.arange(6)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)

expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
result_l, result_r = df.align(series, axis=0)

tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)

def test_multiindex_align_to_series_with_common_index_level_missing_in_left(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2], name="bar")

series = Series(
[1, 2, 3, 4], index=Index([1, 2, 3, 4], name="bar"), name="foo_series"
)
df = DataFrame(
{"col": np.arange(6)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)

expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
result_l, result_r = df.align(series, axis=0)

tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)

def test_multiindex_align_to_series_with_common_index_level_missing_in_right(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2, 3, 4], name="bar")

series = Series([1, 2], index=Index([1, 2], name="bar"), name="foo_series")
df = DataFrame(
{"col": np.arange(12)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)

expected_r = Series(
[1, 2, np.nan, np.nan] * 3, index=df.index, name="foo_series"
)
result_l, result_r = df.align(series, axis=0)

tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)

def test_multiindex_align_to_series_with_common_index_level_missing_in_both(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 3, 4], name="bar")

series = Series(
[1, 2, 3], index=Index([1, 2, 4], name="bar"), name="foo_series"
)
df = DataFrame(
{"col": np.arange(9)},
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)

expected_r = Series([1, np.nan, 3] * 3, index=df.index, name="foo_series")
result_l, result_r = df.align(series, axis=0)

tm.assert_frame_equal(result_l, df)
tm.assert_series_equal(result_r, expected_r)

def test_multiindex_align_to_series_with_common_index_level_non_unique_cols(self):
# GH-46001
foo_index = Index([1, 2, 3], name="foo")
bar_index = Index([1, 2], name="bar")

series = Series([1, 2], index=bar_index, name="foo_series")
df = DataFrame(
np.arange(18).reshape(6, 3),
index=pd.MultiIndex.from_product([foo_index, bar_index]),
)
df.columns = ["cfoo", "cbar", "cfoo"]

expected = Series([1, 2] * 3, index=df.index, name="foo_series")
result_left, result_right = df.align(series, axis=0)

tm.assert_series_equal(result_right, expected)
tm.assert_index_equal(result_left.columns, df.columns)

def test_missing_axis_specification_exception(self):
    """Aligning a DataFrame with a Series without ``axis`` must raise."""
    frame = DataFrame(np.arange(50).reshape((10, 5)))
    ser = Series(np.arange(5))

    expected_msg = r"axis=0 or 1"
    with pytest.raises(ValueError, match=expected_msg):
        frame.align(ser)

def _check_align(self, a, b, axis, fill_axis, how, method, limit=None):
aa, ab = a.align(
b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis
Expand Down
110 changes: 110 additions & 0 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,116 @@ def test_broadcast_multiindex(self, level):

tm.assert_frame_equal(result, expected)

def test_frame_multiindex_operations(self):
# GH 43321
df = DataFrame(
{2010: [1, 2, 3], 2020: [3, 4, 5]},
index=MultiIndex.from_product(
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
),
)

series = Series(
[0.4],
index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]),
)

expected = DataFrame(
{2010: [1.4, 2.4, 3.4], 2020: [3.4, 4.4, 5.4]},
index=MultiIndex.from_product(
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
),
)
result = df.add(series, axis=0)

tm.assert_frame_equal(result, expected)

def test_frame_multiindex_operations_series_index_to_frame_index(self):
# GH 43321
df = DataFrame(
{2010: [1], 2020: [3]},
index=MultiIndex.from_product([["a"], ["b"]], names=["scen", "mod"]),
)

series = Series(
[10.0, 20.0, 30.0],
index=MultiIndex.from_product(
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
),
)

expected = DataFrame(
{2010: [11.0, 21, 31.0], 2020: [13.0, 23.0, 33.0]},
index=MultiIndex.from_product(
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
),
)
result = df.add(series, axis=0)

tm.assert_frame_equal(result, expected)

def test_frame_multiindex_operations_no_align(self):
df = DataFrame(
{2010: [1, 2, 3], 2020: [3, 4, 5]},
index=MultiIndex.from_product(
[["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
),
)

series = Series(
[0.4],
index=MultiIndex.from_product([["c"], ["a"]], names=["mod", "scen"]),
)

expected = DataFrame(
{2010: np.nan, 2020: np.nan},
index=MultiIndex.from_tuples(
[
("a", "b", 0),
("a", "b", 1),
("a", "b", 2),
("a", "c", np.nan),
],
names=["scen", "mod", "id"],
),
)
result = df.add(series, axis=0)

tm.assert_frame_equal(result, expected)

def test_frame_multiindex_operations_part_align(self):
df = DataFrame(
{2010: [1, 2, 3], 2020: [3, 4, 5]},
index=MultiIndex.from_tuples(
[
("a", "b", 0),
("a", "b", 1),
("a", "c", 2),
],
names=["scen", "mod", "id"],
),
)

series = Series(
[0.4],
index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]),
)

expected = DataFrame(
{2010: [1.4, 2.4, np.nan], 2020: [3.4, 4.4, np.nan]},
index=MultiIndex.from_tuples(
[
("a", "b", 0),
("a", "b", 1),
("a", "c", 2),
],
names=["scen", "mod", "id"],
),
)
result = df.add(series, axis=0)

tm.assert_frame_equal(result, expected)


class TestFrameArithmetic:
def test_td64_op_nat_casting(self):
Expand Down