diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index d4f22e482af84..76216f19b3ecb 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -135,13 +135,20 @@ def _get_combined_index( indexes = _get_distinct_objs(indexes) if len(indexes) == 0: index = Index([]) - elif len(indexes) == 1: + elif len(indexes) == 1 or all_indexes_same(indexes): index = indexes[0] elif intersect: + duplicates = union_indexes( + [index[index.duplicated(keep="first")] for index in indexes] + ) index = indexes[0] for other in indexes[1:]: index = index.intersection(other) + if len(duplicates.intersection(index)) > 0: + raise InvalidIndexError("Duplicated values in intersection of indices.") else: + if not all(idx.is_unique for idx in indexes): + raise InvalidIndexError("Cannot union indices with duplicate values.") index = union_indexes(indexes, sort=sort) index = ensure_index(index) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 538e937703de6..8a852a8dd4a9a 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.errors import InvalidIndexError + from pandas.core.dtypes.common import is_dtype_equal import pandas as pd @@ -21,6 +23,7 @@ ) import pandas._testing as tm from pandas.api.types import is_datetime64tz_dtype, pandas_dtype +from pandas.core.indexes.api import get_objs_combined_axis COMPATIBLE_INCONSISTENT_PAIRS = { (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex), @@ -463,3 +466,41 @@ def test_setop_with_categorical(index, sort, method): result = getattr(index, method)(other[:5], sort=sort) expected = getattr(index, method)(index[:5], sort=sort) tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("reverse", [True, False]) +def test_valid_intersection_w_dupes(index, reverse): + # Make sure base index is unique and has at least 3 values + index = index.unique() + if 
len(index) < 3: + pytest.skip() + + series = [ + pd.Series(1, index=index[[0, 0, 1, 2]]), + pd.Series(0, index=index[[1, 2]]), + ] + if reverse: + series = reversed(series) + + result = get_objs_combined_axis(series, intersect=True) + expected = index[[1, 2]] + + tm.assert_index_equal(result, expected, check_order=False) + + +@pytest.mark.parametrize("reverse", [True, False]) +def test_invalid_intersection_w_dupes(index, reverse): + # Make sure base index is unique and has at least 3 values + index = index.unique() + if len(index) < 3: + pytest.skip() + + series = [ + pd.Series(1, index=index[[0, 0, 1, 2]]), + pd.Series(0, index=index[[0, 2]]), + ] + if reverse: + series = reversed(series) + + with pytest.raises(InvalidIndexError): + _ = get_objs_combined_axis(series, intersect=True) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 16c4e9456aa05..fd3a9a00fee8a 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.errors import InvalidIndexError + import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range import pandas._testing as tm @@ -445,6 +447,57 @@ def test_concat_ordered_dict(self): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("join", ["inner", "outer"]) +def test_concat_duplicates_error(index, join): + # https://github.com/pandas-dev/pandas/issues/6963 + # Needs an index with 4 unique values + index = index.unique() + if len(index) < 4: + pytest.skip() + + index_unique = index[:4] + index_non_unique = index_unique[[0, 0, 1, 2, 3]] + + df_non_unique = DataFrame( + np.ones((1, len(index_non_unique))), columns=index_non_unique + ) + df_unique = DataFrame(np.ones((1, len(index_unique))), columns=index_unique) + + with pytest.raises(InvalidIndexError): + _ = pd.concat([df_non_unique, df_unique], join=join) + + 
+@pytest.mark.xfail(reason="Not implemented") +def test_concat_intersection_duplicates(index): + # Failing: https://github.com/pandas-dev/pandas/pull/38745/files#r549577521 + # Concat is valid if the intersection does not contain duplicates + # Needs an index with 4 unique values + index = index.unique() + if len(index) < 4: + pytest.skip() + + index_unique = index[[0, 1, 2]] + index_non_unique = index[[1, 2, 3, 3]] + + df_unique = DataFrame( + np.ones((1, len(index_unique))), + columns=index_unique, + ) + df_non_unique = DataFrame( + np.zeros((1, len(index_non_unique))), + columns=index_non_unique, + ) + + result = pd.concat([df_unique, df_non_unique], join="inner") + expected = DataFrame( + [[1, 1], [0, 0]], + columns=index[[1, 2]], + index=[0, 0], + ) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("pdt", [Series, pd.DataFrame]) @pytest.mark.parametrize("dt", np.sctypes["float"]) def test_concat_no_unnecessary_upcast(dt, pdt):