From c6f16778c18c0a4c32ba0bd57aa909b159ff487e Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Mon, 28 Dec 2020 18:01:29 +1100
Subject: [PATCH 1/7] Initial test and fix (WIP)

---
 pandas/core/indexes/api.py | 13 +++++++++++++
 pandas/tests/reshape/concat/test_concat.py | 21 +++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index d4f22e482af84..5e343bc5ad861 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -89,6 +89,19 @@ def get_objs_combined_axis(
     Index
     """
     obs_idxes = [obj._get_axis(axis) for obj in objs]
+    if all_indexes_same(obs_idxes):
+        idx = obs_idxes[0]
+        if sort:
+            try:
+                idx = idx.sort_values()
+                copy = False
+            except TypeError:
+                pass
+        if copy:
+            idx = idx.copy()
+        return idx
+    elif not all(idx.is_unique for idx in obs_idxes):
+        raise InvalidIndexError()
     return _get_combined_index(obs_idxes, intersect=intersect, sort=sort, copy=copy)

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 16c4e9456aa05..82023997fba7b 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -1,5 +1,6 @@
 from collections import abc, deque
 from decimal import Decimal
+from pandas.errors import InvalidIndexError
 from warnings import catch_warnings

 import numpy as np
@@ -445,6 +446,26 @@ def test_concat_ordered_dict(self):
         tm.assert_series_equal(result, expected)


+@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator())
+@pytest.mark.parametrize("join", ["inner", "outer"])
+def test_concat_duplicates_error(index_maker, join):
+    # if index_maker is tm.makeMultiIndex:
+    #     TODO: This generator only makes Indexs of size 4
+    #     pytest.skip()
+    index_unique = index_maker(k=4)
+    index_non_unique = index_unique[[0, 0, 1, 2, 3]]
+    with pytest.raises(InvalidIndexError):
+        _ = pd.concat(
+            [
+                pd.DataFrame(
+                    np.ones((1, len(index_non_unique))), columns=index_non_unique
+                ),
+                pd.DataFrame(np.ones((1, len(index_unique))), columns=index_unique),
+            ],
+            join=join,
+        )
+
+
 @pytest.mark.parametrize("pdt", [Series, pd.DataFrame])
 @pytest.mark.parametrize("dt", np.sctypes["float"])
 def test_concat_no_unnecessary_upcast(dt, pdt):

From 19c95f00e7875e7d863478299218fbe53e2ce3db Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Tue, 29 Dec 2020 16:16:10 +1100
Subject: [PATCH 2/7] Move dataframe def out of error check

---
 pandas/tests/reshape/concat/test_concat.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 82023997fba7b..15904924fbf77 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -454,16 +454,14 @@ def test_concat_duplicates_error(index_maker, join):
     #     pytest.skip()
     index_unique = index_maker(k=4)
     index_non_unique = index_unique[[0, 0, 1, 2, 3]]
+
+    df_non_unique = pd.DataFrame(
+        np.ones((1, len(index_non_unique))), columns=index_non_unique
+    )
+    df_unique = pd.DataFrame(np.ones((1, len(index_unique))), columns=index_unique)
+
     with pytest.raises(InvalidIndexError):
-        _ = pd.concat(
-            [
-                pd.DataFrame(
-                    np.ones((1, len(index_non_unique))), columns=index_non_unique
-                ),
-                pd.DataFrame(np.ones((1, len(index_unique))), columns=index_unique),
-            ],
-            join=join,
-        )
+        _ = pd.concat([df_non_unique, df_unique], join=join)


 @pytest.mark.parametrize("pdt", [Series, pd.DataFrame])
 @pytest.mark.parametrize("dt", np.sctypes["float"])
 def test_concat_no_unnecessary_upcast(dt, pdt):
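
The guard added in the two patches above is meant to make `pd.concat` fail loudly
instead of silently mis-aligning data when a combined axis contains duplicate
labels. A minimal sketch of that intended user-facing behaviour, assuming a pandas
build with this series applied (the frame names are illustrative, not taken from
the patches; unpatched pandas may behave differently):

    import numpy as np
    import pandas as pd
    from pandas.errors import InvalidIndexError

    # "a" is duplicated in the first frame's columns, so combining the column
    # axes for a row-wise concat is ambiguous and the patched code refuses.
    df_dup = pd.DataFrame(np.ones((1, 3)), columns=["a", "a", "b"])
    df_ok = pd.DataFrame(np.ones((1, 2)), columns=["a", "b"])

    try:
        pd.concat([df_dup, df_ok], join="outer")
    except InvalidIndexError:
        print("duplicate column labels rejected, as test_concat_duplicates_error expects")
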
From bb33098a7860649de2c74b16cc106e1c63c521ae Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Tue, 29 Dec 2020 16:17:15 +1100
Subject: [PATCH 3/7] Formatting error

---
 pandas/tests/reshape/concat/test_concat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 15904924fbf77..6b2b5f7471f5f 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -1,6 +1,5 @@
 from collections import abc, deque
 from decimal import Decimal
-from pandas.errors import InvalidIndexError
 from warnings import catch_warnings

 import numpy as np
@@ -11,6 +10,7 @@
 import pandas._testing as tm
 from pandas.core.arrays import SparseArray
 from pandas.core.construction import create_series_with_explicit_dtype
+from pandas.errors import InvalidIndexError
 from pandas.tests.extension.decimal import to_decimal

From 11378a870940c7172bdeb304599653545a8d8107 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Tue, 29 Dec 2020 16:22:47 +1100
Subject: [PATCH 4/7] Test allowing duplicate indices if they aren't in intersection (xfail)

---
 pandas/tests/reshape/concat/test_concat.py | 30 ++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 6b2b5f7471f5f..cb34af9f1d824 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -464,6 +464,36 @@ def test_concat_duplicates_error(index_maker, join):
         _ = pd.concat([df_non_unique, df_unique], join=join)


+@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator())
+@pytest.mark.xfail(reason="Not implemented")
+def test_concat_intersection_duplicates(index_maker):
+    # Currently failing: https://github.com/pandas-dev/pandas/pull/38745/files#r549577521
+    # Concat is valid if the intersection does not contain duplicates
+    index_full = index_maker(k=4)
+    index_unique = index_full[[0, 1, 2]]
+    index_non_unique = index_full[[1, 2, 3, 3]]
+
+    df_unique = pd.DataFrame(
+        np.ones((1, len(index_unique))),
+        columns=index_unique,
+    )
+    df_non_unique = pd.DataFrame(
+        np.zeros((1, len(index_non_unique))),
+        columns=index_non_unique,
+    )
+
+    result = pd.concat([df_unique, df_non_unique], join="inner")
+
+    tm.assert_frame_equal(
+        result,
+        pd.DataFrame(
+            [[1, 1], [0, 0]],
+            columns=index_full[[1, 2]],
+            index=[0, 0],
+        ),
+    )
+
+
 @pytest.mark.parametrize("pdt", [Series, pd.DataFrame])
 @pytest.mark.parametrize("dt", np.sctypes["float"])
 def test_concat_no_unnecessary_upcast(dt, pdt):

From ca316ce5fb5dc3251cfb21679d37febf642719a9 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Tue, 29 Dec 2020 17:04:53 +1100
Subject: [PATCH 5/7] Better error messages + organization for duplicate errors

---
 pandas/core/indexes/api.py | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index 5e343bc5ad861..e796a75111c2e 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -89,19 +89,6 @@ def get_objs_combined_axis(
     Index
     """
     obs_idxes = [obj._get_axis(axis) for obj in objs]
-    if all_indexes_same(obs_idxes):
-        idx = obs_idxes[0]
-        if sort:
-            try:
-                idx = idx.sort_values()
-                copy = False
-            except TypeError:
-                pass
-        if copy:
-            idx = idx.copy()
-        return idx
-    elif not all(idx.is_unique for idx in obs_idxes):
-        raise InvalidIndexError()
     return _get_combined_index(obs_idxes, intersect=intersect, sort=sort, copy=copy)
@@ -148,13 +135,17 @@ def _get_combined_index(
     indexes = _get_distinct_objs(indexes)
     if len(indexes) == 0:
         index = Index([])
-    elif len(indexes) == 1:
+    elif len(indexes) == 1 or all_indexes_same(indexes):
         index = indexes[0]
     elif intersect:
         index = indexes[0]
         for other in indexes[1:]:
             index = index.intersection(other)
+        if not index.is_unique:
+            raise InvalidIndexError("Duplicated values in intersection of indices.")
     else:
+        if not all(idx.is_unique for idx in indexes):
+            raise InvalidIndexError("Cannot union indices with duplicate values.")
         index = union_indexes(indexes, sort=sort)

     index = ensure_index(index)

From e59bef28bba97f2d7aaa2b73387861f73e34a415 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Tue, 29 Dec 2020 20:02:28 +1100
Subject: [PATCH 6/7] Fixes for combining axes when non-intersecting labels are duplicated

This fixes a previous issue where `get_objs_combined_axis` would not throw
an error for duplicated indices in all cases. Now we check to make sure that
the duplicated values don't show up in the intersection.

This allows safe indexing when there are duplicated values, as long as they
don't show up in the intersection.
---
 pandas/core/indexes/api.py | 5 ++++-
 pandas/tests/indexes/test_setops.py | 33 +++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index e796a75111c2e..76216f19b3ecb 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -138,10 +138,13 @@ def _get_combined_index(
     elif len(indexes) == 1 or all_indexes_same(indexes):
         index = indexes[0]
     elif intersect:
+        duplicates = union_indexes(
+            [index[index.duplicated(keep="first")] for index in indexes]
+        )
         index = indexes[0]
         for other in indexes[1:]:
             index = index.intersection(other)
-        if not index.is_unique:
+        if len(duplicates.intersection(index)) > 0:
             raise InvalidIndexError("Duplicated values in intersection of indices.")
     else:
         if not all(idx.is_unique for idx in indexes):

diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index 538e937703de6..64b9518ea67b0 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -2,10 +2,13 @@
 The tests in this package are to ensure the proper resultant dtypes of
 set operations.
""" +from sys import intern +from pandas.errors import InvalidIndexError import numpy as np import pytest from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.indexes.api import get_objs_combined_axis import pandas as pd from pandas import ( @@ -463,3 +466,33 @@ def test_setop_with_categorical(index, sort, method): result = getattr(index, method)(other[:5], sort=sort) expected = getattr(index, method)(index[:5], sort=sort) tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator()) +@pytest.mark.parametrize("reverse", [True, False]) +def test_valid_intersection_w_dupes(index_maker, reverse): + # TODO: it would be good to ensure these are unique (categoricals aren't) + full = index_maker(k=4) + series = [pd.Series(1, index=full[[0, 1, 1, 2]]), pd.Series(0, index=full[[0, 2]])] + if reverse: + series = reversed(series) + + result = get_objs_combined_axis(series, intersect=True) + + tm.assert_index_equal(full[[0, 2]], result, check_order=False) + + +@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator()) +@pytest.mark.parametrize("reverse", [True, False]) +def test_invalid_intersection_w_dupes(index_maker, reverse): + # TODO: it would be good to ensure these are unique (categoricals aren't) + full = index_maker(k=4) + series = [ + pd.Series(1, index=full[[0, 1, 1, 2]]), + pd.Series(0, index=full[[0, 1, 2]]), + ] + if reverse: + series = reversed(series) + + with pytest.raises(InvalidIndexError): + _ = get_objs_combined_axis(series, intersect=True) From f176ad336241e10325f73d25d66a6e754f4c40dd Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 30 Dec 2020 14:36:34 +1100 Subject: [PATCH 7/7] Fixes from review and CI --- pandas/tests/indexes/test_setops.py | 38 +++++++++------ pandas/tests/reshape/concat/test_concat.py | 54 ++++++++++++---------- 2 files changed, 52 insertions(+), 40 deletions(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 64b9518ea67b0..8a852a8dd4a9a 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -2,13 +2,12 @@ The tests in this package are to ensure the proper resultant dtypes of set operations. 
""" -from sys import intern -from pandas.errors import InvalidIndexError import numpy as np import pytest +from pandas.errors import InvalidIndexError + from pandas.core.dtypes.common import is_dtype_equal -from pandas.core.indexes.api import get_objs_combined_axis import pandas as pd from pandas import ( @@ -24,6 +23,7 @@ ) import pandas._testing as tm from pandas.api.types import is_datetime64tz_dtype, pandas_dtype +from pandas.core.indexes.api import get_objs_combined_axis COMPATIBLE_INCONSISTENT_PAIRS = { (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex), @@ -468,28 +468,36 @@ def test_setop_with_categorical(index, sort, method): tm.assert_index_equal(result, expected) -@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator()) @pytest.mark.parametrize("reverse", [True, False]) -def test_valid_intersection_w_dupes(index_maker, reverse): - # TODO: it would be good to ensure these are unique (categoricals aren't) - full = index_maker(k=4) - series = [pd.Series(1, index=full[[0, 1, 1, 2]]), pd.Series(0, index=full[[0, 2]])] +def test_valid_intersection_w_dupes(index, reverse): + # Make sure base index is unique and has at least 3 values + index = index.unique() + if len(index) < 3: + pytest.skip() + + series = [ + pd.Series(1, index=index[[0, 0, 1, 2]]), + pd.Series(0, index=index[[1, 2]]), + ] if reverse: series = reversed(series) result = get_objs_combined_axis(series, intersect=True) + expected = index[[1, 2]] - tm.assert_index_equal(full[[0, 2]], result, check_order=False) + tm.assert_index_equal(result, expected, check_order=False) -@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator()) @pytest.mark.parametrize("reverse", [True, False]) -def test_invalid_intersection_w_dupes(index_maker, reverse): - # TODO: it would be good to ensure these are unique (categoricals aren't) - full = index_maker(k=4) +def test_invalid_intersection_w_dupes(index, reverse): + # Make sure base index is unique and has at least 3 values + index = index.unique() + if len(index) < 3: + pytest.skip() + series = [ - pd.Series(1, index=full[[0, 1, 1, 2]]), - pd.Series(0, index=full[[0, 1, 2]]), + pd.Series(1, index=index[[0, 0, 1, 2]]), + pd.Series(0, index=index[[0, 2]]), ] if reverse: series = reversed(series) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index cb34af9f1d824..fd3a9a00fee8a 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -5,12 +5,13 @@ import numpy as np import pytest +from pandas.errors import InvalidIndexError + import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range import pandas._testing as tm from pandas.core.arrays import SparseArray from pandas.core.construction import create_series_with_explicit_dtype -from pandas.errors import InvalidIndexError from pandas.tests.extension.decimal import to_decimal @@ -446,53 +447,56 @@ def test_concat_ordered_dict(self): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator()) @pytest.mark.parametrize("join", ["inner", "outer"]) -def test_concat_duplicates_error(index_maker, join): - # if index_maker is tm.makeMultiIndex: - # TODO: This generator only makes Indexs of size 4 - # pytest.skip() - index_unique = index_maker(k=4) +def test_concat_duplicates_error(index, join): + # https://github.com/pandas-dev/pandas/issues/6963 + # Needs an index with 4 unique values + index = 
+    index = index.unique()
+    if len(index) < 4:
+        pytest.skip()
+
+    index_unique = index[:4]
     index_non_unique = index_unique[[0, 0, 1, 2, 3]]

-    df_non_unique = pd.DataFrame(
+    df_non_unique = DataFrame(
         np.ones((1, len(index_non_unique))), columns=index_non_unique
     )
-    df_unique = pd.DataFrame(np.ones((1, len(index_unique))), columns=index_unique)
+    df_unique = DataFrame(np.ones((1, len(index_unique))), columns=index_unique)

     with pytest.raises(InvalidIndexError):
         _ = pd.concat([df_non_unique, df_unique], join=join)


-@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator())
 @pytest.mark.xfail(reason="Not implemented")
-def test_concat_intersection_duplicates(index_maker):
-    # Currently failing: https://github.com/pandas-dev/pandas/pull/38745/files#r549577521
+def test_concat_intersection_duplicates(index):
+    # Failing: https://github.com/pandas-dev/pandas/pull/38745/files#r549577521
     # Concat is valid if the intersection does not contain duplicates
-    index_full = index_maker(k=4)
-    index_unique = index_full[[0, 1, 2]]
-    index_non_unique = index_full[[1, 2, 3, 3]]
+    # Needs an index with 4 unique values
+    index = index.unique()
+    if len(index) < 4:
+        pytest.skip()
+
+    index_unique = index[[0, 1, 2]]
+    index_non_unique = index[[1, 2, 3, 3]]

-    df_unique = pd.DataFrame(
+    df_unique = DataFrame(
         np.ones((1, len(index_unique))),
         columns=index_unique,
     )
-    df_non_unique = pd.DataFrame(
+    df_non_unique = DataFrame(
         np.zeros((1, len(index_non_unique))),
         columns=index_non_unique,
     )

     result = pd.concat([df_unique, df_non_unique], join="inner")
-
-    tm.assert_frame_equal(
-        result,
-        pd.DataFrame(
-            [[1, 1], [0, 0]],
-            columns=index_full[[1, 2]],
-            index=[0, 0],
-        ),
+    expected = DataFrame(
+        [[1, 1], [0, 0]],
+        columns=index[[1, 2]],
+        index=[0, 0],
     )
+    tm.assert_frame_equal(result, expected)
+


 @pytest.mark.parametrize("pdt", [Series, pd.DataFrame])
 @pytest.mark.parametrize("dt", np.sctypes["float"])
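
Taken together, patches 5-7 narrow the original check: duplicated labels are only
rejected when they can actually end up in the combined axis. A rough sketch of that
distinction, using the same internal helper the new tests exercise
(pandas.core.indexes.api.get_objs_combined_axis); the series names are illustrative,
and the behaviour shown assumes a build with this series applied:

    import pandas as pd
    from pandas.core.indexes.api import get_objs_combined_axis
    from pandas.errors import InvalidIndexError

    # "a" is duplicated, but it never reaches the intersection, so combining the
    # axes with intersect=True succeeds (mirrors test_valid_intersection_w_dupes).
    s1 = pd.Series(1, index=["a", "a", "b", "c"])
    s2 = pd.Series(0, index=["b", "c"])
    print(list(get_objs_combined_axis([s1, s2], intersect=True)))  # ['b', 'c']

    # Here the duplicated "a" does land in the intersection, so the combined axis
    # would be ambiguous and the patched check raises instead
    # (mirrors test_invalid_intersection_w_dupes).
    s3 = pd.Series(0, index=["a", "c"])
    try:
        get_objs_combined_axis([s1, s3], intersect=True)
    except InvalidIndexError:
        print("duplicated label in the intersection rejected")
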