Skip to content

[WIP] Test (and more fixes) for duplicate indices with concat #38745

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions pandas/core/indexes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,19 @@ def get_objs_combined_axis(
Index
"""
obs_idxes = [obj._get_axis(axis) for obj in objs]
if all_indexes_same(obs_idxes):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is not the place to do at all. should be in concat itself.

ideally we want a nice error message if this happens (and the indices are not equal).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree I need to add the error message, but I do think this function is where the error should be thrown.

Every function which uses get_objs_combined_axis throws an error if there are duplicate indices. Here is a demonstration of this from pandas 1.1.5:

import numpy as np
import pandas as pd
import pandas._testing as tm

def list_of_series_constructor(*args):
    """Build a DataFrame from the given Series, one row per Series."""
    series_list = [s for s in args]
    return pd.DataFrame(series_list)

def crosstab(*args):
    """Forward all positional arguments straight to ``pd.crosstab``."""
    result = pd.crosstab(*args)
    return result

def concat_series(*args):
    """Concatenate the given Series column-wise into one DataFrame."""
    objs = [s for s in args]
    return pd.concat(objs, axis=1)
    
def concat_dfs(*args):
    """Wrap each argument in a DataFrame, then concatenate column-wise."""
    frames = list(map(pd.DataFrame, args))
    return pd.concat(frames, axis=1)

def duplicated_indices_errors(func, index_makers):
    """Probe how ``func`` fails when one of its inputs has a duplicated index.

    For each index factory in ``index_makers``, build a unique index of
    length 4 and a non-unique variant (first label repeated), wrap both
    in all-ones Series, and call ``func(unique_series, non_unique_series)``.

    Parameters
    ----------
    func : callable
        Takes two Series and combines them (e.g. a concat wrapper).
    index_makers : iterable of callables
        Each accepts ``k`` and returns an Index with that many labels.

    Returns
    -------
    list of dict
        One record per index maker holding the maker's name and, when the
        call raised, the exception type and message.
    """
    records = []
    # NOTE: the original used enumerate() but never used the counter, and
    # bound the result of func() to an unused local; both are dropped here.
    for index_maker in index_makers:
        rec = {"index_maker": index_maker.__name__}
        uniq = index_maker(k=4)
        # Repeat the first label to force a non-unique index.
        non_uniq = uniq[[0, 0, 1, 2]]

        s_uniq = pd.Series(np.ones(len(uniq)), index=uniq)
        s_non_uniq = pd.Series(np.ones(len(non_uniq)), index=non_uniq)

        try:
            func(s_uniq, s_non_uniq)
        except Exception as e:  # broad on purpose: we record every failure mode
            rec["error_type"] = str(type(e))
            rec["error_message"] = str(e)
        records.append(rec)
    return records

# Functions under comparison: each combines two Series and should surface
# an error when one index is non-unique.
funcs = [list_of_series_constructor, crosstab, concat_series, concat_dfs]

# Index factories from pandas._testing, covering the common Index subclasses
# (string, int, uint, float, datetime, timedelta, period, multi, bool).
index_makers = [
    tm.makeStringIndex,
    tm.makeIntIndex,
    tm.makeUIntIndex,
    tm.makeFloatIndex,
    tm.makeDateIndex,
    tm.makeTimedeltaIndex,
    tm.makePeriodIndex,
    tm.makeMultiIndex,
    tm.makeBoolIndex,
]

# Collect the error records for every (func, index maker) pair and render
# them as a single markdown table keyed by the function's name.
pd.concat(
    {
        f.__name__: pd.DataFrame(duplicated_indices_errors(f, index_makers)) 
        for f in funcs
    }
).to_markdown()
index_maker error_type error_message
('list_of_series_constructor', 0) makeStringIndex <class 'pandas.errors.InvalidIndexError'> Reindexing only valid with uniquely valued Index objects
('list_of_series_constructor', 1) makeIntIndex <class 'pandas.errors.InvalidIndexError'> Reindexing only valid with uniquely valued Index objects
('list_of_series_constructor', 2) makeUIntIndex <class 'pandas.errors.InvalidIndexError'> Reindexing only valid with uniquely valued Index objects
('list_of_series_constructor', 3) makeFloatIndex <class 'pandas.errors.InvalidIndexError'> Reindexing only valid with uniquely valued Index objects
('list_of_series_constructor', 4) makeDateIndex <class 'pandas.errors.InvalidIndexError'> Reindexing only valid with uniquely valued Index objects
('list_of_series_constructor', 5) makeTimedeltaIndex <class 'pandas.errors.InvalidIndexError'> Reindexing only valid with uniquely valued Index objects
('list_of_series_constructor', 6) makePeriodIndex <class 'pandas.errors.InvalidIndexError'> Reindexing only valid with uniquely valued Index objects
('list_of_series_constructor', 7) makeMultiIndex <class 'ValueError'> Reindexing only valid with uniquely valued Index objects
('list_of_series_constructor', 8) makeBoolIndex <class 'pandas.errors.InvalidIndexError'> Reindexing only valid with uniquely valued Index objects
('crosstab', 0) makeStringIndex <class 'ValueError'> cannot reindex from a duplicate axis
('crosstab', 1) makeIntIndex <class 'ValueError'> cannot reindex from a duplicate axis
('crosstab', 2) makeUIntIndex <class 'ValueError'> cannot reindex from a duplicate axis
('crosstab', 3) makeFloatIndex <class 'ValueError'> cannot reindex from a duplicate axis
('crosstab', 4) makeDateIndex <class 'ValueError'> cannot reindex from a duplicate axis
('crosstab', 5) makeTimedeltaIndex <class 'ValueError'> cannot reindex from a duplicate axis
('crosstab', 6) makePeriodIndex <class 'ValueError'> cannot reindex from a duplicate axis
('crosstab', 7) makeMultiIndex <class 'ValueError'> cannot handle a non-unique multi-index!
('crosstab', 8) makeBoolIndex <class 'ValueError'> cannot reindex from a duplicate axis
('concat_series', 0) makeStringIndex <class 'ValueError'> cannot reindex from a duplicate axis
('concat_series', 1) makeIntIndex <class 'ValueError'> cannot reindex from a duplicate axis
('concat_series', 2) makeUIntIndex <class 'ValueError'> cannot reindex from a duplicate axis
('concat_series', 3) makeFloatIndex <class 'ValueError'> cannot reindex from a duplicate axis
('concat_series', 4) makeDateIndex <class 'ValueError'> cannot reindex from a duplicate axis
('concat_series', 5) makeTimedeltaIndex <class 'ValueError'> cannot reindex from a duplicate axis
('concat_series', 6) makePeriodIndex <class 'ValueError'> cannot reindex from a duplicate axis
('concat_series', 7) makeMultiIndex <class 'ValueError'> cannot handle a non-unique multi-index!
('concat_series', 8) makeBoolIndex <class 'ValueError'> cannot reindex from a duplicate axis
('concat_dfs', 0) makeStringIndex <class 'ValueError'> Shape of passed values is (5, 2), indices imply (4, 2)
('concat_dfs', 1) makeIntIndex <class 'ValueError'> Shape of passed values is (7, 2), indices imply (5, 2)
('concat_dfs', 2) makeUIntIndex <class 'ValueError'> Shape of passed values is (7, 2), indices imply (5, 2)
('concat_dfs', 3) makeFloatIndex <class 'ValueError'> Shape of passed values is (7, 2), indices imply (5, 2)
('concat_dfs', 4) makeDateIndex <class 'ValueError'> Shape of passed values is (7, 2), indices imply (5, 2)
('concat_dfs', 5) makeTimedeltaIndex <class 'ValueError'> Shape of passed values is (7, 2), indices imply (5, 2)
('concat_dfs', 6) makePeriodIndex <class 'ValueError'> Shape of passed values is (7, 2), indices imply (5, 2)
('concat_dfs', 7) makeMultiIndex <class 'ValueError'> cannot handle a non-unique multi-index!
('concat_dfs', 8) makeBoolIndex <class 'ValueError'> Shape of passed values is (4, 2), indices imply (2, 2)

Instead of just giving concat a better error message, all these methods could get better (and more consistent) error messages. I think it's important for the rules about unions and intersections of indices to be defined in these more internal methods so behaviour isn't defined (and reimplemented) per top level function.

Additionally, this function essentially returns nonsense for unions when there are duplicated indices since different Index types have different definitions for unions with duplicates, so I think it's appropriate to throw errors here instead of passing on those nonsense results.

This has also made me realize there is one case where duplicates are acceptable: when the individual indices contain duplicates but their intersection would not.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has also made me realize there is one case where it's okay for there to be duplicates, which is when indices contain duplicates, but the intersection would not.

This got a bit more complicated. idx.get_indexer(keys) throws an error if idx is non-unique, regardless of whether keys have unique indices in idx.

Interestingly enough, it does not throw an error if the idx is non-unique, but keys is empty (which is the case I had initially seen).

Examples using pandas 1.1.5:

pd.crosstab(
    pd.Series(np.arange(4), index=[0, 1, 1, 2]),
    pd.Series(np.arange(2), index=[0, 2]),
)
# ValueError: cannot reindex from a duplicate axis

pd.crosstab(
    pd.Series(np.arange(4), index=[0, 1, 1, 2]),
    pd.Series(np.arange(2), index=[3, 4]),
)
# Empty DataFrame
# Columns: []
# Index: []
Examples with pd.concat

In 1.1.5, pd.concat showed the same behaviour even though it does not use .get_indexer directly.

pd.concat(
    [
        pd.Series(np.arange(4), index=[0, 1, 1, 2]),
        pd.Series(np.arange(2), index=[0, 2]),
    ],
    axis=1,
    join="inner"
)
# ValueError: cannot reindex from a duplicate axis

pd.concat(
    [
        pd.Series(np.arange(4), index=[0, 1, 1, 2]),
        pd.Series(np.arange(2), index=[3, 4]),
    ],
    axis=1,
    join="inner"
)
# Empty DataFrame
# Columns: [0, 1]
# Index: []

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened a more general issue for this: #38797

idx = obs_idxes[0]
if sort:
try:
idx = idx.sort_values()
copy = False
except TypeError:
pass
if copy:
idx = idx.copy()
return idx
elif not all(idx.is_unique for idx in obs_idxes):
raise InvalidIndexError()
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort, copy=copy)


Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/reshape/concat/test_concat.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections import abc, deque
from decimal import Decimal
from pandas.errors import InvalidIndexError
from warnings import catch_warnings

import numpy as np
Expand Down Expand Up @@ -445,6 +446,26 @@ def test_concat_ordered_dict(self):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator())
@pytest.mark.parametrize("join", ["inner", "outer"])
def test_concat_duplicates_error(index_maker, join):
# if index_maker is tm.makeMultiIndex:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add the issue number as a comment in all of the new tests

# TODO: This generator only makes Indexes of size 4
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is this about?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

def makeMultiIndex(k=10, names=None, **kwargs):
return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs)

This function only makes a MultiIndex with length 4, ignoring the k parameter

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# pytest.skip()
index_unique = index_maker(k=4)
index_non_unique = index_unique[[0, 0, 1, 2, 3]]
with pytest.raises(InvalidIndexError):
_ = pd.concat(
[
pd.DataFrame(
np.ones((1, len(index_non_unique))), columns=index_non_unique
),
pd.DataFrame(np.ones((1, len(index_unique))), columns=index_unique),
],
join=join,
)


@pytest.mark.parametrize("pdt", [Series, pd.DataFrame])
@pytest.mark.parametrize("dt", np.sctypes["float"])
def test_concat_no_unnecessary_upcast(dt, pdt):
Expand Down