Skip to content

Commit 332d4c7

Browse files
iamlemecKevin D Smith
authored and
Kevin D Smith
committed
fix inconsistent index naming with union/intersect GH35847 (pandas-dev#36413)
1 parent 29ddbc8 commit 332d4c7

File tree

16 files changed

+196
-72
lines changed

16 files changed

+196
-72
lines changed

doc/source/user_guide/merging.rst

+8
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,14 @@ functionality below.
154154
frames = [ process_your_file(f) for f in files ]
155155
result = pd.concat(frames)
156156

157+
.. note::
158+
159+
When concatenating DataFrames with named axes, pandas will attempt to preserve
160+
these index/column names whenever possible. In the case where all inputs share a
161+
common name, this name will be assigned to the result. When the input names do
162+
not all agree, the result will be unnamed. The same is true for :class:`MultiIndex`,
163+
but the logic is applied separately on a level-by-level basis.
164+
157165

158166
Set logic on the other axes
159167
~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/whatsnew/v1.2.0.rst

+20
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,26 @@ Alternatively, you can also use the dtype object:
157157
behaviour or API may still change without warning. Especially the behaviour
158158
regarding NaN (distinct from NA missing values) is subject to change.
159159

160+
.. _whatsnew_120.index_name_preservation:
161+
162+
Index/column name preservation when aggregating
163+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
164+
165+
When aggregating using :meth:`concat` or the :class:`DataFrame` constructor, pandas
166+
will attempt to preserve index (and column) names whenever possible (:issue:`35847`).
167+
In the case where all inputs share a common name, this name will be assigned to the
168+
result. When the input names do not all agree, the result will be unnamed. Here is an
169+
example where the index name is preserved:
170+
171+
.. ipython:: python
172+
173+
idx = pd.Index(range(5), name='abc')
174+
ser = pd.Series(range(5, 10), index=idx)
175+
pd.concat({'x': ser[1:], 'y': ser[:-1]}, axis=1)
176+
177+
The same is true for :class:`MultiIndex`, but the logic is applied separately on a
178+
level-by-level basis.
179+
160180
.. _whatsnew_120.enhancements.other:
161181

162182
Other enhancements

pandas/core/indexes/api.py

+4-28
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44
from pandas._libs import NaT, lib
55
from pandas.errors import InvalidIndexError
66

7-
import pandas.core.common as com
87
from pandas.core.indexes.base import (
98
Index,
109
_new_Index,
1110
ensure_index,
1211
ensure_index_from_sequences,
12+
get_unanimous_names,
1313
)
1414
from pandas.core.indexes.category import CategoricalIndex
1515
from pandas.core.indexes.datetimes import DatetimeIndex
@@ -57,7 +57,7 @@
5757
"ensure_index_from_sequences",
5858
"get_objs_combined_axis",
5959
"union_indexes",
60-
"get_consensus_names",
60+
"get_unanimous_names",
6161
"all_indexes_same",
6262
]
6363

@@ -221,9 +221,9 @@ def conv(i):
221221
if not all(index.equals(other) for other in indexes[1:]):
222222
index = _unique_indices(indexes)
223223

224-
name = get_consensus_names(indexes)[0]
224+
name = get_unanimous_names(*indexes)[0]
225225
if name != index.name:
226-
index = index._shallow_copy(name=name)
226+
index = index.rename(name)
227227
return index
228228
else: # kind='list'
229229
return _unique_indices(indexes)
@@ -267,30 +267,6 @@ def _sanitize_and_check(indexes):
267267
return indexes, "array"
268268

269269

270-
def get_consensus_names(indexes):
271-
"""
272-
Give a consensus 'names' to indexes.
273-
274-
If there's exactly one non-empty 'names', return this,
275-
otherwise, return empty.
276-
277-
Parameters
278-
----------
279-
indexes : list of Index objects
280-
281-
Returns
282-
-------
283-
list
284-
A list representing the consensus 'names' found.
285-
"""
286-
# find the non-none names, need to tupleify to make
287-
# the set hashable, then reverse on return
288-
consensus_names = {tuple(i.names) for i in indexes if com.any_not_none(*i.names)}
289-
if len(consensus_names) == 1:
290-
return list(list(consensus_names)[0])
291-
return [None] * indexes[0].nlevels
292-
293-
294270
def all_indexes_same(indexes):
295271
"""
296272
Determine if all indexes contain the same elements.

pandas/core/indexes/base.py

+37-13
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from copy import copy as copy_func
22
from datetime import datetime
3+
from itertools import zip_longest
34
import operator
45
from textwrap import dedent
56
from typing import (
@@ -11,6 +12,7 @@
1112
List,
1213
Optional,
1314
Sequence,
15+
Tuple,
1416
TypeVar,
1517
Union,
1618
)
@@ -2525,7 +2527,7 @@ def _get_reconciled_name_object(self, other):
25252527
"""
25262528
name = get_op_result_name(self, other)
25272529
if self.name != name:
2528-
return self._shallow_copy(name=name)
2530+
return self.rename(name)
25292531
return self
25302532

25312533
def _union_incompatible_dtypes(self, other, sort):
@@ -2633,7 +2635,9 @@ def union(self, other, sort=None):
26332635
if not self._can_union_without_object_cast(other):
26342636
return self._union_incompatible_dtypes(other, sort=sort)
26352637

2636-
return self._union(other, sort=sort)
2638+
result = self._union(other, sort=sort)
2639+
2640+
return self._wrap_setop_result(other, result)
26372641

26382642
def _union(self, other, sort):
26392643
"""
@@ -2655,10 +2659,10 @@ def _union(self, other, sort):
26552659
Index
26562660
"""
26572661
if not len(other) or self.equals(other):
2658-
return self._get_reconciled_name_object(other)
2662+
return self
26592663

26602664
if not len(self):
2661-
return other._get_reconciled_name_object(self)
2665+
return other
26622666

26632667
# TODO(EA): setops-refactor, clean all this up
26642668
lvals = self._values
@@ -2700,12 +2704,16 @@ def _union(self, other, sort):
27002704
stacklevel=3,
27012705
)
27022706

2703-
# for subclasses
2704-
return self._wrap_setop_result(other, result)
2707+
return self._shallow_copy(result)
27052708

27062709
def _wrap_setop_result(self, other, result):
27072710
name = get_op_result_name(self, other)
2708-
return self._shallow_copy(result, name=name)
2711+
if isinstance(result, Index):
2712+
if result.name != name:
2713+
return result.rename(name)
2714+
return result
2715+
else:
2716+
return self._shallow_copy(result, name=name)
27092717

27102718
# TODO: standardize return type of non-union setops type(self vs other)
27112719
def intersection(self, other, sort=False):
@@ -2775,15 +2783,12 @@ def intersection(self, other, sort=False):
27752783
indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0])
27762784
indexer = indexer[indexer != -1]
27772785

2778-
taken = other.take(indexer)
2779-
res_name = get_op_result_name(self, other)
2786+
result = other.take(indexer)
27802787

27812788
if sort is None:
2782-
taken = algos.safe_sort(taken.values)
2783-
return self._shallow_copy(taken, name=res_name)
2789+
result = algos.safe_sort(result.values)
27842790

2785-
taken.name = res_name
2786-
return taken
2791+
return self._wrap_setop_result(other, result)
27872792

27882793
def difference(self, other, sort=None):
27892794
"""
@@ -5968,3 +5973,22 @@ def _maybe_asobject(dtype, klass, data, copy: bool, name: Label, **kwargs):
59685973
return index.astype(object)
59695974

59705975
return klass(data, dtype=dtype, copy=copy, name=name, **kwargs)
5976+
5977+
5978+
def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]:
    """
    Return common name if all indices agree, otherwise None (level-by-level).

    Parameters
    ----------
    *indexes : Index
        The Index (or MultiIndex) objects whose ``names`` are compared.
        They are passed as separate positional arguments, not as a list.

    Returns
    -------
    tuple
        One entry per level. An entry is the name shared by every index at
        that level, or None when the names at that level do not all agree.
        The tuple is empty when no indexes are passed.

    Notes
    -----
    Indexes with fewer levels than the deepest input are treated as having
    the name None at the levels they lack, so such a level only keeps a name
    if every index actually defines the same one.
    """
    names_per_index = [tuple(index.names) for index in indexes]

    unanimous = []
    # zip_longest pads the shorter name tuples with None, which lines the
    # names up level-by-level even when the inputs differ in depth.
    for level_names in zip_longest(*names_per_index):
        distinct = set(level_names)
        # Exactly one distinct value means every index agrees at this level
        # (that value may itself be None when all of them are unnamed).
        unanimous.append(distinct.pop() if len(distinct) == 1 else None)
    return tuple(unanimous)

pandas/core/indexes/datetimelike.py

+10-12
Original file line numberDiff line numberDiff line change
@@ -719,33 +719,29 @@ def intersection(self, other, sort=False):
719719
"""
720720
self._validate_sort_keyword(sort)
721721
self._assert_can_do_setop(other)
722-
res_name = get_op_result_name(self, other)
723722

724723
if self.equals(other):
725724
return self._get_reconciled_name_object(other)
726725

727726
if len(self) == 0:
728-
return self.copy()
727+
return self.copy()._get_reconciled_name_object(other)
729728
if len(other) == 0:
730-
return other.copy()
729+
return other.copy()._get_reconciled_name_object(self)
731730

732731
if not isinstance(other, type(self)):
733732
result = Index.intersection(self, other, sort=sort)
734733
if isinstance(result, type(self)):
735734
if result.freq is None:
736735
# TODO: no tests rely on this; needed?
737736
result = result._with_freq("infer")
738-
result.name = res_name
739737
return result
740738

741739
elif not self._can_fast_intersect(other):
742740
result = Index.intersection(self, other, sort=sort)
743741
# We need to invalidate the freq because Index.intersection
744742
# uses _shallow_copy on a view of self._data, which will preserve
745743
# self.freq if we're not careful.
746-
result = result._with_freq(None)._with_freq("infer")
747-
result.name = res_name
748-
return result
744+
return result._with_freq(None)._with_freq("infer")
749745

750746
# to make our life easier, "sort" the two ranges
751747
if self[0] <= other[0]:
@@ -759,11 +755,13 @@ def intersection(self, other, sort=False):
759755
start = right[0]
760756

761757
if end < start:
762-
return type(self)(data=[], dtype=self.dtype, freq=self.freq, name=res_name)
758+
result = type(self)(data=[], dtype=self.dtype, freq=self.freq)
763759
else:
764760
lslice = slice(*left.slice_locs(start, end))
765761
left_chunk = left._values[lslice]
766-
return type(self)._simple_new(left_chunk, name=res_name)
762+
result = type(self)._simple_new(left_chunk)
763+
764+
return self._wrap_setop_result(other, result)
767765

768766
def _can_fast_intersect(self: _T, other: _T) -> bool:
769767
if self.freq is None:
@@ -858,7 +856,7 @@ def _fast_union(self, other, sort=None):
858856
# The can_fast_union check ensures that the result.freq
859857
# should match self.freq
860858
dates = type(self._data)(dates, freq=self.freq)
861-
result = type(self)._simple_new(dates, name=self.name)
859+
result = type(self)._simple_new(dates)
862860
return result
863861
else:
864862
return left
@@ -883,8 +881,8 @@ def _union(self, other, sort):
883881
result = result._with_freq("infer")
884882
return result
885883
else:
886-
i8self = Int64Index._simple_new(self.asi8, name=self.name)
887-
i8other = Int64Index._simple_new(other.asi8, name=other.name)
884+
i8self = Int64Index._simple_new(self.asi8)
885+
i8other = Int64Index._simple_new(other.asi8)
888886
i8result = i8self._union(i8other, sort=sort)
889887
result = type(self)(i8result, dtype=self.dtype, freq="infer")
890888
return result

pandas/core/indexes/datetimes.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
from pandas.core.arrays.datetimes import DatetimeArray, tz_to_dtype
2727
import pandas.core.common as com
28-
from pandas.core.indexes.base import Index, maybe_extract_name
28+
from pandas.core.indexes.base import Index, get_unanimous_names, maybe_extract_name
2929
from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin
3030
from pandas.core.indexes.extension import inherit_names
3131
from pandas.core.tools.times import to_time
@@ -405,6 +405,10 @@ def union_many(self, others):
405405
this = this._fast_union(other)
406406
else:
407407
this = Index.union(this, other)
408+
409+
res_name = get_unanimous_names(self, *others)[0]
410+
if this.name != res_name:
411+
return this.rename(res_name)
408412
return this
409413

410414
# --------------------------------------------------------------------

pandas/core/indexes/interval.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1023,7 +1023,7 @@ def intersection(
10231023
if sort is None:
10241024
taken = taken.sort_values()
10251025

1026-
return taken
1026+
return self._wrap_setop_result(other, taken)
10271027

10281028
def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex":
10291029
"""

pandas/core/indexes/multi.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,12 @@
4747
from pandas.core.arrays.categorical import factorize_from_iterables
4848
import pandas.core.common as com
4949
import pandas.core.indexes.base as ibase
50-
from pandas.core.indexes.base import Index, _index_shared_docs, ensure_index
50+
from pandas.core.indexes.base import (
51+
Index,
52+
_index_shared_docs,
53+
ensure_index,
54+
get_unanimous_names,
55+
)
5156
from pandas.core.indexes.frozen import FrozenList
5257
from pandas.core.indexes.numeric import Int64Index
5358
import pandas.core.missing as missing
@@ -3426,7 +3431,7 @@ def union(self, other, sort=None):
34263431
other, result_names = self._convert_can_do_setop(other)
34273432

34283433
if len(other) == 0 or self.equals(other):
3429-
return self
3434+
return self.rename(result_names)
34303435

34313436
# TODO: Index.union returns other when `len(self)` is 0.
34323437

@@ -3468,7 +3473,7 @@ def intersection(self, other, sort=False):
34683473
other, result_names = self._convert_can_do_setop(other)
34693474

34703475
if self.equals(other):
3471-
return self
3476+
return self.rename(result_names)
34723477

34733478
if not is_object_dtype(other.dtype):
34743479
# The intersection is empty
@@ -3539,7 +3544,7 @@ def difference(self, other, sort=None):
35393544
other, result_names = self._convert_can_do_setop(other)
35403545

35413546
if len(other) == 0:
3542-
return self
3547+
return self.rename(result_names)
35433548

35443549
if self.equals(other):
35453550
return MultiIndex(
@@ -3587,7 +3592,8 @@ def _convert_can_do_setop(self, other):
35873592
except TypeError as err:
35883593
raise TypeError(msg) from err
35893594
else:
3590-
result_names = self.names if self.names == other.names else None
3595+
result_names = get_unanimous_names(self, other)
3596+
35913597
return other, result_names
35923598

35933599
# --------------------------------------------------------------------

pandas/core/indexes/range.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,8 @@ def intersection(self, other, sort=False):
539539
new_index = new_index[::-1]
540540
if sort is None:
541541
new_index = new_index.sort_values()
542-
return new_index
542+
543+
return self._wrap_setop_result(other, new_index)
543544

544545
def _min_fitting_element(self, lower_limit: int) -> int:
545546
"""Returns the smallest element greater than or equal to the limit"""

pandas/core/reshape/concat.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
MultiIndex,
2424
all_indexes_same,
2525
ensure_index,
26-
get_consensus_names,
2726
get_objs_combined_axis,
27+
get_unanimous_names,
2828
)
2929
import pandas.core.indexes.base as ibase
3030
from pandas.core.internals import concatenate_block_managers
@@ -655,7 +655,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
655655
)
656656

657657
# also copies
658-
names = names + get_consensus_names(indexes)
658+
names = list(names) + list(get_unanimous_names(*indexes))
659659

660660
return MultiIndex(
661661
levels=levels, codes=codes_list, names=names, verify_integrity=False

0 commit comments

Comments
 (0)