Skip to content

Commit 20d0ad1

Browse files
ms7463TomAugspurger
authored andcommitted
ENH - Index set operation modifications to address issue #23525 (#23538)
1 parent d3a1912 commit 20d0ad1

21 files changed

+343
-161
lines changed

doc/source/whatsnew/v0.25.0.rst

+27
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,33 @@ returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwi
154154
Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will
155155
cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before.
156156

157+
.. _whatsnew_0250.api_breaking.incompatible_index_unions:
158+
159+
Incompatible Index Type Unions
160+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
161+
162+
When performing :func:`Index.union` operations between objects of incompatible dtypes,
163+
the result will be a base :class:`Index` of dtype ``object``. This behavior holds true for
164+
unions between :class:`Index` objects that previously would have been prohibited. The dtype
165+
of empty :class:`Index` objects will now be evaluated before performing union operations
166+
rather than simply returning the other :class:`Index` object. :func:`Index.union` can now be
167+
considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`).
168+
169+
*Previous Behavior*::
170+
171+
In [1]: pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3]))
172+
...
173+
ValueError: can only call with other PeriodIndex-ed objects
174+
175+
In [2]: pd.Index([], dtype=object).union(pd.Index([1, 2, 3]))
176+
Out[2]: Int64Index([1, 2, 3], dtype='int64')
177+
178+
*New Behavior*:
179+
180+
.. ipython:: python
181+
182+
pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3]))
183+
pd.Index([], dtype=object).union(pd.Index([1, 2, 3]))
157184
158185
``DataFrame`` groupby ffill/bfill no longer return group labels
159186
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

pandas/core/indexes/base.py

+87-17
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,10 @@
2020
ensure_categorical, ensure_int64, ensure_object, ensure_platform_int,
2121
is_bool, is_bool_dtype, is_categorical, is_categorical_dtype,
2222
is_datetime64_any_dtype, is_datetime64tz_dtype, is_dtype_equal,
23-
is_dtype_union_equal, is_extension_array_dtype, is_float, is_float_dtype,
24-
is_hashable, is_integer, is_integer_dtype, is_interval_dtype, is_iterator,
25-
is_list_like, is_object_dtype, is_period_dtype, is_scalar,
26-
is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype,
27-
pandas_dtype)
23+
is_extension_array_dtype, is_float, is_float_dtype, is_hashable,
24+
is_integer, is_integer_dtype, is_interval_dtype, is_iterator, is_list_like,
25+
is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype,
26+
is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype)
2827
import pandas.core.dtypes.concat as _concat
2928
from pandas.core.dtypes.generic import (
3029
ABCDataFrame, ABCDateOffset, ABCDatetimeArray, ABCIndexClass,
@@ -2262,6 +2261,47 @@ def _get_reconciled_name_object(self, other):
22622261
return self._shallow_copy(name=name)
22632262
return self
22642263

2264+
def _union_incompatible_dtypes(self, other, sort):
2265+
"""
2266+
Casts this and other index to object dtype to allow the formation
2267+
of a union between incompatible types.
2268+
2269+
Parameters
2270+
----------
2271+
other : Index or array-like
2272+
sort : False or None, default False
2273+
Whether to sort the resulting index.
2274+
2275+
* False : do not sort the result.
2276+
* None : sort the result, except when `self` and `other` are equal
2277+
or when the values cannot be compared.
2278+
2279+
Returns
2280+
-------
2281+
Index
2282+
"""
2283+
this = self.astype(object, copy=False)
2284+
# cast to Index for when `other` is list-like
2285+
other = Index(other).astype(object, copy=False)
2286+
return Index.union(this, other, sort=sort).astype(object, copy=False)
2287+
2288+
def _is_compatible_with_other(self, other):
2289+
"""
2290+
Check whether this and the other dtype are compatible with each other.
2291+
Meaning a union can be formed between them without needing to be cast
2292+
to dtype object.
2293+
2294+
Parameters
2295+
----------
2296+
other : Index or array-like
2297+
2298+
Returns
2299+
-------
2300+
bool
2301+
"""
2302+
return (type(self) is type(other)
2303+
and is_dtype_equal(self.dtype, other.dtype))
2304+
22652305
def _validate_sort_keyword(self, sort):
22662306
if sort not in [None, False]:
22672307
raise ValueError("The 'sort' keyword only takes the values of "
@@ -2271,6 +2311,11 @@ def union(self, other, sort=None):
22712311
"""
22722312
Form the union of two Index objects.
22732313
2314+
If the Index objects are incompatible, both Index objects will be
2315+
cast to dtype('object') first.
2316+
2317+
.. versionchanged:: 0.25.0
2318+
22742319
Parameters
22752320
----------
22762321
other : Index or array-like
@@ -2300,30 +2345,54 @@ def union(self, other, sort=None):
23002345
Examples
23012346
--------
23022347
2348+
Union matching dtypes
2349+
23032350
>>> idx1 = pd.Index([1, 2, 3, 4])
23042351
>>> idx2 = pd.Index([3, 4, 5, 6])
23052352
>>> idx1.union(idx2)
23062353
Int64Index([1, 2, 3, 4, 5, 6], dtype='int64')
2354+
2355+
Union mismatched dtypes
2356+
2357+
>>> idx1 = pd.Index(['a', 'b', 'c', 'd'])
2358+
>>> idx2 = pd.Index([1, 2, 3, 4])
2359+
>>> idx1.union(idx2)
2360+
Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object')
23072361
"""
23082362
self._validate_sort_keyword(sort)
23092363
self._assert_can_do_setop(other)
2310-
other = ensure_index(other)
23112364

2312-
if len(other) == 0 or self.equals(other):
2365+
if not self._is_compatible_with_other(other):
2366+
return self._union_incompatible_dtypes(other, sort=sort)
2367+
2368+
return self._union(other, sort=sort)
2369+
2370+
def _union(self, other, sort):
2371+
"""
2372+
Specific union logic should go here. In subclasses, union behavior
2373+
should be overwritten here rather than in `self.union`.
2374+
2375+
Parameters
2376+
----------
2377+
other : Index or array-like
2378+
sort : False or None, default False
2379+
Whether to sort the resulting index.
2380+
2381+
* False : do not sort the result.
2382+
* None : sort the result, except when `self` and `other` are equal
2383+
or when the values cannot be compared.
2384+
2385+
Returns
2386+
-------
2387+
Index
2388+
"""
2389+
2390+
if not len(other) or self.equals(other):
23132391
return self._get_reconciled_name_object(other)
23142392

2315-
if len(self) == 0:
2393+
if not len(self):
23162394
return other._get_reconciled_name_object(self)
23172395

2318-
# TODO: is_dtype_union_equal is a hack around
2319-
# 1. buggy set ops with duplicates (GH #13432)
2320-
# 2. CategoricalIndex lacking setops (GH #10186)
2321-
# Once those are fixed, this workaround can be removed
2322-
if not is_dtype_union_equal(self.dtype, other.dtype):
2323-
this = self.astype('O')
2324-
other = other.astype('O')
2325-
return this.union(other, sort=sort)
2326-
23272396
# TODO(EA): setops-refactor, clean all this up
23282397
if is_period_dtype(self) or is_datetime64tz_dtype(self):
23292398
lvals = self._ndarray_values
@@ -2370,6 +2439,7 @@ def union(self, other, sort=None):
23702439
def _wrap_setop_result(self, other, result):
    """
    Wrap the raw `result` of a set operation using ``self._constructor``.

    The name of the returned object is whatever ``get_op_result_name``
    yields for `self` and `other`.
    """
    result_name = get_op_result_name(self, other)
    return self._constructor(result, name=result_name)
23722441

2442+
# TODO: standardize return type of non-union setops type(self vs other)
23732443
def intersection(self, other, sort=False):
23742444
"""
23752445
Form the intersection of two Index objects.

pandas/core/indexes/datetimes.py

+4-30
Original file line numberDiff line numberDiff line change
@@ -451,35 +451,9 @@ def _formatter_func(self):
451451
# --------------------------------------------------------------------
452452
# Set Operation Methods
453453

454-
def union(self, other, sort=None):
455-
"""
456-
Specialized union for DatetimeIndex objects. If combine
457-
overlapping ranges with the same DateOffset, will be much
458-
faster than Index.union
459-
460-
Parameters
461-
----------
462-
other : DatetimeIndex or array-like
463-
sort : bool or None, default None
464-
Whether to sort the resulting Index.
465-
466-
* None : Sort the result, except when
467-
468-
1. `self` and `other` are equal.
469-
2. `self` or `other` has length 0.
470-
3. Some values in `self` or `other` cannot be compared.
471-
A RuntimeWarning is issued in this case.
472-
473-
* False : do not sort the result
474-
475-
.. versionadded:: 0.25.0
476-
477-
Returns
478-
-------
479-
y : Index or DatetimeIndex
480-
"""
481-
self._validate_sort_keyword(sort)
482-
self._assert_can_do_setop(other)
454+
def _union(self, other, sort):
455+
if not len(other) or self.equals(other) or not len(self):
456+
return super()._union(other, sort=sort)
483457

484458
if len(other) == 0 or self.equals(other) or len(self) == 0:
485459
return super().union(other, sort=sort)
@@ -495,7 +469,7 @@ def union(self, other, sort=None):
495469
if this._can_fast_union(other):
496470
return this._fast_union(other, sort=sort)
497471
else:
498-
result = Index.union(this, other, sort=sort)
472+
result = Index._union(this, other, sort=sort)
499473
if isinstance(result, DatetimeIndex):
500474
# TODO: we shouldn't be setting attributes like this;
501475
# in all the tests this equality already holds

pandas/core/indexes/interval.py

+12-14
Original file line numberDiff line numberDiff line change
@@ -964,19 +964,6 @@ def insert(self, loc, item):
964964
new_right = self.right.insert(loc, right_insert)
965965
return self._shallow_copy(new_left, new_right)
966966

967-
def _as_like_interval_index(self, other):
968-
self._assert_can_do_setop(other)
969-
other = ensure_index(other)
970-
if not isinstance(other, IntervalIndex):
971-
msg = ('the other index needs to be an IntervalIndex too, but '
972-
'was type {}').format(other.__class__.__name__)
973-
raise TypeError(msg)
974-
elif self.closed != other.closed:
975-
msg = ('can only do set operations between two IntervalIndex '
976-
'objects that are closed on the same side')
977-
raise ValueError(msg)
978-
return other
979-
980967
def _concat_same_dtype(self, to_concat, name):
981968
"""
982969
assert that we all have the same .closed
@@ -1092,7 +1079,17 @@ def overlaps(self, other):
10921079

10931080
def _setop(op_name, sort=None):
10941081
def func(self, other, sort=sort):
1095-
other = self._as_like_interval_index(other)
1082+
self._assert_can_do_setop(other)
1083+
other = ensure_index(other)
1084+
if not isinstance(other, IntervalIndex):
1085+
result = getattr(self.astype(object), op_name)(other)
1086+
if op_name in ('difference',):
1087+
result = result.astype(self.dtype)
1088+
return result
1089+
elif self.closed != other.closed:
1090+
msg = ('can only do set operations between two IntervalIndex '
1091+
'objects that are closed on the same side')
1092+
raise ValueError(msg)
10961093

10971094
# GH 19016: ensure set op will not return a prohibited dtype
10981095
subtypes = [self.dtype.subtype, other.dtype.subtype]
@@ -1114,6 +1111,7 @@ def func(self, other, sort=sort):
11141111

11151112
return type(self).from_tuples(result, closed=self.closed,
11161113
name=result_name)
1114+
11171115
return func
11181116

11191117
@property

pandas/core/indexes/numeric.py

+8
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float,
1010
is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype)
1111
import pandas.core.dtypes.concat as _concat
12+
from pandas.core.dtypes.generic import ABCInt64Index, ABCRangeIndex
1213
from pandas.core.dtypes.missing import isna
1314

1415
from pandas.core import algorithms
@@ -221,6 +222,13 @@ def _assert_safe_casting(cls, data, subarr):
221222
raise TypeError('Unsafe NumPy casting, you must '
222223
'explicitly cast')
223224

225+
def _is_compatible_with_other(self, other):
    """
    Check whether a union with `other` can be formed without casting to
    object dtype.

    In addition to the rule applied by the parent class, `self` and
    `other` are treated as compatible when each of them is an Int64Index
    or a RangeIndex.

    Parameters
    ----------
    other : Index or array-like

    Returns
    -------
    bool
    """
    # isinstance() must be given the object itself. Passing ``type(obj)``
    # asks whether the *class* is an instance of the ABC, which is not the
    # intended check.
    return (
        super()._is_compatible_with_other(other)
        or all(isinstance(obj, (ABCInt64Index, ABCRangeIndex))
               for obj in [self, other])
    )
231+
224232

225233
Int64Index._add_numeric_methods()
226234
Int64Index._add_logical_methods()

pandas/core/indexes/period.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,11 @@ def join(self, other, how='left', level=None, return_indexers=False,
791791
"""
792792
self._assert_can_do_setop(other)
793793

794+
if not isinstance(other, PeriodIndex):
795+
return self.astype(object).join(other, how=how, level=level,
796+
return_indexers=return_indexers,
797+
sort=sort)
798+
794799
result = Int64Index.join(self, other, how=how, level=level,
795800
return_indexers=return_indexers,
796801
sort=sort)
@@ -807,10 +812,9 @@ def intersection(self, other, sort=False):
807812
def _assert_can_do_setop(self, other):
808813
super()._assert_can_do_setop(other)
809814

810-
if not isinstance(other, PeriodIndex):
811-
raise ValueError('can only call with other PeriodIndex-ed objects')
812-
813-
if self.freq != other.freq:
815+
# *Can't* use PeriodIndexes of different freqs
816+
# *Can* use PeriodIndex/DatetimeIndex
817+
if isinstance(other, PeriodIndex) and self.freq != other.freq:
814818
msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
815819
own_freq=self.freqstr,
816820
other_freq=other.freqstr)

pandas/core/indexes/range.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,7 @@ def _extended_gcd(self, a, b):
470470
old_t, t = t, old_t - quotient * t
471471
return old_r, old_s, old_t
472472

473-
def union(self, other, sort=None):
473+
def _union(self, other, sort):
474474
"""
475475
Form the union of two Index objects and sorts if possible
476476
@@ -490,9 +490,8 @@ def union(self, other, sort=None):
490490
-------
491491
union : Index
492492
"""
493-
self._assert_can_do_setop(other)
494-
if len(other) == 0 or self.equals(other) or len(self) == 0:
495-
return super().union(other, sort=sort)
493+
if not len(other) or self.equals(other) or not len(self):
494+
return super()._union(other, sort=sort)
496495

497496
if isinstance(other, RangeIndex) and sort is None:
498497
start_s, step_s = self._start, self._step
@@ -530,8 +529,7 @@ def union(self, other, sort=None):
530529
(start_s + step_o >= start_o) and
531530
(end_s - step_o <= end_o)):
532531
return RangeIndex(start_r, end_r + step_o, step_o)
533-
534-
return self._int64index.union(other, sort=sort)
532+
return self._int64index._union(other, sort=sort)
535533

536534
@Appender(_index_shared_docs['join'])
537535
def join(self, other, how='left', level=None, return_indexers=False,

pandas/core/indexes/timedeltas.py

+3-18
Original file line numberDiff line numberDiff line change
@@ -329,24 +329,9 @@ def astype(self, dtype, copy=True):
329329
return Index(result.astype('i8'), name=self.name)
330330
return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy)
331331

332-
def union(self, other):
333-
"""
334-
Specialized union for TimedeltaIndex objects. If combine
335-
overlapping ranges with the same DateOffset, will be much
336-
faster than Index.union
337-
338-
Parameters
339-
----------
340-
other : TimedeltaIndex or array-like
341-
342-
Returns
343-
-------
344-
y : Index or TimedeltaIndex
345-
"""
346-
self._assert_can_do_setop(other)
347-
332+
def _union(self, other, sort):
348333
if len(other) == 0 or self.equals(other) or len(self) == 0:
349-
return super().union(other)
334+
return super()._union(other, sort=sort)
350335

351336
if not isinstance(other, TimedeltaIndex):
352337
try:
@@ -358,7 +343,7 @@ def union(self, other):
358343
if this._can_fast_union(other):
359344
return this._fast_union(other)
360345
else:
361-
result = Index.union(this, other)
346+
result = Index._union(this, other, sort=sort)
362347
if isinstance(result, TimedeltaIndex):
363348
if result.freq is None:
364349
result.freq = to_offset(result.inferred_freq)

0 commit comments

Comments
 (0)