Skip to content

Commit b80df7b

Browse files
TomAugspurger, jreback
authored and committed
API/REGR: Convert to float for index union (#27034)
1 parent c1673cf commit b80df7b

File tree

5 files changed

+126
-13
lines changed

5 files changed

+126
-13
lines changed

doc/source/user_guide/indexing.rst

+17-6
Original file line numberDiff line numberDiff line change
@@ -1559,11 +1559,11 @@ See :ref:`Advanced Indexing <advanced>` for usage of MultiIndexes.
15591559
index.levels[1]
15601560
index.set_levels(["a", "b"], level=1)
15611561
1562+
.. _indexing.set_ops:
1563+
15621564
Set operations on Index objects
15631565
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
15641566

1565-
.. _indexing.set_ops:
1566-
15671567
The two main operations are ``union (|)`` and ``intersection (&)``.
15681568
These can be directly called as instance methods or used via overloaded
15691569
operators. Difference is provided via the ``.difference()`` method.
@@ -1592,11 +1592,22 @@ with duplicates dropped.
15921592

15931593
The resulting index from a set operation will be sorted in ascending order.
15941594

1595-
Missing values
1596-
~~~~~~~~~~~~~~
1595+
When performing :meth:`Index.union` between indexes with different dtypes, the indexes
1596+
must be cast to a common dtype. Typically, though not always, this is object dtype. The
1597+
exception is when performing a union between integer and float data. In this case, the
1598+
integer values are converted to float.
1599+
1600+
.. ipython:: python
1601+
1602+
idx1 = pd.Index([0, 1, 2])
1603+
idx2 = pd.Index([0.5, 1.5])
1604+
idx1 | idx2
15971605
15981606
.. _indexing.missing:
15991607

1608+
Missing values
1609+
~~~~~~~~~~~~~~
1610+
16001611
.. important::
16011612

16021613
Even though ``Index`` can hold missing values (``NaN``), it should be avoided
@@ -1624,11 +1635,11 @@ Occasionally you will load or create a data set into a DataFrame and want to
16241635
add an index after you've already done so. There are a couple of different
16251636
ways.
16261637

1638+
.. _indexing.set_index:
1639+
16271640
Set an index
16281641
~~~~~~~~~~~~
16291642

1630-
.. _indexing.set_index:
1631-
16321643
DataFrame has a :meth:`~DataFrame.set_index` method which takes a column name
16331644
(for a regular ``Index``) or a list of column names (for a ``MultiIndex``).
16341645
To create a new, re-indexed DataFrame:

doc/source/whatsnew/v0.25.0.rst

+5
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,11 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`).
347347
pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3]))
348348
pd.Index([], dtype=object).union(pd.Index([1, 2, 3]))
349349
350+
Note that integer- and floating-dtype indexes are considered "compatible". The integer
351+
values are coerced to floating point, which may result in loss of precision. See
352+
:ref:`indexing.set_ops` for more.
353+
354+
350355
``DataFrame`` groupby ffill/bfill no longer return group labels
351356
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
352357

pandas/core/indexes/numeric.py

+43-3
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@
77

88
from pandas.core.dtypes.common import (
99
is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float,
10-
is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype)
10+
is_float_dtype, is_integer_dtype, is_scalar, needs_i8_conversion,
11+
pandas_dtype)
1112
import pandas.core.dtypes.concat as _concat
12-
from pandas.core.dtypes.generic import ABCInt64Index, ABCRangeIndex
13+
from pandas.core.dtypes.generic import (
14+
ABCFloat64Index, ABCInt64Index, ABCRangeIndex, ABCUInt64Index)
1315
from pandas.core.dtypes.missing import isna
1416

1517
from pandas.core import algorithms
@@ -123,6 +125,24 @@ def insert(self, loc, item):
123125
item = self._na_value
124126
return super().insert(loc, item)
125127

128+
def _union(self, other, sort):
    """
    Compute the union with ``other``, special-casing integer/float pairs.

    Dtype resolution implemented here::

        float | [u]int -> float   (both operands are cast to float first)
        anything else  -> deferred to the parent class ``_union``

    Parameters
    ----------
    other : Index-like
        Right-hand operand; only its ``dtype`` and ``astype`` are used here.
    sort
        Passed through unchanged to the ``_union`` call that does the work.

    Returns
    -------
    The result of the delegated ``_union`` call.
    """
    # union(int, float) is deliberately treated as special for now and may
    # later be changed to go to object dtype instead.
    # See https://github.com/pandas-dev/pandas/issues/26778 for discussion.
    own_dtype = self.dtype
    other_dtype = other.dtype
    int_with_float = (
        is_integer_dtype(own_dtype) and is_float_dtype(other_dtype)
    )
    float_with_int = (
        is_float_dtype(own_dtype) and is_integer_dtype(other_dtype)
    )

    if not (int_with_float or float_with_int):
        # <T> | <T> -> T and <T> | <U> -> object are handled by the parent.
        return super()._union(other, sort)

    # Mixed integer/float: promote both sides to float, then union those.
    left_as_float = self.astype("float")
    right_as_float = other.astype("float")
    return left_as_float._union(right_as_float, sort)
145+
126146

127147
_num_index_shared_docs['class_descr'] = """
128148
Immutable ndarray implementing an ordered, sliceable set. The basic object
@@ -225,7 +245,9 @@ def _assert_safe_casting(cls, data, subarr):
225245
def _is_compatible_with_other(self, other):
226246
return (
227247
super()._is_compatible_with_other(other)
228-
or all(isinstance(type(obj), (ABCInt64Index, ABCRangeIndex))
248+
or all(isinstance(type(obj), (ABCInt64Index,
249+
ABCFloat64Index,
250+
ABCRangeIndex))
229251
for obj in [self, other])
230252
)
231253

@@ -301,6 +323,14 @@ def _assert_safe_casting(cls, data, subarr):
301323
raise TypeError('Unsafe NumPy casting, you must '
302324
'explicitly cast')
303325

326+
def _is_compatible_with_other(self, other):
    """
    Report whether ``self`` and ``other`` count as compatible.

    The parent-class check is consulted first and its result is returned
    when truthy. Otherwise the pair is accepted only if the *type* of each
    operand is recognised as an unsigned-integer or float index.
    """
    inherited = super()._is_compatible_with_other(other)
    if inherited:
        return inherited

    accepted_kinds = (ABCUInt64Index, ABCFloat64Index)
    # The ABC check is applied to type(obj), not to the instance itself.
    return all(
        isinstance(type(operand), accepted_kinds)
        for operand in (self, other)
    )
333+
304334

305335
UInt64Index._add_numeric_methods()
306336
UInt64Index._add_logical_methods()
@@ -447,6 +477,16 @@ def isin(self, values, level=None):
447477
self._validate_index_level(level)
448478
return algorithms.isin(np.array(self), values)
449479

480+
def _is_compatible_with_other(self, other):
    """
    Report whether ``self`` and ``other`` count as compatible.

    The parent-class check is consulted first and its result is returned
    when truthy. Otherwise the pair is accepted only if the *type* of each
    operand is recognised as a signed-integer, float, unsigned-integer or
    range index.
    """
    inherited = super()._is_compatible_with_other(other)
    if inherited:
        return inherited

    accepted_kinds = (
        ABCInt64Index,
        ABCFloat64Index,
        ABCUInt64Index,
        ABCRangeIndex,
    )
    # The ABC check is applied to type(obj), not to the instance itself.
    return all(
        isinstance(type(operand), accepted_kinds)
        for operand in (self, other)
    )
489+
450490

451491
Float64Index._add_numeric_methods()
452492
Float64Index._add_logical_methods_disabled()

pandas/tests/indexes/test_numeric.py

+26
Original file line numberDiff line numberDiff line change
@@ -1118,3 +1118,29 @@ def test_join_outer(self):
11181118
tm.assert_index_equal(res, eres)
11191119
tm.assert_numpy_array_equal(lidx, elidx)
11201120
tm.assert_numpy_array_equal(ridx, eridx)
1121+
1122+
1123+
@pytest.mark.parametrize("dtype", ['int64', 'uint64'])
def test_int_float_union_dtype(dtype):
    """Union of a [u]int index with a float index is float, in both orders."""
    # https://github.com/pandas-dev/pandas/issues/26778
    # [u]int | float -> float
    int_index = pd.Index([0, 2, 3], dtype=dtype)
    float_index = pd.Float64Index([0.5, 1.5])
    expected = pd.Float64Index([0.0, 0.5, 1.5, 2.0, 3.0])

    # Check int.union(float) first, then float.union(int).
    for left, right in ((int_index, float_index), (float_index, int_index)):
        tm.assert_index_equal(left.union(right), expected)
1135+
1136+
1137+
def test_range_float_union_dtype():
    """Union of a RangeIndex with a float index is float, in both orders."""
    # https://github.com/pandas-dev/pandas/issues/26778
    range_index = pd.RangeIndex(start=0, stop=3)
    float_index = pd.Float64Index([0.5, 1.5])
    expected = pd.Float64Index([0.0, 0.5, 1.0, 1.5, 2.0])

    # Check range.union(float) first, then float.union(range).
    for left, right in ((range_index, float_index),
                        (float_index, range_index)):
        tm.assert_index_equal(left.union(right), expected)

pandas/tests/indexes/test_setops.py

+35-4
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
The tests in this package are to ensure the proper resultant dtypes of
33
set operations.
44
'''
5+
from collections import OrderedDict
56
import itertools as it
67

78
import numpy as np
@@ -10,13 +11,17 @@
1011
from pandas.core.dtypes.common import is_dtype_equal
1112

1213
import pandas as pd
13-
from pandas import Int64Index, RangeIndex
14+
from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index
15+
from pandas.api.types import pandas_dtype
1416
from pandas.tests.indexes.conftest import indices_list
1517
import pandas.util.testing as tm
1618

17-
COMPATIBLE_INCONSISTENT_PAIRS = {
18-
(Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex)
19-
}
19+
# Maps a pair of index classes whose union is "compatible" despite differing
# types to the pair of factories that build one index of each class.
# Each factory must produce the class named at the same position in its key.
COMPATIBLE_INCONSISTENT_PAIRS = OrderedDict([
    ((Int64Index, RangeIndex), (tm.makeIntIndex, tm.makeRangeIndex)),
    ((Float64Index, Int64Index), (tm.makeFloatIndex, tm.makeIntIndex)),
    # Build a real RangeIndex here; makeIntIndex would only repeat the
    # Float64Index/Int64Index case above.
    ((Float64Index, RangeIndex), (tm.makeFloatIndex, tm.makeRangeIndex)),
    ((Float64Index, UInt64Index), (tm.makeFloatIndex, tm.makeUIntIndex)),
])
2025

2126

2227
@pytest.fixture(params=list(it.combinations(indices_list, 2)),
@@ -74,3 +79,29 @@ def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2):
7479

7580
assert res1.dtype in (idx1.dtype, idx2.dtype)
7681
assert res2.dtype in (idx1.dtype, idx2.dtype)
82+
83+
84+
@pytest.mark.parametrize('left, right, expected', [
    ('int64', 'int64', 'int64'),
    ('int64', 'uint64', 'object'),
    ('int64', 'float64', 'float64'),
    ('uint64', 'float64', 'float64'),
    ('uint64', 'uint64', 'uint64'),
    ('float64', 'float64', 'float64'),
    ('datetime64[ns]', 'int64', 'object'),
    ('datetime64[ns]', 'uint64', 'object'),
    ('datetime64[ns]', 'float64', 'object'),
    ('datetime64[ns, CET]', 'int64', 'object'),
    ('datetime64[ns, CET]', 'uint64', 'object'),
    ('datetime64[ns, CET]', 'float64', 'object'),
    ('Period[D]', 'int64', 'object'),
    ('Period[D]', 'uint64', 'object'),
    ('Period[D]', 'float64', 'object'),
])
def test_union_dtypes(left, right, expected):
    """The dtype of ``a | b`` for two empty indexes matches the table above."""
    left_dtype = pandas_dtype(left)
    right_dtype = pandas_dtype(right)

    # Empty indexes keep the test about dtype resolution only, not values.
    empty_left = pd.Index([], dtype=left_dtype)
    empty_right = pd.Index([], dtype=right_dtype)

    union_dtype = (empty_left | empty_right).dtype
    assert union_dtype == expected

0 commit comments

Comments
 (0)