Skip to content

API/REGR: Convert to float for index union #27034

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 27, 2019
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions doc/source/user_guide/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1559,11 +1559,11 @@ See :ref:`Advanced Indexing <advanced>` for usage of MultiIndexes.
index.levels[1]
index.set_levels(["a", "b"], level=1)

.. _indexing.set_ops:

Set operations on Index objects
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. _indexing.set_ops:

The two main operations are ``union (|)`` and ``intersection (&)``.
These can be directly called as instance methods or used via overloaded
operators. Difference is provided via the ``.difference()`` method.
Expand Down Expand Up @@ -1592,11 +1592,22 @@ with duplicates dropped.

The resulting index from a set operation will be sorted in ascending order.

Missing values
~~~~~~~~~~~~~~
When performing :meth:`Index.union` between indexes with different dtypes, the indexes
must be cast to a common dtype. Typically, though not always, this is object dtype. The
exception is when performing a union between integer and float data. In this case, the
integer values are converted to float.

.. ipython:: python

   idx1 = pd.Index([0, 1, 2])
   idx2 = pd.Index([0.5, 1.5])
   idx1 | idx2

.. _indexing.missing:

Missing values
~~~~~~~~~~~~~~

.. important::

Even though ``Index`` can hold missing values (``NaN``), it should be avoided
Expand Down Expand Up @@ -1624,11 +1635,11 @@ Occasionally you will load or create a data set into a DataFrame and want to
add an index after you've already done so. There are a couple of different
ways.

.. _indexing.set_index:

Set an index
~~~~~~~~~~~~

.. _indexing.set_index:

DataFrame has a :meth:`~DataFrame.set_index` method which takes a column name
(for a regular ``Index``) or a list of column names (for a ``MultiIndex``).
To create a new, re-indexed DataFrame:
Expand Down
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,11 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`).
pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3]))
pd.Index([], dtype=object).union(pd.Index([1, 2, 3]))

Note that integer- and floating-dtype indexes are considered "compatible". The integer
values are coerced to floating point, which may result in loss of precision. See
:ref:`indexing.set_ops` for more.


``DataFrame`` groupby ffill/bfill no longer return group labels
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
46 changes: 43 additions & 3 deletions pandas/core/indexes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@

from pandas.core.dtypes.common import (
is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float,
is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype)
is_float_dtype, is_integer_dtype, is_scalar, needs_i8_conversion,
pandas_dtype)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.generic import ABCInt64Index, ABCRangeIndex
from pandas.core.dtypes.generic import (
ABCFloat64Index, ABCInt64Index, ABCRangeIndex, ABCUInt64Index)
from pandas.core.dtypes.missing import isna

from pandas.core import algorithms
Expand Down Expand Up @@ -123,6 +125,24 @@ def insert(self, loc, item):
item = self._na_value
return super().insert(loc, item)

def _union(self, other, sort):
    """
    Union with ``other``, special-casing integer/float combinations.

    Right now union(int, float) is treated a bit specially; see
    https://github.com/pandas-dev/pandas/issues/26778 for discussion.
    We may change union(int, float) to go to object.

    Resulting dtypes::

        float | [u]int -> float  (the special case)
        <T>   | <T>    -> T
        <T>   | <U>    -> object

    Parameters
    ----------
    other : Index-like
        Must expose ``dtype`` and ``astype``.
    sort
        Passed through unchanged to the ``_union`` call that does the work.

    Returns
    -------
    Whatever the delegated ``_union`` call returns.
    """
    needs_cast = (
        (is_integer_dtype(self.dtype) and is_float_dtype(other.dtype)) or
        (is_integer_dtype(other.dtype) and is_float_dtype(self.dtype))
    )
    if not needs_cast:
        return super()._union(other, sort)

    # Exactly one side is integer and the other float: bring both to float
    # and let the float-typed operand perform the union.
    first = self.astype("float")
    second = other.astype("float")
    return first._union(second, sort)


_num_index_shared_docs['class_descr'] = """
Immutable ndarray implementing an ordered, sliceable set. The basic object
Expand Down Expand Up @@ -225,7 +245,9 @@ def _assert_safe_casting(cls, data, subarr):
def _is_compatible_with_other(self, other):
    """
    Return whether ``self`` and ``other`` are compatible.

    True when the parent class already considers the pair compatible, or
    when both objects are Int64Index, Float64Index or RangeIndex instances.
    """
    # NOTE(review): this is the post-change form of the hunk (Float64Index
    # added to the accepted types) -- confirm against the merged file.
    return (
        super()._is_compatible_with_other(other)
        or all(isinstance(obj, (ABCInt64Index,
                                ABCFloat64Index,
                                ABCRangeIndex))
               for obj in [self, other])
    )

Expand Down Expand Up @@ -301,6 +323,14 @@ def _assert_safe_casting(cls, data, subarr):
raise TypeError('Unsafe NumPy casting, you must '
'explicitly cast')

def _is_compatible_with_other(self, other):
    """
    Return whether ``self`` and ``other`` are compatible.

    True when the parent class already considers the pair compatible, or
    when both objects are UInt64Index or Float64Index instances.
    """
    return (
        super()._is_compatible_with_other(other)
        or all(isinstance(obj, (ABCUInt64Index,
                                ABCFloat64Index))
               for obj in [self, other])
    )


UInt64Index._add_numeric_methods()
UInt64Index._add_logical_methods()
Expand Down Expand Up @@ -447,6 +477,16 @@ def isin(self, values, level=None):
self._validate_index_level(level)
return algorithms.isin(np.array(self), values)

def _is_compatible_with_other(self, other):
    """
    Return whether ``self`` and ``other`` are compatible.

    True when the parent class already considers the pair compatible, or
    when both objects are Int64Index, Float64Index, UInt64Index or
    RangeIndex instances.
    """
    return (
        super()._is_compatible_with_other(other)
        or all(isinstance(obj, (ABCInt64Index,
                                ABCFloat64Index,
                                ABCUInt64Index,
                                ABCRangeIndex))
               for obj in [self, other])
    )


Float64Index._add_numeric_methods()
Float64Index._add_logical_methods_disabled()
26 changes: 26 additions & 0 deletions pandas/tests/indexes/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -1118,3 +1118,29 @@ def test_join_outer(self):
tm.assert_index_equal(res, eres)
tm.assert_numpy_array_equal(lidx, elidx)
tm.assert_numpy_array_equal(ridx, eridx)


@pytest.mark.parametrize("dtype", ['int64', 'uint64'])
def test_int_float_union_dtype(dtype):
    # https://github.com/pandas-dev/pandas/issues/26778
    # [u]int | float -> float
    index = pd.Index([0, 2, 3], dtype=dtype)
    other = pd.Float64Index([0.5, 1.5])
    expected = pd.Float64Index([0.0, 0.5, 1.5, 2.0, 3.0])

    # Check both operand orders: the union must give the same float index
    # whether the integer index is on the left or on the right.
    result = index.union(other)
    tm.assert_index_equal(result, expected)

    result = other.union(index)
    tm.assert_index_equal(result, expected)


def test_range_float_union_dtype():
    # https://github.com/pandas-dev/pandas/issues/26778
    # The union of a RangeIndex and a float index is checked in both
    # operand orders against the same expected Float64Index.
    range_idx = pd.RangeIndex(start=0, stop=3)
    float_idx = pd.Float64Index([0.5, 1.5])
    expected = pd.Float64Index([0.0, 0.5, 1.0, 1.5, 2.0])

    tm.assert_index_equal(range_idx.union(float_idx), expected)
    tm.assert_index_equal(float_idx.union(range_idx), expected)
39 changes: 35 additions & 4 deletions pandas/tests/indexes/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
The tests in this package are to ensure the proper resultant dtypes of
set operations.
'''
from collections import OrderedDict
import itertools as it

import numpy as np
Expand All @@ -10,13 +11,17 @@
from pandas.core.dtypes.common import is_dtype_equal

import pandas as pd
from pandas import Int64Index, RangeIndex
from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index
from pandas.api.types import pandas_dtype
from pandas.tests.indexes.conftest import indices_list
import pandas.util.testing as tm

COMPATIBLE_INCONSISTENT_PAIRS = {
(Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex)
}
# Maps a pair of index classes to the factories used to build one index of
# each class, in the same order as the key.
COMPATIBLE_INCONSISTENT_PAIRS = OrderedDict([
    ((Int64Index, RangeIndex), (tm.makeIntIndex, tm.makeRangeIndex)),
    ((Float64Index, Int64Index), (tm.makeFloatIndex, tm.makeIntIndex)),
    # Build a real RangeIndex here; makeIntIndex would only repeat the
    # Float64Index/Int64Index case above.
    ((Float64Index, RangeIndex), (tm.makeFloatIndex, tm.makeRangeIndex)),
    ((Float64Index, UInt64Index), (tm.makeFloatIndex, tm.makeUIntIndex)),
])


@pytest.fixture(params=list(it.combinations(indices_list, 2)),
Expand Down Expand Up @@ -74,3 +79,29 @@ def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2):

assert res1.dtype in (idx1.dtype, idx2.dtype)
assert res2.dtype in (idx1.dtype, idx2.dtype)


@pytest.mark.parametrize('left, right, expected', [
    ('int64', 'int64', 'int64'),
    ('int64', 'uint64', 'object'),
    ('int64', 'float64', 'float64'),
    ('uint64', 'float64', 'float64'),
    ('uint64', 'uint64', 'uint64'),
    ('float64', 'float64', 'float64'),
    ('datetime64[ns]', 'int64', 'object'),
    ('datetime64[ns]', 'uint64', 'object'),
    ('datetime64[ns]', 'float64', 'object'),
    ('datetime64[ns, CET]', 'int64', 'object'),
    ('datetime64[ns, CET]', 'uint64', 'object'),
    ('datetime64[ns, CET]', 'float64', 'object'),
    ('Period[D]', 'int64', 'object'),
    ('Period[D]', 'uint64', 'object'),
    ('Period[D]', 'float64', 'object'),
])
def test_union_dtypes(left, right, expected):
    """Check the dtype of ``a | b`` for two empty indexes of the given dtypes."""
    a = pd.Index([], dtype=pandas_dtype(left))
    b = pd.Index([], dtype=pandas_dtype(right))
    assert (a | b).dtype == expected