diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 4ea7c656fd197..02522e95a2d79 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1559,11 +1559,11 @@ See :ref:`Advanced Indexing <advanced>` for usage of MultiIndexes. index.levels[1] index.set_levels(["a", "b"], level=1) +.. _indexing.set_ops: + Set operations on Index objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _indexing.set_ops: - The two main operations are ``union (|)`` and ``intersection (&)``. These can be directly called as instance methods or used via overloaded operators. Difference is provided via the ``.difference()`` method. @@ -1592,11 +1592,22 @@ with duplicates dropped. The resulting index from a set operation will be sorted in ascending order. -Missing values -~~~~~~~~~~~~~~ +When performing :meth:`Index.union` between indexes with different dtypes, the indexes +must be cast to a common dtype. Typically, though not always, this is object dtype. The +exception is when performing a union between integer and float data. In this case, the +integer values are converted to float. + +.. ipython:: python + + idx1 = pd.Index([0, 1, 2]) + idx2 = pd.Index([0.5, 1.5]) + idx1 | idx2 .. _indexing.missing: +Missing values +~~~~~~~~~~~~~~ + .. important:: Even though ``Index`` can hold missing values (``NaN``), it should be avoided @@ -1624,11 +1635,11 @@ Occasionally you will load or create a data set into a DataFrame and want to add an index after you've already done so. There are a couple of different ways. +.. _indexing.set_index: + Set an index ~~~~~~~~~~~~ -.. _indexing.set_index: - DataFrame has a :meth:`~DataFrame.set_index` method which takes a column name (for a regular ``Index``) or a list of column names (for a ``MultiIndex``). 
To create a new, re-indexed DataFrame: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 901e4f6942897..7c0e04d04f6ac 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -347,6 +347,11 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) +Note that integer- and floating-dtype indexes are considered "compatible". The integer +values are coerced to floating point, which may result in loss of precision. See +:ref:`indexing.set_ops` for more. + + ``DataFrame`` groupby ffill/bfill no longer return group labels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index b6c8ba588f9d6..a228895e527aa 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -7,9 +7,11 @@ from pandas.core.dtypes.common import ( is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float, - is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype) + is_float_dtype, is_integer_dtype, is_scalar, needs_i8_conversion, + pandas_dtype) import pandas.core.dtypes.concat as _concat -from pandas.core.dtypes.generic import ABCInt64Index, ABCRangeIndex +from pandas.core.dtypes.generic import ( + ABCFloat64Index, ABCInt64Index, ABCRangeIndex, ABCUInt64Index) from pandas.core.dtypes.missing import isna from pandas.core import algorithms @@ -123,6 +125,24 @@ def insert(self, loc, item): item = self._na_value return super().insert(loc, item) + def _union(self, other, sort): + # Right now, we treat union(int, float) a bit special. + # See https://github.com/pandas-dev/pandas/issues/26778 for discussion + # We may change union(int, float) to go to object. 
+ # float | [u]int -> float (the special case) + # <T> | <T> -> T + # <T> | <U> -> object + needs_cast = ( + (is_integer_dtype(self.dtype) and is_float_dtype(other.dtype)) or + (is_integer_dtype(other.dtype) and is_float_dtype(self.dtype)) + ) + if needs_cast: + first = self.astype("float") + second = other.astype("float") + return first._union(second, sort) + else: + return super()._union(other, sort) + _num_index_shared_docs['class_descr'] = """ Immutable ndarray implementing an ordered, sliceable set. The basic object @@ -225,7 +245,9 @@ def _assert_safe_casting(cls, data, subarr): def _is_compatible_with_other(self, other): return ( super()._is_compatible_with_other(other) - or all(isinstance(type(obj), (ABCInt64Index, ABCRangeIndex)) + or all(isinstance(type(obj), (ABCInt64Index, + ABCFloat64Index, + ABCRangeIndex)) for obj in [self, other]) ) @@ -301,6 +323,14 @@ def _assert_safe_casting(cls, data, subarr): raise TypeError('Unsafe NumPy casting, you must ' 'explicitly cast') + def _is_compatible_with_other(self, other): + return ( + super()._is_compatible_with_other(other) + or all(isinstance(type(obj), (ABCUInt64Index, + ABCFloat64Index)) + for obj in [self, other]) + ) + UInt64Index._add_numeric_methods() UInt64Index._add_logical_methods() @@ -447,6 +477,16 @@ def isin(self, values, level=None): self._validate_index_level(level) return algorithms.isin(np.array(self), values) + def _is_compatible_with_other(self, other): + return ( + super()._is_compatible_with_other(other) + or all(isinstance(type(obj), (ABCInt64Index, + ABCFloat64Index, + ABCUInt64Index, + ABCRangeIndex)) + for obj in [self, other]) + ) + Float64Index._add_numeric_methods() Float64Index._add_logical_methods_disabled() diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index c61e0fa6d6021..3437f501aa910 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -1118,3 +1118,29 @@ def test_join_outer(self): 
tm.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) + + +@pytest.mark.parametrize("dtype", ['int64', 'uint64']) +def test_int_float_union_dtype(dtype): + # https://github.com/pandas-dev/pandas/issues/26778 + # [u]int | float -> float + index = pd.Index([0, 2, 3], dtype=dtype) + other = pd.Float64Index([0.5, 1.5]) + expected = pd.Float64Index([0.0, 0.5, 1.5, 2.0, 3.0]) + result = index.union(other) + tm.assert_index_equal(result, expected) + + result = other.union(index) + tm.assert_index_equal(result, expected) + + +def test_range_float_union_dtype(): + # https://github.com/pandas-dev/pandas/issues/26778 + index = pd.RangeIndex(start=0, stop=3) + other = pd.Float64Index([0.5, 1.5]) + result = index.union(other) + expected = pd.Float64Index([0.0, 0.5, 1, 1.5, 2.0]) + tm.assert_index_equal(result, expected) + + result = other.union(index) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index b626ced2ccb1b..8c0762c7e7e5a 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -2,6 +2,7 @@ The tests in this package are to ensure the proper resultant dtypes of set operations. 
''' +from collections import OrderedDict import itertools as it import numpy as np @@ -10,13 +11,17 @@ from pandas.core.dtypes.common import is_dtype_equal import pandas as pd -from pandas import Int64Index, RangeIndex +from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index +from pandas.api.types import pandas_dtype from pandas.tests.indexes.conftest import indices_list import pandas.util.testing as tm -COMPATIBLE_INCONSISTENT_PAIRS = { - (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex) -} +COMPATIBLE_INCONSISTENT_PAIRS = OrderedDict([ + ((Int64Index, RangeIndex), (tm.makeIntIndex, tm.makeRangeIndex)), + ((Float64Index, Int64Index), (tm.makeFloatIndex, tm.makeIntIndex)), + ((Float64Index, RangeIndex), (tm.makeFloatIndex, tm.makeIntIndex)), + ((Float64Index, UInt64Index), (tm.makeFloatIndex, tm.makeUIntIndex)), +]) @pytest.fixture(params=list(it.combinations(indices_list, 2)), @@ -74,3 +79,29 @@ def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): assert res1.dtype in (idx1.dtype, idx2.dtype) assert res2.dtype in (idx1.dtype, idx2.dtype) + + +@pytest.mark.parametrize('left, right, expected', [ + ('int64', 'int64', 'int64'), + ('int64', 'uint64', 'object'), + ('int64', 'float64', 'float64'), + ('uint64', 'float64', 'float64'), + ('uint64', 'uint64', 'uint64'), + ('float64', 'float64', 'float64'), + ('datetime64[ns]', 'int64', 'object'), + ('datetime64[ns]', 'uint64', 'object'), + ('datetime64[ns]', 'float64', 'object'), + ('datetime64[ns, CET]', 'int64', 'object'), + ('datetime64[ns, CET]', 'uint64', 'object'), + ('datetime64[ns, CET]', 'float64', 'object'), + ('Period[D]', 'int64', 'object'), + ('Period[D]', 'uint64', 'object'), + ('Period[D]', 'float64', 'object'), +]) +def test_union_dtypes(left, right, expected): + left = pandas_dtype(left) + right = pandas_dtype(right) + a = pd.Index([], dtype=left) + b = pd.Index([], dtype=right) + result = (a | b).dtype + assert result == expected