From 602d57a68ee088857170f34e261a4ea76e4f3056 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 27 Jan 2020 14:00:15 -0800 Subject: [PATCH 1/4] BUG: consistently retain object dtype for set ops --- pandas/core/indexes/base.py | 8 ++++++-- pandas/core/indexes/category.py | 7 ------- pandas/core/indexes/datetimes.py | 2 +- pandas/tests/indexes/test_base.py | 20 ++++++++++++++++++++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4573fd7fc6da3..57392b4cf3a36 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -506,6 +506,7 @@ def _shallow_copy(self, values=None, **kwargs): values = self.values attributes = self._get_attributes_dict() + attributes.update(kwargs) if not len(values) and "dtype" not in kwargs: attributes["dtype"] = self.dtype @@ -2576,6 +2577,7 @@ def _union(self, other, sort): # worth making this faster? a very unusual case value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) + result = Index(result)._values # do type inference here else: # find indexes of things in "other" that are not in "self" if self.is_unique: @@ -2605,7 +2607,8 @@ def _union(self, other, sort): return self._wrap_setop_result(other, result) def _wrap_setop_result(self, other, result): - return self._constructor(result, name=get_op_result_name(self, other)) + name = get_op_result_name(self, other) + return self._shallow_copy(result, name=name) _index_shared_docs[ "intersection" @@ -2666,9 +2669,10 @@ def intersection(self, other, sort=False): if self.is_monotonic and other.is_monotonic: try: result = self._inner_indexer(lvals, rvals)[0] - return self._wrap_setop_result(other, result) except TypeError: pass + else: + return self._wrap_setop_result(other, result) try: indexer = Index(rvals).get_indexer(lvals) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 268ab9ba4e4c4..fe303bd931aeb 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -29,7 +29,6 @@ from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name from pandas.core.indexes.extension import ExtensionIndex import pandas.core.missing as missing -from pandas.core.ops import get_op_result_name _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) @@ -378,12 +377,6 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - def _wrap_setop_result(self, other, result): - name = get_op_result_name(self, other) - # We use _shallow_copy rather than the Index implementation - # (which uses _constructor) in order to preserve dtype. - return self._shallow_copy(result, name=name) - @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index b269239ed10ac..3afd1ff35806d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -377,7 +377,7 @@ def union_many(self, others): def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) - return self._shallow_copy(result, name=name, freq=None, tz=self.tz) + return self._shallow_copy(result, name=name, freq=None) # -------------------------------------------------------------------- diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e72963de09ab4..9c03adc8a10e0 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1047,6 +1047,26 @@ def test_setops_disallow_true(self, method): with pytest.raises(ValueError, match="The 'sort' keyword only takes"): getattr(idx1, method)(idx2, sort=True) + def test_setops_preserve_object_dtype(self): + idx = pd.Index([1, 2, 3], dtype=object) + result = idx.intersection(idx[1:]) + expected = idx[1:] + tm.assert_index_equal(result, expected) + + # if other is not monotonic increasing, intersection goes through + # a different route + result = idx.intersection(idx[1:][::-1]) + tm.assert_index_equal(result, expected) + + result = idx._union(idx[1:], sort=None) + expected = idx + tm.assert_index_equal(result, expected) + + # if other is not monotonic increasing, _union goes through + # a different route + result = idx._union(idx[1:][::-1], sort=None) + tm.assert_index_equal(result, expected) + def test_map_identity_mapping(self, indices): # GH 12766 tm.assert_index_equal(indices, indices.map(lambda x: x)) From 010508df0f38613b96cba64ff35cd1332098e0c1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 28 Jan 2020 09:55:58 -0800 Subject: [PATCH 2/4] make DTI._wrap_setop_result unnecessary --- pandas/core/indexes/datetimelike.py | 9 ++++----- pandas/core/indexes/datetimes.py | 9 --------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index b87dd0f02252f..225d9be2562d4 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -791,11 +791,10 @@ def _union(self, other, sort): if this._can_fast_union(other): return this._fast_union(other, sort=sort) else: - result = Index._union(this, other, sort=sort) - if isinstance(result, type(self)): - assert result._data.dtype == this.dtype - if result.freq is None: - result._set_freq("infer") + i8self = Int64Index._simple_new(self.asi8, name=self.name) + i8other = Int64Index._simple_new(other.asi8, name=other.name) + i8result = i8self._union(i8other, sort=sort) + result = type(self)(i8result, dtype=self.dtype, freq="infer") return result # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 3afd1ff35806d..6e434f5aa2235 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -367,18 +367,9 @@ def union_many(self, others): if this._can_fast_union(other): this = this._fast_union(other) else: - dtype = this.dtype this = Index.union(this, other) - if isinstance(this, DatetimeIndex): - # TODO: we shouldn't be setting attributes like this; - # in all the tests this equality already holds - this._data._dtype = dtype return this - def _wrap_setop_result(self, other, result): - name = get_op_result_name(self, other) - return self._shallow_copy(result, name=name, freq=None) - # -------------------------------------------------------------------- def _get_time_micros(self): From 1080b0e83fa0f950627fde1bb55fc41f138518d0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 30 Jan 2020 17:13:08 -0800 Subject: [PATCH 3/4] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 920919755dc23..a8c78430790cf 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -208,7 +208,7 @@ Other ^^^^^ - Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) -- +- Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`) .. --------------------------------------------------------------------------- From ec7b9677f8aecda3d9b3f6bcb8be68bcc9935a7c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 30 Jan 2020 21:07:35 -0800 Subject: [PATCH 4/4] test public method too --- pandas/tests/indexes/test_base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 9c03adc8a10e0..811bbe4eddfa9 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1062,11 +1062,17 @@ def test_setops_preserve_object_dtype(self): expected = idx tm.assert_index_equal(result, expected) + result = idx.union(idx[1:], sort=None) + tm.assert_index_equal(result, expected) + # if other is not monotonic increasing, _union goes through # a different route result = idx._union(idx[1:][::-1], sort=None) tm.assert_index_equal(result, expected) + result = idx.union(idx[1:][::-1], sort=None) + tm.assert_index_equal(result, expected) + def test_map_identity_mapping(self, indices): # GH 12766 tm.assert_index_equal(indices, indices.map(lambda x: x))