Skip to content

BUG: set_index() and reset_index() not preserving object dtypes #30870

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 15 commits into from
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ Indexing
^^^^^^^^
- Bug in slicing on a :class:`DatetimeIndex` with a partial-timestamp dropping high-resolution indices near the end of a year, quarter, or month (:issue:`31064`)
- Bug in :meth:`PeriodIndex.get_loc` treating higher-resolution strings differently from :meth:`PeriodIndex.get_value` (:issue:`31172`)
- Bug in :meth:`DataFrame.set_index` not preserving column dtype when creating :class:`Index` from a single column (:issue:`30517`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move this to a sub-section; show a typical example of before & after (see how this is done in the Enhancements section)

- Bug in :meth:`DataFrame.reset_index` not preserving object dtype when resetting an :class:`Index` (:issue:`30517`)
- Bug in :meth:`Series.at` and :meth:`DataFrame.at` not matching ``.loc`` behavior when looking up an integer in a :class:`Float64Index` (:issue:`31329`)
- Bug in :meth:`PeriodIndex.is_monotonic` incorrectly returning ``True`` when containing leading ``NaT`` entries (:issue:`31437`)
-
Expand Down
19 changes: 13 additions & 6 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
ensure_platform_int,
infer_dtype_from_object,
is_bool_dtype,
is_datetime64_any_dtype,
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
Expand Down Expand Up @@ -109,9 +110,7 @@
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.multi import maybe_droplevels
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
from pandas.core.internals.construction import (
Expand Down Expand Up @@ -4307,6 +4306,7 @@ def set_index(
"one-dimensional arrays."
)

current_dtype = None
missing: List[Optional[Hashable]] = []
for col in keys:
if isinstance(
Expand All @@ -4320,6 +4320,16 @@ def set_index(
# everything else gets tried as a key; see GH 24969
try:
found = col in self.columns
if found:
# get current dtype to preserve through index creation,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does this all mean? why would we not always preserve?

# unless it's datetime64; too much functionality
# expects type coercion for dates
if not is_datetime64_any_dtype(self[col]):
try:
current_dtype = self.dtypes.get(col).type
except (AttributeError, TypeError):
# leave current_dtype=None if exception occurs
pass
except TypeError:
raise TypeError(f"{err_msg}. Received column of type {type(col)}")
else:
Expand Down Expand Up @@ -4375,7 +4385,7 @@ def set_index(
f"received array of length {len(arrays[-1])}"
)

index = ensure_index_from_sequences(arrays, names)
index = ensure_index_from_sequences(arrays, names, current_dtype)

if verify_integrity and not index.is_unique:
duplicates = index[index.duplicated()].unique()
Expand Down Expand Up @@ -4550,9 +4560,6 @@ class max type

def _maybe_casted_values(index, labels=None):
values = index._values
if not isinstance(index, (PeriodIndex, DatetimeIndex)):
if values.dtype == np.object_:
values = lib.maybe_convert_objects(values)

# if we have the labels, extract the values with a mask
if labels is not None:
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5447,7 +5447,7 @@ def shape(self):
Index._add_comparison_methods()


def ensure_index_from_sequences(sequences, names=None):
def ensure_index_from_sequences(sequences, names=None, dtype=None):
"""
Construct an index from sequences of data.

Expand All @@ -5458,6 +5458,7 @@ def ensure_index_from_sequences(sequences, names=None):
----------
sequences : sequence of sequences
names : sequence of str
dtype : NumPy dtype
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this a NumPy dtype? type dtype: Dtype


Returns
-------
Expand All @@ -5483,7 +5484,7 @@ def ensure_index_from_sequences(sequences, names=None):
if len(sequences) == 1:
if names is not None:
names = names[0]
return Index(sequences[0], name=names)
return Index(sequences[0], name=names, dtype=dtype)
else:
return MultiIndex.from_arrays(sequences, names=names)

Expand Down
1 change: 1 addition & 0 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def test_grouping_grouper(self, data_for_grouping):
tm.assert_numpy_array_equal(gr1.grouper, df.A.values)
tm.assert_extension_array_equal(gr2.grouper, data_for_grouping)

@pytest.mark.skip(reason="logic change to stop coercing dtypes on set_index()")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So you are skipping these because the existing tests are incompatible with the new logic right?

If that's correct then we should actually update the tests and not just skip

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I will refactor them. From what I could see looking through tests, it seemed to me like a lot of them are skipped rather than refactored.

Would best practice be to try and always refactor? Thanks for your feedback

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR is rarer than others as it intentionally is breaking existing behavior; in such case yes would want to update existing tests and not just skip them

Also needs a little more communication in whatsnew around backwards compatibility but can leave that until later; I think would be helpful to see tests updated for reviewers to have a better understanding of scope of changes this would cause

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, that make sense. I'll work on getting these tests updated so it is clearer what the scope of the change is. In general, these few I skipped were asserting two df's to be equal, which is true except of course for the index of each df were no longer equal, thus they failed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, you need to actually change the results

@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_extension_agg(self, as_index, data_for_grouping):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ def test_grouping_grouper(self, data_for_grouping):
tm.assert_numpy_array_equal(gr1.grouper, df.A.values)
tm.assert_extension_array_equal(gr2.grouper, data_for_grouping)

@pytest.mark.skip(reason="removed obj coercion from reset_index()")
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_extension_agg(self, as_index, data_for_grouping):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,7 @@ def test_set_index_dst(self):
exp = DataFrame({"b": [3, 4, 5]}, index=exp_index)
tm.assert_frame_equal(res, exp)

@pytest.mark.skip(reason="changes to type coercion logic in set_index()")
def test_reset_index_with_intervals(self):
idx = IntervalIndex.from_breaks(np.arange(11), name="x")
original = DataFrame({"x": idx, "y": np.arange(10)})[["x", "y"]]
Expand Down Expand Up @@ -1486,6 +1487,42 @@ def test_droplevel(self):
result = df.droplevel("level_2", axis="columns")
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("test_dtype", [object, "int64"])
def test_dtypes(self, test_dtype):
df = DataFrame({"A": Series([1, 2, 3], dtype=test_dtype), "B": [1, 2, 3]})
expected = df.dtypes.values[0].type

result = df.set_index("A").index.dtype.type
assert result == expected

@pytest.fixture
def mixed_series(self):
return Series([1, 2, 3, "apple", "corn"], dtype=object)

@pytest.fixture
def int_series(self):
return Series([100, 200, 300, 400, 500])

def test_dtypes_between_queries(self, mixed_series, int_series):
df = DataFrame({"item": mixed_series, "cost": int_series})

orig_dtypes = df.dtypes
item_dtype = orig_dtypes.get("item").type
cost_dtype = orig_dtypes.get("cost").type
expected = {"item": item_dtype, "cost": cost_dtype}

# after applying a query that would remove strings from the 'item' series with
# dtype: object, that series should remain as dtype: object as it becomes an
# index, and again as it becomes a column again after calling reset_index()
dtypes_transformed = (
df.query("cost < 400").set_index("item").reset_index().dtypes
)
item_dtype_transformed = dtypes_transformed.get("item").type
cost_dtype_transformed = dtypes_transformed.get("cost").type
result = {"item": item_dtype_transformed, "cost": cost_dtype_transformed}

assert result == expected


class TestIntervalIndex:
def test_setitem(self):
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def test_as_frame_columns(self):
ts = df["1/1/2000"]
tm.assert_series_equal(ts, df.iloc[:, 0])

@pytest.mark.skip(reason="removed type coercion from set_index()")
def test_frame_setitem(self):
rng = period_range("1/1/2000", periods=5, name="index")
df = DataFrame(np.random.randn(5, 3), index=rng)
Expand Down