-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: set_index() and reset_index() not preserving object dtypes #30870
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d37df39
c572d3a
cf2100f
5229cea
c6b02a4
d52bbe8
9c8439d
7eadae8
0fcd059
a33e307
5c65100
e3e27a3
23bab18
57f7e6d
85155d8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,6 +76,7 @@ | |
ensure_platform_int, | ||
infer_dtype_from_object, | ||
is_bool_dtype, | ||
is_datetime64_any_dtype, | ||
is_dict_like, | ||
is_dtype_equal, | ||
is_extension_array_dtype, | ||
|
@@ -109,9 +110,7 @@ | |
from pandas.core.generic import NDFrame, _shared_docs | ||
from pandas.core.indexes import base as ibase | ||
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences | ||
from pandas.core.indexes.datetimes import DatetimeIndex | ||
from pandas.core.indexes.multi import maybe_droplevels | ||
from pandas.core.indexes.period import PeriodIndex | ||
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable | ||
from pandas.core.internals import BlockManager | ||
from pandas.core.internals.construction import ( | ||
|
@@ -4307,6 +4306,7 @@ def set_index( | |
"one-dimensional arrays." | ||
) | ||
|
||
current_dtype = None | ||
missing: List[Optional[Hashable]] = [] | ||
for col in keys: | ||
if isinstance( | ||
|
@@ -4320,6 +4320,16 @@ def set_index( | |
# everything else gets tried as a key; see GH 24969 | ||
try: | ||
found = col in self.columns | ||
if found: | ||
# get current dtype to preserve through index creation, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does this all mean? why would we not always preserve? |
||
# unless it's datetime64; too much functionality | ||
# expects type coercion for dates | ||
if not is_datetime64_any_dtype(self[col]): | ||
try: | ||
current_dtype = self.dtypes.get(col).type | ||
except (AttributeError, TypeError): | ||
# leave current_dtype=None if exception occurs | ||
pass | ||
except TypeError: | ||
raise TypeError(f"{err_msg}. Received column of type {type(col)}") | ||
else: | ||
|
@@ -4375,7 +4385,7 @@ def set_index( | |
f"received array of length {len(arrays[-1])}" | ||
) | ||
|
||
index = ensure_index_from_sequences(arrays, names) | ||
index = ensure_index_from_sequences(arrays, names, current_dtype) | ||
trevorbye marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if verify_integrity and not index.is_unique: | ||
duplicates = index[index.duplicated()].unique() | ||
|
@@ -4550,9 +4560,6 @@ class max type | |
|
||
def _maybe_casted_values(index, labels=None): | ||
values = index._values | ||
if not isinstance(index, (PeriodIndex, DatetimeIndex)): | ||
trevorbye marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if values.dtype == np.object_: | ||
values = lib.maybe_convert_objects(values) | ||
|
||
# if we have the labels, extract the values with a mask | ||
if labels is not None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5447,7 +5447,7 @@ def shape(self): | |
Index._add_comparison_methods() | ||
|
||
|
||
def ensure_index_from_sequences(sequences, names=None): | ||
def ensure_index_from_sequences(sequences, names=None, dtype=None): | ||
""" | ||
Construct an index from sequences of data. | ||
|
||
|
@@ -5458,6 +5458,7 @@ def ensure_index_from_sequences(sequences, names=None): | |
---------- | ||
sequences : sequence of sequences | ||
names : sequence of str | ||
dtype : NumPy dtype | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this a NumPy dtype? type dtype: Dtype |
||
|
||
Returns | ||
------- | ||
|
@@ -5483,7 +5484,7 @@ def ensure_index_from_sequences(sequences, names=None): | |
if len(sequences) == 1: | ||
if names is not None: | ||
names = names[0] | ||
return Index(sequences[0], name=names) | ||
return Index(sequences[0], name=names, dtype=dtype) | ||
else: | ||
return MultiIndex.from_arrays(sequences, names=names) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ def test_grouping_grouper(self, data_for_grouping): | |
tm.assert_numpy_array_equal(gr1.grouper, df.A.values) | ||
tm.assert_extension_array_equal(gr2.grouper, data_for_grouping) | ||
|
||
@pytest.mark.skip(reason="logic change to stop coercing dtypes on set_index()") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So you are skipping these because the existing tests are incompatible with the new logic right? If that's correct then we should actually update the tests and not just skip There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I will refactor them. From what I could see looking through tests, it seemed to me like a lot of them are skipped rather than refactored. Would best practice be to try and always refactor? Thanks for your feedback There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This PR is rarer than others as it intentionally is breaking existing behavior; in such case yes would want to update existing tests and not just skip them Also needs a little more communication in whatsnew around backwards compatibility but can leave that until later; I think would be helpful to see tests updated for reviewers to have a better understanding of scope of changes this would cause There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay, that make sense. I'll work on getting these tests updated so it is clearer what the scope of the change is. In general, these few I skipped were asserting two df's to be equal, which is true except of course for the index of each df were no longer equal, thus they failed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, you need to actually change the results |
||
@pytest.mark.parametrize("as_index", [True, False]) | ||
def test_groupby_extension_agg(self, as_index, data_for_grouping): | ||
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
move this to a sub-section; show a typical example of before & after (see how this is done in the Enhancements section)