Skip to content

BUG/API: DTI/TDI/PI.insert cast to object on failure #39068

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jan 21, 2021
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ Indexing
- Bug in :meth:`DataFrame.loc` dropping levels of :class:`MultiIndex` when :class:`DataFrame` used as input has only one row (:issue:`10521`)
- Bug in setting ``timedelta64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`)
- Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`)
- Bug in incorrectly raising when setting a new column that cannot be held in the existing ``frame.columns`` instead of casting to a compatible dtype (:issue:`39068`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm isn't this directly Index.insert ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, could flesh this out (.reset_index is also affected)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah i think its worth being more explict here as this is a non-trivial change


Missing
^^^^^^^
Expand Down
1 change: 1 addition & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5716,6 +5716,7 @@ def insert(self, loc: int, item):
"""
# Note: this method is overridden by all ExtensionIndex subclasses,
# so self is never backed by an EA.
item = lib.item_from_zerodim(item)

try:
item = self._validate_fill_value(item)
Expand Down
18 changes: 5 additions & 13 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,11 @@ def delete(self, loc):

@doc(NDArrayBackedExtensionIndex.insert)
def insert(self, loc: int, item):
result = super().insert(loc, item)
try:
result = super().insert(loc, item)
except (ValueError, TypeError):
# i.e. self._data._validate_scalar raised
return self.astype(object).insert(loc, item)

result._data._freq = self._get_insert_freq(loc, item)
return result
Expand Down Expand Up @@ -865,15 +869,3 @@ def join(
def _maybe_utc_convert(self: _T, other: Index) -> Tuple[_T, Index]:
# Overridden by DatetimeIndex
return self, other

# --------------------------------------------------------------------
# List-Like Methods

@Appender(DatetimeIndexOpsMixin.insert.__doc__)
def insert(self, loc, item):
if isinstance(item, str):
# TODO: Why are strings special?
# TODO: Should we attempt _scalar_from_string?
return self.astype(object).insert(loc, item)

return DatetimeIndexOpsMixin.insert(self, loc, item)
6 changes: 0 additions & 6 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,12 +422,6 @@ def inferred_type(self) -> str:
# indexing
return "period"

def insert(self, loc: int, item):
if not isinstance(item, Period) or self.freq != item.freq:
return self.astype(object).insert(loc, item)

return DatetimeIndexOpsMixin.insert(self, loc, item)

def join(self, other, how="left", level=None, return_indexers=False, sort=False):
"""
See Index.join
Expand Down
33 changes: 30 additions & 3 deletions pandas/tests/frame/methods/test_reset_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,18 +314,45 @@ def test_reset_index_multiindex_nan(self):
rs = df.set_index(["A", "B"]).reset_index()
tm.assert_frame_equal(rs, df)

def test_reset_index_with_datetimeindex_cols(self):
@pytest.mark.parametrize(
"name",
[
None,
"foo",
2,
3.0,
pd.Timedelta(6),
Timestamp("2012-12-30", tz="UTC"),
"2012-12-31",
],
)
def test_reset_index_with_datetimeindex_cols(self, name):
# GH#5818
warn = None
if isinstance(name, Timestamp) and name.tz is not None:
# _deprecate_mismatched_indexing
warn = FutureWarning

df = DataFrame(
[[1, 2], [3, 4]],
columns=date_range("1/1/2013", "1/2/2013"),
index=["A", "B"],
)
df.index.name = name

with tm.assert_produces_warning(warn, check_stacklevel=False):
result = df.reset_index()

item = name if name is not None else "index"
columns = Index([item, datetime(2013, 1, 1), datetime(2013, 1, 2)])
if isinstance(item, str) and item == "2012-12-31":
columns = columns.astype("datetime64[ns]")
else:
assert columns.dtype == object

result = df.reset_index()
expected = DataFrame(
[["A", 1, 2], ["B", 3, 4]],
columns=["index", datetime(2013, 1, 1), datetime(2013, 1, 2)],
columns=columns,
)
tm.assert_frame_equal(result, expected)

Expand Down
92 changes: 72 additions & 20 deletions pandas/tests/indexes/datetimes/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ def test_insert_nat(self, tz, null):
@pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"])
def test_insert_invalid_na(self, tz):
idx = DatetimeIndex(["2017-01-01"], tz=tz)
msg = "value should be a 'Timestamp' or 'NaT'. Got 'timedelta64' instead."
with pytest.raises(TypeError, match=msg):
idx.insert(0, np.timedelta64("NaT"))

item = np.timedelta64("NaT")
result = idx.insert(0, item)
expected = Index([item] + list(idx), dtype=object)
tm.assert_index_equal(result, expected)

def test_insert_empty_preserves_freq(self, tz_naive_fixture):
# GH#33573
Expand Down Expand Up @@ -114,17 +116,6 @@ def test_insert(self):
assert result.name == expected.name
assert result.freq is None

# see gh-7299
idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx")
with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"):
idx.insert(3, Timestamp("2000-01-04"))
with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"):
idx.insert(3, datetime(2000, 1, 4))
with pytest.raises(ValueError, match="Timezones don't match"):
idx.insert(3, Timestamp("2000-01-04", tz="US/Eastern"))
with pytest.raises(ValueError, match="Timezones don't match"):
idx.insert(3, datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern")))

for tz in ["US/Pacific", "Asia/Singapore"]:
idx = date_range("1/1/2000 09:00", periods=6, freq="H", tz=tz, name="idx")
# preserve freq
Expand Down Expand Up @@ -167,6 +158,48 @@ def test_insert(self):
assert result.tz == expected.tz
assert result.freq is None

# TODO: also changes DataFrame.__setitem__ with expansion
def test_insert_mismatched_tzawareness(self):
# see GH#7299
idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx")

# mismatched tz-awareness
item = Timestamp("2000-01-04")
result = idx.insert(3, item)
expected = Index(
list(idx[:3]) + [item] + list(idx[3:]), dtype=object, name="idx"
)
tm.assert_index_equal(result, expected)

# mismatched tz-awareness
item = datetime(2000, 1, 4)
result = idx.insert(3, item)
expected = Index(
list(idx[:3]) + [item] + list(idx[3:]), dtype=object, name="idx"
)
tm.assert_index_equal(result, expected)

# TODO: also changes DataFrame.__setitem__ with expansion
def test_insert_mismatched_tz(self):
# see GH#7299
idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx")

# mismatched tz -> cast to object (could reasonably cast to same tz or UTC)
item = Timestamp("2000-01-04", tz="US/Eastern")
result = idx.insert(3, item)
expected = Index(
list(idx[:3]) + [item] + list(idx[3:]), dtype=object, name="idx"
)
tm.assert_index_equal(result, expected)

# mismatched tz -> cast to object (could reasonably cast to same tz)
item = datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern"))
result = idx.insert(3, item)
expected = Index(
list(idx[:3]) + [item] + list(idx[3:]), dtype=object, name="idx"
)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize(
"item", [0, np.int64(0), np.float64(0), np.array(0), np.timedelta64(456)]
)
Expand All @@ -175,17 +208,36 @@ def test_insert_mismatched_types_raises(self, tz_aware_fixture, item):
tz = tz_aware_fixture
dti = date_range("2019-11-04", periods=9, freq="-1D", name=9, tz=tz)

msg = "value should be a 'Timestamp' or 'NaT'. Got '.*' instead"
with pytest.raises(TypeError, match=msg):
dti.insert(1, item)
result = dti.insert(1, item)

if isinstance(item, np.ndarray):
# FIXME: without doing .item() here this segfaults
assert item.item() == 0
expected = Index([dti[0], 0] + list(dti[1:]), dtype=object, name=9)
else:
expected = Index([dti[0], item] + list(dti[1:]), dtype=object, name=9)

tm.assert_index_equal(result, expected)

def test_insert_object_casting(self, tz_aware_fixture):
def test_insert_castable_str(self, tz_aware_fixture):
# GH#33703
tz = tz_aware_fixture
dti = date_range("2019-11-04", periods=3, freq="-1D", name=9, tz=tz)

# ATM we treat this as a string, but we could plausibly wrap it in Timestamp
value = "2019-11-05"
result = dti.insert(0, value)
expected = Index(["2019-11-05"] + list(dti), dtype=object, name=9)

ts = Timestamp(value).tz_localize(tz)
expected = DatetimeIndex([ts] + list(dti), dtype=dti.dtype, name=9)
tm.assert_index_equal(result, expected)

def test_insert_non_castable_str(self, tz_aware_fixture):
# GH#33703
tz = tz_aware_fixture
dti = date_range("2019-11-04", periods=3, freq="-1D", name=9, tz=tz)

value = "foo"
result = dti.insert(0, value)

expected = Index(["foo"] + list(dti), dtype=object, name=9)
tm.assert_index_equal(result, expected)
41 changes: 30 additions & 11 deletions pandas/tests/indexes/timedeltas/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pytest

from pandas._libs import lib

import pandas as pd
from pandas import Index, Timedelta, TimedeltaIndex, timedelta_range
import pandas._testing as tm
Expand Down Expand Up @@ -79,9 +81,14 @@ def test_insert_nat(self, null):

def test_insert_invalid_na(self):
idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx")
msg = r"value should be a 'Timedelta' or 'NaT'\. Got 'datetime64' instead\."
with pytest.raises(TypeError, match=msg):
idx.insert(0, np.datetime64("NaT"))

# FIXME: assert_index_equal fails if we pass a different
# instance of np.datetime64("NaT")
item = np.datetime64("NaT")
result = idx.insert(0, item)

expected = Index([item] + list(idx), dtype=object, name="idx")
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize(
"item", [0, np.int64(0), np.float64(0), np.array(0), np.datetime64(456, "us")]
Expand All @@ -90,18 +97,30 @@ def test_insert_mismatched_types_raises(self, item):
# GH#33703 dont cast these to td64
tdi = TimedeltaIndex(["4day", "1day", "2day"], name="idx")

msg = r"value should be a 'Timedelta' or 'NaT'\. Got '.*' instead\."
with pytest.raises(TypeError, match=msg):
tdi.insert(1, item)
result = tdi.insert(1, item)

expected = Index(
[tdi[0], lib.item_from_zerodim(item)] + list(tdi[1:]),
dtype=object,
name="idx",
)
tm.assert_index_equal(result, expected)

def test_insert_dont_cast_strings(self):
# To match DatetimeIndex and PeriodIndex behavior, dont try to
# parse strings to Timedelta
def test_insert_castable_str(self):
idx = timedelta_range("1day", "3day")

result = idx.insert(0, "1 Day")
assert result.dtype == object
assert result[0] == "1 Day"

expected = TimedeltaIndex([idx[0]] + list(idx))
tm.assert_index_equal(result, expected)

def test_insert_non_castable_str(self):
idx = timedelta_range("1day", "3day")

result = idx.insert(0, "foo")

expected = Index(["foo"] + list(idx), dtype=object)
tm.assert_index_equal(result, expected)

def test_insert_empty(self):
# Corner case inserting with length zero doesnt raise IndexError
Expand Down
69 changes: 51 additions & 18 deletions pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,14 +441,6 @@ def test_insert_index_float64(self, insert, coerced_val, coerced_dtype):
[pd.Timestamp("2012-01-01"), pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), 1],
)
def test_insert_index_datetimes(self, request, fill_val, exp_dtype, insert_value):
if not hasattr(insert_value, "tz"):
request.node.add_marker(
pytest.mark.xfail(reason="ToDo: must coerce to object")
)
elif fill_val.tz != insert_value.tz:
request.node.add_marker(
pytest.mark.xfail(reason="GH 37605 - require tz equality?")
)

obj = pd.DatetimeIndex(
["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], tz=fill_val.tz
Expand All @@ -461,7 +453,36 @@ def test_insert_index_datetimes(self, request, fill_val, exp_dtype, insert_value
)
self._assert_insert_conversion(obj, fill_val, exp, exp_dtype)

obj.insert(1, insert_value)
if fill_val.tz:

# mismatched tzawareness
ts = pd.Timestamp("2012-01-01")
result = obj.insert(1, ts)
expected = obj.astype(object).insert(1, ts)
assert expected.dtype == object
tm.assert_index_equal(result, expected)

# mismatched tz --> cast to object (could reasonably cast to commom tz)
ts = pd.Timestamp("2012-01-01", tz="Asia/Tokyo")
result = obj.insert(1, ts)
expected = obj.astype(object).insert(1, ts)
assert expected.dtype == object
tm.assert_index_equal(result, expected)

else:
# mismatched tzawareness
ts = pd.Timestamp("2012-01-01", tz="Asia/Tokyo")
result = obj.insert(1, ts)
expected = obj.astype(object).insert(1, ts)
assert expected.dtype == object
tm.assert_index_equal(result, expected)

item = 1
result = obj.insert(1, item)
expected = obj.astype(object).insert(1, item)
assert expected[1] == item
assert expected.dtype == object
tm.assert_index_equal(result, expected)

def test_insert_index_timedelta64(self):
obj = pd.TimedeltaIndex(["1 day", "2 day", "3 day", "4 day"])
Expand All @@ -473,15 +494,11 @@ def test_insert_index_timedelta64(self):
obj, pd.Timedelta("10 day"), exp, "timedelta64[ns]"
)

# ToDo: must coerce to object
msg = "value should be a 'Timedelta' or 'NaT'. Got 'Timestamp' instead."
with pytest.raises(TypeError, match=msg):
obj.insert(1, pd.Timestamp("2012-01-01"))

# ToDo: must coerce to object
msg = "value should be a 'Timedelta' or 'NaT'. Got 'int' instead."
with pytest.raises(TypeError, match=msg):
obj.insert(1, 1)
for item in [pd.Timestamp("2012-01-01"), 1]:
result = obj.insert(1, item)
expected = obj.astype(object).insert(1, item)
assert expected.dtype == object
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize(
"insert, coerced_val, coerced_dtype",
Expand All @@ -506,7 +523,23 @@ def test_insert_index_period(self, insert, coerced_val, coerced_dtype):
if isinstance(insert, pd.Period):
exp = pd.PeriodIndex(data, freq="M")
self._assert_insert_conversion(obj, insert, exp, coerced_dtype)

# string that can be parsed to appropriate PeriodDtype
self._assert_insert_conversion(obj, str(insert), exp, coerced_dtype)

else:
result = obj.insert(0, insert)
expected = obj.astype(object).insert(0, insert)
tm.assert_index_equal(result, expected)

# TODO: ATM inserting '2012-01-01 00:00:00' when we have obj.freq=="M"
# casts that string to Period[M], not clear that is desirable
if not isinstance(insert, pd.Timestamp):
# non-castable string
result = obj.insert(0, str(insert))
expected = obj.astype(object).insert(0, str(insert))
tm.assert_index_equal(result, expected)

msg = r"Unexpected keyword arguments {'freq'}"
with pytest.raises(TypeError, match=msg):
with tm.assert_produces_warning(FutureWarning):
Expand Down
Loading