Skip to content

BUG: DTA/TDA/PA setitem incorrectly allowing i8 #33717

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,7 +773,7 @@ def _validate_fill_value(self, fill_value):
return fill_value

def _validate_shift_value(self, fill_value):
# TODO(2.0): once this deprecation is enforced, used _validate_fill_value
# TODO(2.0): once this deprecation is enforced, use _validate_fill_value
if is_valid_nat_for_dtype(fill_value, self.dtype):
fill_value = NaT
elif not isinstance(fill_value, self._recognized_scalars):
Expand Down Expand Up @@ -813,6 +813,9 @@ def _validate_searchsorted_value(self, value):
elif isinstance(value, self._recognized_scalars):
value = self._scalar_type(value)

elif isinstance(value, type(self)):
pass

elif is_list_like(value) and not isinstance(value, type(self)):
value = array(value)

Expand All @@ -822,7 +825,7 @@ def _validate_searchsorted_value(self, value):
f"not {type(value).__name__}"
)

if not (isinstance(value, (self._scalar_type, type(self))) or (value is NaT)):
else:
raise TypeError(f"Unexpected type for 'value': {type(value)}")

if isinstance(value, type(self)):
Expand All @@ -834,25 +837,41 @@ def _validate_searchsorted_value(self, value):
return value

def _validate_setitem_value(self, value):
if lib.is_scalar(value) and not isna(value):
value = com.maybe_box_datetimelike(value)

if is_list_like(value):
value = type(self)._from_sequence(value, dtype=self.dtype)
self._check_compatible_with(value, setitem=True)
value = value.asi8
elif isinstance(value, self._scalar_type):
self._check_compatible_with(value, setitem=True)
value = self._unbox_scalar(value)
value = array(value)
if is_dtype_equal(value.dtype, "string"):
# We got a StringArray
Copy link
Member

@simonjayhawkins simonjayhawkins Apr 22, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this assumption important or is value pre-validated

>>> value = [1, 2, object()]
>>> value = pd.array(value)
>>> value
<PandasArray>
[1, 2, <object object at 0x0000015F4B631DA0>]
Length: 3, dtype: object
>>>
>>> pd.core.dtypes.common.is_string_dtype(value.dtype)
True
>>>

might be worth adding type annotations to _validate_setitem_value

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yikes, that looks like a problem with is_string_dtype

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel addressing here? or followup?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

follow-up

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's not a "problem" with is_string_dtype, because that function simply checks for object dtype. That's how it has been for a long time, and not sure if it is easy to solve, as actually checking for strings would mean to infer the dtypes from the values each time. We have #15585 about this.

(it's a problem in practice of course, since it makes this function rather useless. But the solution is probably improving the new "string" dtype so this can become the default string dtype in the future)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right actually I would say this is a bug in is_string_dtype which is actually just checking for object dtype and not an inferred string itself.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can't infer string values from a dtype object, though ..

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure you can, this is excatly what infer_from_dtype does; we just don't do it inside this function because it can be non-cheap

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated to punt on is_string_dtype by instead checking is_dtype_equal(value.dtype, "string")

try:
# TODO: Could use from_sequence_of_strings if implemented
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So related to my comment above, this TODO item is not correct (also the "We got a StringArray" comment above is thus not correct)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have a suggestion for what to do here? _from_sequence is clearly too permissive

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is _from_sequence too permissive for this purpose? It seems that whathever is allowed to create an array of this dtype, can also be allowed to set values?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this gets back to your point of from_sequence being too permissive for DTA/TDA/PA; it would let through e.g. float64 or int64 which shouldnt allow

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This then becomes a bit ambiguous on what to allow and what not. Why allow ints to create but not to set, while allow strings in both creating and setting?
So why not disallow setting with string as well?

I suppose the answer is: that is the current behaviour .. But so to come back to your question what to use here: if you only want to allow actual strings, and not eg integers in an object dtype array, you can actually use infer_dtype to check it are strings or datetime objects?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Im assuming thats a rhetorical question; LMK if you actually are looking for an explanation.

So I'm clear: are you advocating for the status quo, or do you have something else in mind?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, it's not really a rhetorical question, I actually find it inconsistent to allow certain data types in creation and others in setting (I am fine with being more strict in setting compared to creation, but then we should be strict altogether and also not allow strings in setting, IMO).
But it's a question that doesn't need to be solved in this PR, that for sure!

To be clear: I am fine with the code, I only commented about the comment not really being correct. But now you updated the dtype check, I assume the comment is actually correct.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually find it inconsistent to allow certain data types in creation and others in setting

Yes. The upshot is that the preferred way of minimizing inconsistencies is to make from_sequence more strict. Let's leave that for another thread.

# Note: passing dtype is necessary for PeriodArray tests
value = type(self)._from_sequence(value, dtype=self.dtype)
except ValueError:
pass

if not type(self)._is_recognized_dtype(value):
raise TypeError(
"setitem requires compatible dtype or scalar, "
f"not {type(value).__name__}"
)

elif isinstance(value, self._recognized_scalars):
value = self._scalar_type(value)
elif is_valid_nat_for_dtype(value, self.dtype):
value = iNaT
value = NaT
else:
msg = (
f"'value' should be a '{self._scalar_type.__name__}', 'NaT', "
f"or array of those. Got '{type(value).__name__}' instead."
)
raise TypeError(msg)

self._check_compatible_with(value, setitem=True)
if isinstance(value, type(self)):
value = value.asi8
else:
value = self._unbox_scalar(value)

return value

def _validate_insert_value(self, value):
Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,16 @@ def test_setitem(self):
expected[:2] = expected[-2:]
tm.assert_numpy_array_equal(arr.asi8, expected)

def test_setitem_str_array(self, arr1d):
if isinstance(arr1d, DatetimeArray) and arr1d.tz is not None:
pytest.xfail(reason="timezone comparisons inconsistent")
expected = arr1d.copy()
expected[[0, 1]] = arr1d[-2:]

arr1d[:2] = [str(x) for x in arr1d[-2:]]

tm.assert_equal(arr1d, expected)

def test_setitem_raises(self):
data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9
arr = self.array_cls(data, freq="D")
Expand All @@ -252,6 +262,17 @@ def test_setitem_raises(self):
with pytest.raises(TypeError, match="'value' should be a.* 'object'"):
arr[0] = object()

@pytest.mark.parametrize("box", [list, np.array, pd.Index, pd.Series])
def test_setitem_numeric_raises(self, arr1d, box):
# We dont case e.g. int64 to our own dtype for setitem

msg = "requires compatible dtype"
with pytest.raises(TypeError, match=msg):
arr1d[:2] = box([0, 1])

with pytest.raises(TypeError, match=msg):
arr1d[:2] = box([0.0, 1.0])

def test_inplace_arithmetic(self):
# GH#24115 check that iadd and isub are actually in-place
data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9
Expand Down