Skip to content

BUG: _convert_and_box_cache raise ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True #26097

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Jul 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
655ec31
fixed _convert_and_box_cache that raised ValueError: Tz-aware datetim…
anmyachev Apr 15, 2019
6a9856e
Revert bandaid workaround
vnlitvinov Apr 16, 2019
ffd9ecf
Add test that fails only when to_datetime gets cache=True
vnlitvinov Apr 16, 2019
d6c584e
Fix to_datetime caching logic so test_to_datetime_offset passes
vnlitvinov Apr 16, 2019
00f72e0
Fix flake8 issues
vnlitvinov Apr 16, 2019
5ad9911
removed debugging stuff; 'name' is default argument now
anmyachev Apr 17, 2019
428cae0
added 'whatsnew'
anmyachev Apr 17, 2019
7ed05f2
Test that to_datetime produces equal result for cache on and off
vnlitvinov Apr 18, 2019
b60f1d5
fixed isort errors
anmyachev Apr 18, 2019
3e2df79
Rework _box_if_needed into _box_as_indexlike
vnlitvinov Apr 19, 2019
3f0285e
Clarify added tests intention
vnlitvinov Apr 19, 2019
d19c2cf
first using notations
anmyachev Apr 22, 2019
b1cf140
changed wildcard import in '/pandas/core/index.py' to explicit import
anmyachev Apr 22, 2019
56db677
changed 'notations' -> 'annotations'
anmyachev Apr 22, 2019
4f9ea36
fixed isort errors
anmyachev Apr 22, 2019
c72a561
rollback of a certain style for annotations
anmyachev Apr 23, 2019
67a0c40
added 'Scalar' and 'DatetimeScalar' unions
anmyachev Apr 24, 2019
1e0d953
added annotations for some arguments; changed formatting
anmyachev Apr 26, 2019
1942bbe
using 'assert_almost_equal' now
anmyachev Apr 28, 2019
97e4548
rerun CI tests
anmyachev Jun 12, 2019
342d7d0
fixed problems found by review
anmyachev Jun 12, 2019
12f9853
replaced '# noqa' statement for flake8 linter
anmyachev Jun 14, 2019
62e75f8
removed 'Optional' using
anmyachev Jun 14, 2019
71ca9be
using TypeVar for 'DatetimeScalar' definition
anmyachev Jun 14, 2019
c35e124
fixed isort error
anmyachev Jun 14, 2019
d3412e2
fixed bug: 'UTC' and 'Etc/GMT' should be the same
anmyachev Jun 16, 2019
3141cb6
added comment about timezones
anmyachev Jun 16, 2019
ca200cd
removed 'UTC_EQ_STR', 'UTC_EQ'
anmyachev Jun 16, 2019
1cc469e
removing extra stuff
anmyachev Jul 2, 2019
137395f
fixed mypy errors
anmyachev Jul 2, 2019
2d8921b
renamed arg in '_box_as_indexlike' func: tz -> utc
anmyachev Jul 3, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,8 @@ Datetimelike
- Bug in :func:`date_range` with unnecessary ``OverflowError`` being raised for very large or very small dates (:issue:`26651`)
- Bug where adding :class:`Timestamp` to a ``np.timedelta64`` object would raise instead of returning a :class:`Timestamp` (:issue:`24775`)
- Bug where comparing a zero-dimensional numpy array containing a ``np.datetime64`` object to a :class:`Timestamp` would incorrectly raise ``TypeError`` (:issue:`26916`)
- Bug in :func:`to_datetime` which would raise ``ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True`` when called with ``cache=True`` and ``arg`` including datetime strings with different offsets (:issue:`26097`)
-

Timedelta
^^^^^^^^^
Expand Down
10 changes: 7 additions & 3 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# flake8: noqa
from pandas.core.indexes.api import *
from pandas.core.indexes.multi import _sparsify
from pandas.core.indexes.api import ( # noqa:F401
CategoricalIndex, DatetimeIndex, Float64Index, Index, Int64Index,
IntervalIndex, InvalidIndexError, MultiIndex, NaT, NumericIndex,
PeriodIndex, RangeIndex, TimedeltaIndex, UInt64Index, _all_indexes_same,
_get_combined_index, _get_consensus_names, _get_objs_combined_axis,
_new_Index, _union_indexes, ensure_index, ensure_index_from_sequences)
from pandas.core.indexes.multi import _sparsify # noqa:F401
81 changes: 59 additions & 22 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import abc
from datetime import datetime, time
from functools import partial
from typing import Optional, TypeVar, Union

import numpy as np

Expand All @@ -14,12 +15,25 @@
from pandas.core.dtypes.common import (
ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype,
is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype,
is_list_like, is_numeric_dtype, is_object_dtype, is_scalar)
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
is_list_like, is_numeric_dtype, is_scalar)
from pandas.core.dtypes.generic import (
ABCDataFrame, ABCDatetimeIndex, ABCIndex, ABCIndexClass, ABCSeries)
from pandas.core.dtypes.missing import notna

from pandas._typing import ArrayLike
from pandas.core import algorithms

# ---------------------------------------------------------------------
# types used in annotations

Scalar = Union[int, float, str]
DatetimeScalar = TypeVar('DatetimeScalar', Scalar, datetime)
DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, list, tuple,
ArrayLike, ABCSeries]


# ---------------------------------------------------------------------


def _guess_datetime_format_for_array(arr, **kwargs):
# Try to guess the format based on the first non-NaN element
Expand Down Expand Up @@ -60,7 +74,43 @@ def _maybe_cache(arg, format, cache, convert_listlike):
return cache_array


def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
def _box_as_indexlike(
dt_array: ArrayLike,
utc: Optional[bool] = None,
name: Optional[str] = None
) -> Union[ABCIndex, ABCDatetimeIndex]:
"""
Properly boxes the ndarray of datetimes to DatetimeIndex
if it is possible or to generic Index instead

Parameters
----------
dt_array: 1-d array
array of datetimes to be boxed
tz : object
None or 'utc'
name : string, default None
Name for a resulting index

Returns
-------
result : datetime of converted dates
- DatetimeIndex if convertible to sole datetime64 type
- general Index otherwise
"""
from pandas import DatetimeIndex, Index
if is_datetime64_dtype(dt_array):
tz = 'utc' if utc else None
return DatetimeIndex(dt_array, tz=tz, name=name)
return Index(dt_array, name=name)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to apply tz to this result if Index(dt_array, name=name) returns a DatetimeIndex

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback @mroeschke After a close look at the Index constructor (pandas/core/indexes/base.py), I have a question about the default value of dtype. The documentation says it should be object, but None is actually used. Which one is correct?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its default is None; however, it will attempt to infer the type of the thing passed, so you may need to explicitly pass dtype='object' to force it to object here.



def _convert_and_box_cache(
arg: DatetimeScalarOrArrayConvertible,
cache_array: ABCSeries,
box: bool,
name: Optional[str] = None
) -> Union[ABCIndex, np.ndarray]:
"""
Convert array of dates with a cache and box the result

Expand All @@ -71,26 +121,19 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
Cache of converted, unique dates
box : boolean
True boxes result as an Index-like, False returns an ndarray
errors : string
'ignore' plus box=True will convert result to Index
name : string, default None
Name for a DatetimeIndex

Returns
-------
result : datetime of converted dates
Returns:

- Index-like if box=True
- ndarray if box=False
"""
from pandas import Series, DatetimeIndex, Index
from pandas import Series
result = Series(arg).map(cache_array)
if box:
if errors == 'ignore':
return Index(result, name=name)
else:
return DatetimeIndex(result, name=name)
return _box_as_indexlike(result, utc=None, name=name)
return result.values


Expand Down Expand Up @@ -118,7 +161,6 @@ def _return_parsed_timezone_results(result, timezones, box, tz, name):

- Index-like if box=True
- ndarray of Timestamps if box=False

"""
if tz is not None:
raise ValueError("Cannot pass a tz argument when "
Expand Down Expand Up @@ -324,13 +366,8 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
return np.array(result, dtype=object)

if box:
# Ensure we return an Index in all cases where box=True
if is_datetime64_dtype(result):
return DatetimeIndex(result, tz=tz, name=name)
elif is_object_dtype(result):
# e.g. an Index of datetime objects
from pandas import Index
return Index(result, name=name)
utc = tz == 'utc'
return _box_as_indexlike(result, utc=utc, name=name)
return result


Expand Down Expand Up @@ -611,15 +648,15 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
elif isinstance(arg, ABCIndexClass):
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
if not cache_array.empty:
result = _convert_and_box_cache(arg, cache_array, box, errors,
result = _convert_and_box_cache(arg, cache_array, box,
name=arg.name)
else:
convert_listlike = partial(convert_listlike, name=arg.name)
result = convert_listlike(arg, box, format)
elif is_list_like(arg):
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
if not cache_array.empty:
result = _convert_and_box_cache(arg, cache_array, box, errors)
result = _convert_and_box_cache(arg, cache_array, box)
else:
result = convert_listlike(arg, box, format)
else:
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/indexes/datetimes/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,17 @@ def test_to_datetime_tz(self, cache):
with pytest.raises(ValueError, match=msg):
pd.to_datetime(arr, cache=cache)

@pytest.mark.parametrize('cache', [True, False])
def test_to_datetime_different_offsets(self, cache):
    # GH-26097: strings carrying different UTC offsets must be handled
    # whether or not the conversion cache is enabled (scenario borrowed
    # from the asv timeseries.ToDatetimeNONISO8601 benchmark).
    plus_four = 'March 1, 2018 12:00:00+0400'
    plus_five = 'March 1, 2018 12:00:00+0500'
    arr = [plus_four] * 5 + [plus_five] * 5

    result = pd.to_datetime(arr, cache=cache)

    # Reference values come from parsing each string on its own.
    expected = pd.Index([parse(stamp) for stamp in arr])
    tm.assert_index_equal(result, expected)

@pytest.mark.parametrize('cache', [True, False])
def test_to_datetime_tz_pytz(self, cache):
# see gh-8260
Expand Down