diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 9492888e7db77..e6bbf52ab1272 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -59,6 +59,7 @@ from pandas._libs.tslibs.nattype cimport (
     c_nat_strings as nat_strings,
 )
 from pandas._libs.tslibs.timestamps cimport _Timestamp
+from pandas._libs.tslibs.timezones cimport tz_compare
 
 from pandas._libs.tslibs import (
     Resolution,
@@ -447,6 +448,7 @@ cpdef array_to_datetime(
         bint string_to_dts_failed
         datetime py_dt
         tzinfo tz_out = None
+        bint found_tz = False, found_naive = False
 
     # specify error conditions
     assert is_raise or is_ignore or is_coerce
@@ -465,18 +467,38 @@ cpdef array_to_datetime(
                 elif PyDateTime_Check(val):
                     seen_datetime = True
                     if val.tzinfo is not None:
+                        found_tz = True
                         if utc_convert:
                             _ts = convert_datetime_to_tsobject(val, None)
                             iresult[i] = _ts.value
-                        else:
+                        elif found_naive:
                             raise ValueError('Tz-aware datetime.datetime '
                                              'cannot be converted to '
                                              'datetime64 unless utc=True')
-                    elif isinstance(val, _Timestamp):
-                        iresult[i] = val.value
+                        elif tz_out is not None and not tz_compare(tz_out, val.tzinfo):
+                            raise ValueError('Tz-aware datetime.datetime '
+                                             'cannot be converted to '
+                                             'datetime64 unless utc=True')
+                        else:
+                            # remember the timezone; later tz-aware values
+                            # are checked against it with tz_compare
+                            tz_out = val.tzinfo
+                            _ts = convert_datetime_to_tsobject(val, None)
+                            iresult[i] = _ts.value
+
                     else:
-                        iresult[i] = pydatetime_to_dt64(val, &dts)
-                        check_dts_bounds(&dts)
+                        found_naive = True
+                        # With utc=True the tz-aware branch above accepts a
+                        # value that follows a naive one, so accept the reverse
+                        # order too; otherwise the result depends on ordering.
+                        if found_tz and not utc_convert:
+                            raise ValueError('Cannot mix tz-aware with '
+                                             'tz-naive values')
+                        if isinstance(val, _Timestamp):
+                            iresult[i] = val.value
+                        else:
+                            iresult[i] = pydatetime_to_dt64(val, &dts)
+                            check_dts_bounds(&dts)
 
                 elif PyDate_Check(val):
                     seen_datetime = True
diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi
index ca6f301673f33..16fe853eef815 100644
--- a/pandas/_libs/tslibs/conversion.pyi
+++ b/pandas/_libs/tslibs/conversion.pyi
@@ -23,7 +23,4 @@ def ensure_timedelta64ns(
     arr: np.ndarray,  # 
np.ndarray[timedelta64[ANY]] copy: bool = ..., ) -> np.ndarray: ... # np.ndarray[timedelta64ns] -def datetime_to_datetime64( - values: npt.NDArray[np.object_], -) -> tuple[np.ndarray, tzinfo | None]: ... # (np.ndarray[dt64ns], _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index e5217259a3648..5b7da7347a238 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -264,80 +264,6 @@ def ensure_timedelta64ns(arr: ndarray, copy: bool = True): return dt64_result.view(TD64NS_DTYPE) -# ---------------------------------------------------------------------- - - -@cython.boundscheck(False) -@cython.wraparound(False) -def datetime_to_datetime64(ndarray values): - # ndarray[object], but can't declare object without ndim - """ - Convert ndarray of datetime-like objects to int64 array representing - nanosecond timestamps. - - Parameters - ---------- - values : ndarray[object] - - Returns - ------- - result : ndarray[datetime64ns] - inferred_tz : tzinfo or None - """ - cdef: - Py_ssize_t i, n = values.size - object val - int64_t ival - ndarray iresult # int64_t, but can't declare that without specifying ndim - npy_datetimestruct dts - _TSObject _ts - bint found_naive = False - tzinfo inferred_tz = None - - cnp.broadcast mi - - result = np.empty((values).shape, dtype='M8[ns]') - iresult = result.view('i8') - - mi = cnp.PyArray_MultiIterNew2(iresult, values) - for i in range(n): - # Analogous to: val = values[i] - val = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] - - if checknull_with_nat(val): - ival = NPY_NAT - elif PyDateTime_Check(val): - if val.tzinfo is not None: - if found_naive: - raise ValueError('Cannot mix tz-aware with ' - 'tz-naive values') - if inferred_tz is not None: - if not tz_compare(val.tzinfo, inferred_tz): - raise ValueError('Array must be all same time zone') - else: - inferred_tz = val.tzinfo - - _ts = 
convert_datetime_to_tsobject(val, None) - ival = _ts.value - check_dts_bounds(&_ts.dts) - else: - found_naive = True - if inferred_tz is not None: - raise ValueError('Cannot mix tz-aware with ' - 'tz-naive values') - ival = pydatetime_to_dt64(val, &dts) - check_dts_bounds(&dts) - else: - raise TypeError(f'Unrecognized value type: {type(val)}') - - # Analogous to: iresult[i] = ival - (cnp.PyArray_MultiIter_DATA(mi, 0))[0] = ival - - cnp.PyArray_MultiIter_NEXT(mi) - - return result, inferred_tz - - # ---------------------------------------------------------------------- # _TSObject Conversion diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index dadfad394b903..ec6da61bde6c6 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2263,14 +2263,6 @@ def objects_to_datetime64ns( allow_mixed=allow_mixed, ) result = result.reshape(data.shape, order=order) - except ValueError as err: - try: - values, tz_parsed = conversion.datetime_to_datetime64(data) - # If tzaware, these values represent unix timestamps, so we - # return them as i8 to distinguish from wall times - return values.view("i8"), tz_parsed - except (ValueError, TypeError): - raise err except OverflowError as err: # Exception is raised when a part of date is greater than 32 bit signed int raise OutOfBoundsDatetime("Out of bounds nanosecond timestamp") from err diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d231dc10d1004..d6dda373bdf92 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -24,7 +24,6 @@ OutOfBoundsDatetime, Timedelta, Timestamp, - conversion, iNaT, nat_strings, parsing, @@ -41,6 +40,7 @@ ArrayLike, DateTimeErrorChoices, Timezone, + npt, ) from pandas.util._exceptions import find_stack_level @@ -467,8 +467,6 @@ def _array_strptime_with_fallback( try: result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors) - if "%Z" in fmt or "%z" in fmt: - return 
_return_parsed_timezone_results(result, timezones, tz, name) except OutOfBoundsDatetime: if errors == "raise": raise @@ -494,6 +492,9 @@ def _array_strptime_with_fallback( else: # Indicates to the caller to fallback to objects_to_datetime64ns return None + else: + if "%Z" in fmt or "%z" in fmt: + return _return_parsed_timezone_results(result, timezones, tz, name) return _box_as_indexlike(result, utc=utc, name=name) @@ -512,38 +513,28 @@ def _to_datetime_with_format( Try parsing with the given format, returning None on failure. """ result = None - try: - # shortcut formatting here - if fmt == "%Y%m%d": - # pass orig_arg as float-dtype may have been converted to - # datetime64[ns] - orig_arg = ensure_object(orig_arg) - try: - # may return None without raising - result = _attempt_YYYYMMDD(orig_arg, errors=errors) - except (ValueError, TypeError, OutOfBoundsDatetime) as err: - raise ValueError( - "cannot convert the input to '%Y%m%d' date format" - ) from err - if result is not None: - utc = tz == "utc" - return _box_as_indexlike(result, utc=utc, name=name) - # fallback - res = _array_strptime_with_fallback( - arg, name, tz, fmt, exact, errors, infer_datetime_format - ) - return res - - except ValueError as err: - # Fallback to try to convert datetime objects if timezone-aware - # datetime objects are found without passing `utc=True` + # shortcut formatting here + if fmt == "%Y%m%d": + # pass orig_arg as float-dtype may have been converted to + # datetime64[ns] + orig_arg = ensure_object(orig_arg) try: - values, tz = conversion.datetime_to_datetime64(arg) - dta = DatetimeArray(values, dtype=tz_to_dtype(tz)) - return DatetimeIndex._simple_new(dta, name=name) - except (ValueError, TypeError): - raise err + # may return None without raising + result = _attempt_YYYYMMDD(orig_arg, errors=errors) + except (ValueError, TypeError, OutOfBoundsDatetime) as err: + raise ValueError( + "cannot convert the input to '%Y%m%d' date format" + ) from err + if result is not None: + utc = 
tz == "utc" + return _box_as_indexlike(result, utc=utc, name=name) + + # fallback + res = _array_strptime_with_fallback( + arg, name, tz, fmt, exact, errors, infer_datetime_format + ) + return res def _to_datetime_with_unit(arg, unit, name, tz, errors: str) -> Index: @@ -1007,17 +998,6 @@ def to_datetime( DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) - - Finally, mixing timezone-aware strings and :class:`datetime.datetime` always - raises an error, even if the elements all have the same time offset. - - >>> from datetime import datetime, timezone, timedelta - >>> d = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) - >>> pd.to_datetime(["2020-01-01 17:00 -0100", d]) - Traceback (most recent call last): - ... - ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 - unless utc=True - | Setting ``utc=True`` solves most of the above issues: @@ -1243,7 +1223,7 @@ def coerce(values): return values -def _attempt_YYYYMMDD(arg: np.ndarray, errors: str) -> np.ndarray | None: +def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None: """ try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like, arg is a passed in as an object dtype, but could really be ints/strings @@ -1257,7 +1237,7 @@ def _attempt_YYYYMMDD(arg: np.ndarray, errors: str) -> np.ndarray | None: def calc(carg): # calculate the actual result - carg = carg.astype(object) + carg = carg.astype(object, copy=False) parsed = parsing.try_parse_year_month_day( carg / 10000, carg / 100 % 100, carg % 100 ) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 7597d4345cfce..0bd93a78227ff 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -5,6 +5,7 @@ from datetime import ( datetime, timedelta, + timezone, ) from decimal import Decimal import locale @@ -455,6 +456,14 @@ def 
test_to_datetime_parse_timezone_keeps_name(self):
 
 
 class TestToDatetime:
+    def test_to_datetime_mixed_datetime_and_string(self):
+        # GH#47018 offset string + same-offset datetime parse together
+        tz = timezone(-timedelta(hours=1))
+        early, late = (datetime(2020, 1, 1, hour, tzinfo=tz) for hour in (17, 18))
+        result = to_datetime(["2020-01-01 17:00 -0100", late])
+        expected = to_datetime([early, late]).tz_convert(pytz.FixedOffset(-60))
+        tm.assert_index_equal(result, expected)
+
     def test_to_datetime_np_str(self):
         # GH#32264
         value = np.str_("2019-02-04 10:18:46.297000+0000")