Skip to content

BUG: to_datetime with decimal number doesn't fail for %Y%m%d #50054

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,7 @@ Datetimelike
- Bug in ``pandas.tseries.holiday.Holiday`` where a half-open date interval causes inconsistent return types from :meth:`USFederalHolidayCalendar.holidays` (:issue:`49075`)
- Bug in rendering :class:`DatetimeIndex` and :class:`Series` and :class:`DataFrame` with timezone-aware dtypes with ``dateutil`` or ``zoneinfo`` timezones near daylight-savings transitions (:issue:`49684`)
- Bug in :func:`to_datetime` was raising ``ValueError`` when parsing :class:`Timestamp` or ``datetime`` objects with non-ISO8601 ``format`` (:issue:`49298`)
- Bug in :func:`to_datetime` with ``exact`` and ``format=%Y%m%d`` wasn't raising if the input didn't match the format (:issue:`50051`)
-

Timedelta
Expand Down
20 changes: 8 additions & 12 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -546,17 +546,10 @@ cpdef array_to_datetime(
seen_datetime = True
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)

elif is_integer_object(val) or is_float_object(val):
if require_iso8601:
if is_coerce:
iresult[i] = NPY_NAT
continue
elif is_raise:
raise ValueError(
f"time data \"{val}\" at position {i} doesn't "
f"match format \"{format}\""
)
return values, tz_out
elif (
(is_integer_object(val) or is_float_object(val))
and format is None
):
# these must be ns unit by-definition
seen_integer = True

Expand All @@ -575,7 +568,10 @@ cpdef array_to_datetime(
except OverflowError:
iresult[i] = NPY_NAT

elif isinstance(val, str):
elif (
(is_integer_object(val) or is_float_object(val))
or isinstance(val, str)
):
# string
if type(val) is not str:
# GH#32264 np.str_ object
Expand Down
5 changes: 0 additions & 5 deletions pandas/_libs/tslibs/parsing.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,6 @@ def try_parse_dates(
dayfirst: bool = ...,
default: datetime | None = ...,
) -> npt.NDArray[np.object_]: ...
def try_parse_year_month_day(
years: npt.NDArray[np.object_], # object[:]
months: npt.NDArray[np.object_], # object[:]
days: npt.NDArray[np.object_], # object[:]
) -> npt.NDArray[np.object_]: ...
Comment on lines -30 to -34
Copy link
Member Author

@MarcoGorelli MarcoGorelli Dec 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removing this function altogether, special-casing %Y%m%d just introduces bugs

The non-ISO8601 path can handle this format just fine, let's use that

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this impact perf? im assuming this was introduced as a fastpath

def try_parse_datetime_components(
years: npt.NDArray[np.object_], # object[:]
months: npt.NDArray[np.object_], # object[:]
Expand Down
21 changes: 1 addition & 20 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -744,25 +744,6 @@ def try_parse_dates(
return result.base # .base to access underlying ndarray


def try_parse_year_month_day(
object[:] years, object[:] months, object[:] days
) -> np.ndarray:
cdef:
Py_ssize_t i, n
object[::1] result

n = len(years)
# TODO(cython3): Use len instead of `shape[0]`
if months.shape[0] != n or days.shape[0] != n:
raise ValueError("Length of years/months/days must all be equal")
result = np.empty(n, dtype="O")

for i in range(n):
result[i] = datetime(int(years[i]), int(months[i]), int(days[i]))

return result.base # .base to access underlying ndarray


def try_parse_datetime_components(object[:] years,
object[:] months,
object[:] days,
Expand Down Expand Up @@ -890,7 +871,7 @@ def format_is_iso(f: str) -> bint:
but must be consistent. Leading 0s in dates and times are optional.
"""
iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format
excluded_formats = ["%Y%m%d", "%Y%m", "%Y"]
excluded_formats = ["%Y%m", "%Y"]

for date_sep in [" ", "/", "\\", "-", ".", ""]:
for time_sep in [" ", "T"]:
Expand Down
101 changes: 2 additions & 99 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
Timedelta,
Timestamp,
iNaT,
nat_strings,
parsing,
timezones as libtimezones,
)
from pandas._libs.tslibs.parsing import (
Expand All @@ -38,7 +36,6 @@
AnyArrayLike,
ArrayLike,
DateTimeErrorChoices,
npt,
)

from pandas.core.dtypes.common import (
Expand All @@ -57,13 +54,11 @@
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.missing import notna

from pandas.arrays import (
DatetimeArray,
IntegerArray,
)
from pandas.core import algorithms
from pandas.core.algorithms import unique
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.datetimes import (
Expand Down Expand Up @@ -407,7 +402,6 @@ def _convert_listlike_datetimes(

# warn if passing timedelta64, raise for PeriodDtype
# NB: this must come after unit transformation
orig_arg = arg
try:
arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
except TypeError:
Expand Down Expand Up @@ -435,8 +429,8 @@ def _convert_listlike_datetimes(
require_iso8601 = not infer_datetime_format

if format is not None and not require_iso8601:
res = _to_datetime_with_format(
arg, orig_arg, name, utc, format, exact, errors, infer_datetime_format
res = _array_strptime_with_fallback(
arg, name, utc, format, exact, errors, infer_datetime_format
)
if res is not None:
return res
Expand Down Expand Up @@ -510,43 +504,6 @@ def _array_strptime_with_fallback(
return _box_as_indexlike(result, utc=utc, name=name)


def _to_datetime_with_format(
arg,
orig_arg,
name,
utc: bool,
fmt: str,
exact: bool,
errors: str,
infer_datetime_format: bool,
) -> Index | None:
"""
Try parsing with the given format, returning None on failure.
"""
result = None

# shortcut formatting here
if fmt == "%Y%m%d":
# pass orig_arg as float-dtype may have been converted to
# datetime64[ns]
orig_arg = ensure_object(orig_arg)
try:
# may return None without raising
result = _attempt_YYYYMMDD(orig_arg, errors=errors)
except (ValueError, TypeError, OutOfBoundsDatetime) as err:
raise ValueError(
"cannot convert the input to '%Y%m%d' date format"
) from err
if result is not None:
return _box_as_indexlike(result, utc=utc, name=name)

# fallback
res = _array_strptime_with_fallback(
arg, name, utc, fmt, exact, errors, infer_datetime_format
)
return res


def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
"""
to_datetime specalized to the case where a 'unit' is passed.
Expand Down Expand Up @@ -1244,60 +1201,6 @@ def coerce(values):
return values


def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think there's an issue about this fastpath not actually being fast. if this is removed, the issue is probably closeable

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, not quite

I've left a comment there, but removing it wouldn't fix the issue because 6-digit %Y%m%d dates wouldn't get parsed

"""
try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
arg is a passed in as an object dtype, but could really be ints/strings
with nan-like/or floats (e.g. with nan)

Parameters
----------
arg : np.ndarray[object]
errors : {'raise','ignore','coerce'}
"""

def calc(carg):
# calculate the actual result
carg = carg.astype(object, copy=False)
parsed = parsing.try_parse_year_month_day(
carg / 10000, carg / 100 % 100, carg % 100
)
return tslib.array_to_datetime(parsed, errors=errors)[0]

def calc_with_mask(carg, mask):
result = np.empty(carg.shape, dtype="M8[ns]")
iresult = result.view("i8")
iresult[~mask] = iNaT

masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
result[mask] = masked_result.astype("M8[ns]")
return result

# try intlike / strings that are ints
try:
return calc(arg.astype(np.int64))
except (ValueError, OverflowError, TypeError):
pass

# a float with actual np.nan
try:
carg = arg.astype(np.float64)
return calc_with_mask(carg, notna(carg))
except (ValueError, OverflowError, TypeError):
pass

# string with NaN-like
try:
# error: Argument 2 to "isin" has incompatible type "List[Any]"; expected
# "Union[Union[ExtensionArray, ndarray], Index, Series]"
mask = ~algorithms.isin(arg, list(nat_strings)) # type: ignore[arg-type]
return calc_with_mask(arg, mask)
except (ValueError, OverflowError, TypeError):
pass

return None


__all__ = [
"DateParseError",
"should_cache",
Expand Down
24 changes: 12 additions & 12 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,24 +125,21 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache):
expected[2] = np.nan
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't need to have expected if it raises. Although the reason it is raising is because np.nan changes the dtype to float. So this could be a loss of functionality when people have missing data.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks - I've marked as draft as there might be a better solution out there which doesn't degrade performance, just working on it

ser[2] = np.nan

result = to_datetime(ser, format="%Y%m%d", cache=cache)
tm.assert_series_equal(result, expected)
with pytest.raises(ValueError, match=None):
to_datetime(ser, format="%Y%m%d", cache=cache)
Comment on lines -128 to +130
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should've always raised - 19801222.0 doesn't match the format "%Y%m%d"


# string with NaT
ser2 = ser.apply(str)
ser2[2] = "nat"
result = to_datetime(ser2, format="%Y%m%d", cache=cache)
tm.assert_series_equal(result, expected)
with pytest.raises(ValueError, match=None):
to_datetime(ser2, format="%Y%m%d", cache=cache)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be worth adding a test where the NaN value is pd.NA, as the following should work:

ser3 = ser.astype("Int64")
ser3[2] = pd.NA
to_datetime(ser3, format="%Y%m%d")


def test_to_datetime_format_YYYYMMDD_ignore(self, cache):
# coercion
# GH 7930
ser = Series([20121231, 20141231, 99991231])
expected = Series([20121231, 20141231, 99991231], dtype=object)
result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache)
expected = Series(
[datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)],
dtype=object,
)
tm.assert_series_equal(result, expected)
Copy link
Member Author

@MarcoGorelli MarcoGorelli Dec 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test looks like it wasn't right to begin with. If the input is invalid, errors='ignore' should return the input

that's what we do for literally any other format, e.g.

In [46]: to_datetime(Series([201212, 201412, 999912]), errors='ignore', format='%Y%m')
Out[46]:
0    201212
1    201412
2    999912
dtype: object


def test_to_datetime_format_YYYYMMDD_coercion(self, cache):
Expand Down Expand Up @@ -249,7 +246,7 @@ def test_to_datetime_format_integer(self, cache):
# valid date, length == 8
[20121030, datetime(2012, 10, 30)],
# short valid date, length == 6
[199934, datetime(1999, 3, 4)],
[199934, 199934],
# long integer date partially parsed to datetime(2012,1,1), length > 8
[2012010101, 2012010101],
# invalid date partially parsed to datetime(2012,9,9), length == 8
Expand Down Expand Up @@ -1714,8 +1711,8 @@ def test_dataframe_coerce(self, cache):
df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]})

msg = (
"cannot assemble the datetimes: time data .+ does not "
r"match format '%Y%m%d' \(match\)"
r"cannot assemble the datetimes: time data .+ doesn't "
r'match format "%Y%m%d"'
)
with pytest.raises(ValueError, match=msg):
to_datetime(df2, cache=cache)
Expand Down Expand Up @@ -1791,7 +1788,10 @@ def test_dataframe_mixed(self, cache):
def test_dataframe_float(self, cache):
# float
df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]})
msg = "cannot assemble the datetimes: unconverted data remains: 1"
msg = (
r'cannot assemble the datetimes: time data "20000151" at '
r'position 0 doesn\'t match format "%Y%m%d"'
)
with pytest.raises(ValueError, match=msg):
to_datetime(df, cache=cache)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/tslibs/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def test_parse_time_string_check_instance_type_raise_exception():
("%Y-%m-%dT%H:%M:%S.%f", True),
("%Y-%m-%dT%H:%M:%S.%f%z", True),
("%Y-%m-%dT%H:%M:%S.%f%Z", False),
("%Y%m%d", False),
("%Y%m%d", True),
("%Y%m", False),
("%Y", False),
("%Y-%m-%d", True),
Expand Down