-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: to_datetime with decimal number doesn't fail for %Y%m%d #50054
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2004fb7
65149de
83a4ec6
754fdfe
69a487a
a004790
ad8a56a
22ca184
6e8280b
55dc073
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,8 +24,6 @@ | |
Timedelta, | ||
Timestamp, | ||
iNaT, | ||
nat_strings, | ||
parsing, | ||
timezones as libtimezones, | ||
) | ||
from pandas._libs.tslibs.parsing import ( | ||
|
@@ -38,7 +36,6 @@ | |
AnyArrayLike, | ||
ArrayLike, | ||
DateTimeErrorChoices, | ||
npt, | ||
) | ||
|
||
from pandas.core.dtypes.common import ( | ||
|
@@ -57,13 +54,11 @@ | |
ABCDataFrame, | ||
ABCSeries, | ||
) | ||
from pandas.core.dtypes.missing import notna | ||
|
||
from pandas.arrays import ( | ||
DatetimeArray, | ||
IntegerArray, | ||
) | ||
from pandas.core import algorithms | ||
from pandas.core.algorithms import unique | ||
from pandas.core.arrays.base import ExtensionArray | ||
from pandas.core.arrays.datetimes import ( | ||
|
@@ -407,7 +402,6 @@ def _convert_listlike_datetimes( | |
|
||
# warn if passing timedelta64, raise for PeriodDtype | ||
# NB: this must come after unit transformation | ||
orig_arg = arg | ||
try: | ||
arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz)) | ||
except TypeError: | ||
|
@@ -435,8 +429,8 @@ def _convert_listlike_datetimes( | |
require_iso8601 = not infer_datetime_format | ||
|
||
if format is not None and not require_iso8601: | ||
res = _to_datetime_with_format( | ||
arg, orig_arg, name, utc, format, exact, errors, infer_datetime_format | ||
res = _array_strptime_with_fallback( | ||
arg, name, utc, format, exact, errors, infer_datetime_format | ||
) | ||
if res is not None: | ||
return res | ||
|
@@ -510,43 +504,6 @@ def _array_strptime_with_fallback( | |
return _box_as_indexlike(result, utc=utc, name=name) | ||
|
||
|
||
def _to_datetime_with_format( | ||
arg, | ||
orig_arg, | ||
name, | ||
utc: bool, | ||
fmt: str, | ||
exact: bool, | ||
errors: str, | ||
infer_datetime_format: bool, | ||
) -> Index | None: | ||
""" | ||
Try parsing with the given format, returning None on failure. | ||
""" | ||
result = None | ||
|
||
# shortcut formatting here | ||
if fmt == "%Y%m%d": | ||
# pass orig_arg as float-dtype may have been converted to | ||
# datetime64[ns] | ||
orig_arg = ensure_object(orig_arg) | ||
try: | ||
# may return None without raising | ||
result = _attempt_YYYYMMDD(orig_arg, errors=errors) | ||
except (ValueError, TypeError, OutOfBoundsDatetime) as err: | ||
raise ValueError( | ||
"cannot convert the input to '%Y%m%d' date format" | ||
) from err | ||
if result is not None: | ||
return _box_as_indexlike(result, utc=utc, name=name) | ||
|
||
# fallback | ||
res = _array_strptime_with_fallback( | ||
arg, name, utc, fmt, exact, errors, infer_datetime_format | ||
) | ||
return res | ||
|
||
|
||
def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: | ||
""" | ||
to_datetime specalized to the case where a 'unit' is passed. | ||
|
@@ -973,7 +930,7 @@ def to_datetime( | |
in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. | ||
|
||
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') | ||
datetime.datetime(1300, 1, 1, 0, 0) | ||
'13000101' | ||
Comment on lines
932
to
+933
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this one also looks like it was wrong to begin with - we don't return
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are there many other cases where we get pydatetime objects instead of Timestamps? If no, then i think it makes sense to tighten and say "you always get Timestamps or ignore/raise". If yes, then i think returning the pydatetime objects is better here than the string (and probably also for '1300' with '%Y') There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've opened #50072, let's move this discussion there |
||
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') | ||
NaT | ||
|
||
|
@@ -1241,60 +1198,6 @@ def coerce(values): | |
return values | ||
|
||
|
||
def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think there's an issue about this fastpath not actually being fast. if this is removed, the issue is probably closeable There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, not quite I've left a comment there, but removing it wouldn't fix the issue because 6-digit %Y%m%d dates wouldn't get parsed |
||
""" | ||
try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like, | ||
arg is a passed in as an object dtype, but could really be ints/strings | ||
with nan-like/or floats (e.g. with nan) | ||
|
||
Parameters | ||
---------- | ||
arg : np.ndarray[object] | ||
errors : {'raise','ignore','coerce'} | ||
""" | ||
|
||
def calc(carg): | ||
# calculate the actual result | ||
carg = carg.astype(object, copy=False) | ||
parsed = parsing.try_parse_year_month_day( | ||
carg / 10000, carg / 100 % 100, carg % 100 | ||
) | ||
return tslib.array_to_datetime(parsed, errors=errors)[0] | ||
|
||
def calc_with_mask(carg, mask): | ||
result = np.empty(carg.shape, dtype="M8[ns]") | ||
iresult = result.view("i8") | ||
iresult[~mask] = iNaT | ||
|
||
masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) | ||
result[mask] = masked_result.astype("M8[ns]") | ||
return result | ||
|
||
# try intlike / strings that are ints | ||
try: | ||
return calc(arg.astype(np.int64)) | ||
except (ValueError, OverflowError, TypeError): | ||
pass | ||
|
||
# a float with actual np.nan | ||
try: | ||
carg = arg.astype(np.float64) | ||
return calc_with_mask(carg, notna(carg)) | ||
except (ValueError, OverflowError, TypeError): | ||
pass | ||
|
||
# string with NaN-like | ||
try: | ||
# error: Argument 2 to "isin" has incompatible type "List[Any]"; expected | ||
# "Union[Union[ExtensionArray, ndarray], Index, Series]" | ||
mask = ~algorithms.isin(arg, list(nat_strings)) # type: ignore[arg-type] | ||
return calc_with_mask(arg, mask) | ||
except (ValueError, OverflowError, TypeError): | ||
pass | ||
|
||
return None | ||
|
||
|
||
__all__ = [ | ||
"DateParseError", | ||
"should_cache", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -117,6 +117,7 @@ def test_to_datetime_format_YYYYMMDD(self, cache): | |
tm.assert_series_equal(result, expected) | ||
|
||
def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): | ||
# GH50051 | ||
ser = Series([19801222, 19801222] + [19810105] * 5) | ||
# with NaT | ||
expected = Series( | ||
|
@@ -125,24 +126,21 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): | |
expected[2] = np.nan | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't need to have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks - I've marked as draft as there might be a better solution out there which doesn't degrade performance, just working on it |
||
ser[2] = np.nan | ||
|
||
result = to_datetime(ser, format="%Y%m%d", cache=cache) | ||
tm.assert_series_equal(result, expected) | ||
with pytest.raises(ValueError, match=None): | ||
to_datetime(ser, format="%Y%m%d", cache=cache) | ||
Comment on lines
-128
to
+130
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this should've always raised - |
||
|
||
# string with NaT | ||
ser2 = ser.apply(str) | ||
ser2[2] = "nat" | ||
result = to_datetime(ser2, format="%Y%m%d", cache=cache) | ||
tm.assert_series_equal(result, expected) | ||
with pytest.raises(ValueError, match=None): | ||
to_datetime(ser2, format="%Y%m%d", cache=cache) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might be worth adding a test where the ser3 = ser.astype("Int64")
ser3[2] = pd.NA
to_datetime(ser3, format="%Y%m%d") |
||
|
||
def test_to_datetime_format_YYYYMMDD_ignore(self, cache): | ||
# coercion | ||
# GH 7930 | ||
# GH 7930, GH50054 | ||
ser = Series([20121231, 20141231, 99991231]) | ||
expected = Series([20121231, 20141231, 99991231], dtype=object) | ||
result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache) | ||
expected = Series( | ||
[datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], | ||
dtype=object, | ||
) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_to_datetime_format_YYYYMMDD_coercion(self, cache): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
removing this function altogether, special-casing
%Y%m%d
just introduces bugsThe non-ISO8601 path can handle this format just fine, let's use that
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
does this impact perf? im assuming this was introduced as a fastpath