Skip to content

REF: consolidate datetime parsing paths and exception handling #50790

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/user_guide/timeseries.rst
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ The default behavior, ``errors='raise'``, is to raise when unparsable:
.. code-block:: ipython

In [2]: pd.to_datetime(['2009/07/31', 'asd'], errors='raise')
- ValueError: Unknown string format
+ ValueError: Unknown datetime string format

Pass ``errors='ignore'`` to return the original input when unparsable:

Expand Down
13 changes: 3 additions & 10 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -538,16 +538,9 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
maybe_localize_tso(obj, tz, obj.creso)
return obj

- try:
- dt = parse_datetime_string(
- ts, dayfirst=dayfirst, yearfirst=yearfirst
- )
- except ValueError as err:
- if "out of range for month" in str(err):
- # dateutil raised when constructing a datetime object,
- # let's give a nicer exception message
- raise ValueError("could not convert string to Timestamp") from err
- raise
+ dt = parse_datetime_string(
+ ts, dayfirst=dayfirst, yearfirst=yearfirst
+ )

return convert_datetime_to_tsobject(dt, tz)

Expand Down
53 changes: 26 additions & 27 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -286,19 +286,10 @@ def parse_datetime_string(
except ValueError:
pass

- try:
- dt = du_parse(date_string, default=_DEFAULT_DATETIME,
- dayfirst=dayfirst, yearfirst=yearfirst)
- except TypeError:
- # following may be raised from dateutil
- # TypeError: 'NoneType' object is not iterable
- raise ValueError(f'Given date string "{date_string}" not likely a datetime')
- except OverflowError as err:
- # with e.g. "08335394550" dateutil raises when trying to pass
- # year=8335394550 to datetime.replace
- raise OutOfBoundsDatetime(
- f'Parsing "{date_string}" to datetime overflows'
- ) from err
+ dt, _ = dateutil_parse(date_string, default=_DEFAULT_DATETIME,
+ dayfirst=dayfirst, yearfirst=yearfirst,
+ ignoretz=False)
+
if dt.tzinfo is not None:
# dateutil can return a datetime with a tzoffset outside of (-24H, 24H)
# bounds, which is invalid (can be constructed, but raises if we call
Expand Down Expand Up @@ -413,15 +404,9 @@ def parse_datetime_string_with_reso(
except ValueError:
pass

- try:
- parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME,
- dayfirst=dayfirst, yearfirst=yearfirst,
- ignoretz=False)
- except (ValueError, OverflowError) as err:
- # TODO: allow raise of errors within instead
- raise DateParseError(err)
- if parsed is None:
- raise DateParseError(f"Could not parse {date_string}")
+ parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME,
+ dayfirst=dayfirst, yearfirst=yearfirst,
+ ignoretz=False)
return parsed, reso


Expand Down Expand Up @@ -608,7 +593,7 @@ cpdef quarter_to_myear(int year, int quarter, str freq):

cdef dateutil_parse(
str timestr,
- object default,
+ datetime default,
bint ignoretz=False,
bint dayfirst=False,
bint yearfirst=False,
Expand All @@ -619,13 +604,15 @@ cdef dateutil_parse(
str attr
datetime ret
object res
- object reso = None
+ str reso = None
dict repl = {}

res, _ = DEFAULTPARSER._parse(timestr, dayfirst=dayfirst, yearfirst=yearfirst)

if res is None:
- raise ValueError(f"Unknown datetime string format, unable to parse: {timestr}")
+ raise DateParseError(
+ f"Unknown datetime string format, unable to parse: {timestr}"
+ )

for attr in ["year", "month", "day", "hour",
"minute", "second", "microsecond"]:
Expand All @@ -635,15 +622,27 @@ cdef dateutil_parse(
reso = attr

if reso is None:
- raise ValueError(f"Unable to parse datetime string: {timestr}")
+ raise DateParseError(f"Unable to parse datetime string: {timestr}")

if reso == "microsecond":
if repl["microsecond"] == 0:
reso = "second"
elif repl["microsecond"] % 1000 == 0:
reso = "millisecond"

- ret = default.replace(**repl)
+ try:
+ ret = default.replace(**repl)
+ except ValueError as err:
+ # e.g. "day is out of range for month"
+ # we re-raise to match dateutil's exception message
+ raise DateParseError(str(err) + ": " + timestr) from err
+ except OverflowError as err:
+ # with e.g. "08335394550" dateutil raises when trying to pass
+ # year=8335394550 to datetime.replace
+ raise OutOfBoundsDatetime(
+ f'Parsing "{timestr}" to datetime overflows'
+ ) from err
+
if res.weekday is not None and not res.day:
ret = ret + relativedelta.relativedelta(weekday=res.weekday)
if not ignoretz:
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def f(dtype):
f("float64")

# 10822
- msg = "^Unknown string format: aa, at position 0$"
+ msg = "^Unknown datetime string format, unable to parse: aa, at position 0$"
with pytest.raises(ValueError, match=msg):
f("M8[ns]")

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/datetimes/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1037,7 +1037,7 @@ def test_from_freq_recreate_from_data(self, freq):

def test_datetimeindex_constructor_misc(self):
arr = ["1/1/2005", "1/2/2005", "Jn 3, 2005", "2005-01-04"]
- msg = r"(\(')?Unknown string format(:', 'Jn 3, 2005'\))?"
+ msg = r"(\(')?Unknown datetime string format(:', 'Jn 3, 2005'\))?"
with pytest.raises(ValueError, match=msg):
DatetimeIndex(arr)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/datetimes/test_date_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,7 @@ def test_misc(self):
def test_date_parse_failure(self):
badly_formed_date = "2007/100/1"

- msg = "Unknown string format: 2007/100/1"
+ msg = "Unknown datetime string format, unable to parse: 2007/100/1"
with pytest.raises(ValueError, match=msg):
Timestamp(badly_formed_date)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/pytables/test_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ def test_frame_select(setup_path):
# invalid terms
df = tm.makeTimeDataFrame()
store.append("df_time", df)
- msg = "could not convert string to Timestamp"
+ msg = "day is out of range for month: 0"
with pytest.raises(ValueError, match=msg):
store.select("df_time", "index>0")

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/scalar/timestamp/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ def test_constructor_nanosecond(self, result):
@pytest.mark.parametrize("z", ["Z0", "Z00"])
def test_constructor_invalid_Z0_isostring(self, z):
# GH 8910
- msg = f"Unknown string format: 2014-11-02 01:00{z}"
+ msg = f"Unknown datetime string format, unable to parse: 2014-11-02 01:00{z}"
with pytest.raises(ValueError, match=msg):
Timestamp(f"2014-11-02 01:00{z}")

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def test_infer_with_date_and_datetime(self):
def test_unparseable_strings_with_dt64_dtype(self):
# pre-2.0 these would be silently ignored and come back with object dtype
vals = ["aa"]
- msg = "^Unknown string format: aa, at position 0$"
+ msg = "^Unknown datetime string format, unable to parse: aa, at position 0$"
with pytest.raises(ValueError, match=msg):
Series(vals, dtype="datetime64[ns]")

Expand Down
11 changes: 6 additions & 5 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2477,7 +2477,7 @@ def test_string_na_nat_conversion_malformed(self, cache):
malformed = np.array(["1/100/2000", np.nan], dtype=object)

# GH 10636, default is now 'raise'
- msg = r"Unknown string format:|day is out of range for month"
+ msg = r"Unknown datetime string format"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
UserWarning, match="Could not infer format"
Expand Down Expand Up @@ -2791,7 +2791,7 @@ def test_day_not_in_month_coerce(self, cache, arg, format, warning):
assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache))

def test_day_not_in_month_raise(self, cache):
- msg = "could not convert string to Timestamp"
+ msg = "day is out of range for month: 2015-02-29, at position 0"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
UserWarning, match="Could not infer format"
Expand Down Expand Up @@ -3218,9 +3218,10 @@ def test_invalid_origins_tzinfo(self):

def test_incorrect_value_exception(self):
# GH47495
- with pytest.raises(
- ValueError, match="Unknown string format: yesterday, at position 1"
- ):
+ msg = (
+ "Unknown datetime string format, unable to parse: yesterday, at position 1"
+ )
+ with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(
UserWarning, match="Could not infer format"
):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/tseries/frequencies/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ def test_invalid_index_types_unicode():
# see gh-10822
#
# Odd error message on conversions to datetime for unicode.
- msg = "Unknown string format"
+ msg = "Unknown datetime string format"

with pytest.raises(ValueError, match=msg):
frequencies.infer_freq(tm.makeStringIndex(10))
Expand Down Expand Up @@ -422,7 +422,7 @@ def test_series_invalid_type(end):

def test_series_inconvertible_string():
# see gh-6407
- msg = "Unknown string format"
+ msg = "Unknown datetime string format"

with pytest.raises(ValueError, match=msg):
frequencies.infer_freq(Series(["foo", "bar"]))
Expand Down