diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index b1b38505b9476..b7229c8f56080 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -485,7 +485,16 @@ cpdef array_to_datetime( # string seen_string = True - if len(val) == 0 or val in nat_strings: + if len(val) == 0: + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_ignore: + raise TypeError("Empty string is not a valid datetime") + else: + raise ValueError("Empty string is not a valid datetime") + + if val in nat_strings: iresult[i] = NPY_NAT continue @@ -710,7 +719,10 @@ cdef array_to_datetime_object( # GH 25978. No need to parse NaT-like or datetime-like vals oresult[i] = val elif isinstance(val, str): - if len(val) == 0 or val in nat_strings: + if len(val) == 0: + oresult[i] = val + continue + if val in nat_strings: oresult[i] = 'NaT' continue try: diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index adf1dfbc1ac72..349c0b625de98 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -596,7 +596,9 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, int out_local = 0, out_tzoffset = 0 bint do_parse_datetime_string = False - if len(ts) == 0 or ts in nat_strings: + if len(ts) == 0: + raise ValueError("Empty string is not a valid timestamp") + elif ts in nat_strings: ts = NaT elif ts == 'now': # Issue 9000, we short-circuit rather than going diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index ee32ed53a908b..1c06178363c43 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -294,7 +294,9 @@ cdef inline int64_t parse_timedelta_string(str ts) except? 
-1: # have_value : track if we have at least 1 leading unit # have_hhmmss : tracks if we have a regular format hh:mm:ss - if len(ts) == 0 or ts in nat_strings: + if len(ts) == 0: + raise ValueError("Empty string is not a valid timedelta") + if ts in nat_strings: return NPY_NAT for c in ts: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5f2b901844dad..5baadf237e264 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2182,7 +2182,7 @@ def isna(self): For datetimes, `NaT` (Not a Time) is considered as an NA value. >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), - ... pd.Timestamp(''), None, pd.NaT]) + ... pd.Timestamp('NaT'), None, pd.NaT]) >>> idx DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], dtype='datetime64[ns]', freq=None) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index db5c4af9c6f53..212b45639d7c2 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -643,7 +643,7 @@ def predictions(tool): { "Key": ["B", "B", "A", "A"], "State": ["step1", "step2", "step1", "step2"], - "oTime": ["", "2016-09-19 05:24:33", "", "2016-09-19 23:59:04"], + "oTime": ["NaT", "2016-09-19 05:24:33", "NaT", "2016-09-19 23:59:04"], "Machine": ["23", "36L", "36R", "36R"], } ) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 662659982c0b3..c7f166726f397 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1129,9 +1129,7 @@ def test_parse_dates_empty_string(all_parsers): data = "Date,test\n2012-01-01,1\n,2" result = parser.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False) - expected = DataFrame( - [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] - ) + expected = DataFrame([[datetime(2012, 1, 1), 1], ["", 2]], columns=["Date", "test"]) tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 09d5d9c1677d0..91fac70b886f5 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -104,8 +104,13 @@ def test_identity(klass, value): @pytest.mark.parametrize("klass", [Timestamp, Timedelta, Period]) @pytest.mark.parametrize("value", ["", "nat", "NAT", None, np.nan]) def test_equality(klass, value): - if klass is Period and value == "": - pytest.skip("Period cannot parse empty string") + if value == "": + if klass is Period: + pytest.skip("Period cannot parse empty string") + elif klass is Timedelta: + pytest.skip("Timedelta cannot parse empty string") + elif klass is Timestamp: + pytest.skip("Timestamp cannot parse empty string") assert klass(value).value == iNaT diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 23fb25b838da6..d48182076ccb2 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -131,6 +131,12 @@ def test_construction(): Timedelta("foo bar") +def test_construction_empty_string(): + # Issue #36550, empty string + with pytest.raises(ValueError, match="Empty string is not a valid timedelta"): + Timedelta("") + + @pytest.mark.parametrize( "item", list( diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index d1c3ad508d877..6aa6a5f88a22e 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -552,6 +552,10 @@ def test_constructor_fromisocalendar(self): assert result == expected_stdlib assert isinstance(result, Timestamp) + def test_constructor_empty_string(self): + with pytest.raises(ValueError, match="Empty string is not a valid timestamp"): + Timestamp("") + def test_constructor_ambigous_dst(): # GH 24329 diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 3836c1d56bf87..9badc92c5b89d 100644 --- 
a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -80,3 +80,21 @@ def test_isin_empty(self, empty): result = s.isin(empty) tm.assert_series_equal(expected, result) + + @pytest.mark.parametrize( + "values, in_list, expected", + [ + ([""], ["", pd.Timedelta(0)], [True]), + (["", pd.Timedelta(0)], [""], [True, False]), + ([""], ["", pd.to_datetime("2020-01-01")], [True]), + (["", pd.to_datetime("2020-01-01")], [""], [True, False]), + ], + ) + def test_empty_string_category(self, values, in_list, expected): + # Issue #36550 + # Mixed empty string with datetimelike + s = pd.Series(values) + tm.assert_series_equal( + s.isin(in_list), + pd.Series(expected), + ) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 819474e1f32e7..c6231f40debaa 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1090,6 +1090,16 @@ def test_to_datetime_fixed_offset(self): result = to_datetime(dates) assert result.tz == fixed_off + def test_to_datetime_empty_string(self): + with pytest.raises(ValueError): + to_datetime("", errors="raise") + + result = to_datetime("", errors="ignore") + assert result == "" + + result = to_datetime("", errors="coerce") + assert result is NaT + class TestToDatetimeUnit: @pytest.mark.parametrize("cache", [True, False]) @@ -1574,11 +1584,11 @@ def test_to_datetime_with_apply(self, cache): def test_to_datetime_types(self, cache): # empty string - result = to_datetime("", cache=cache) - assert result is NaT + with pytest.raises(ValueError): + to_datetime("", cache=cache) - result = to_datetime(["", ""], cache=cache) - assert isna(result).all() + with pytest.raises(ValueError): + to_datetime(["", ""], cache=cache) # ints result = Timestamp(0) diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index f68d83f7f4d58..26a692970d581 100644 --- 
a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -4,15 +4,15 @@ import pytest import pandas as pd -from pandas import Series, TimedeltaIndex, isna, to_timedelta +from pandas import Series, TimedeltaIndex, to_timedelta import pandas._testing as tm class TestTimedeltas: def test_to_timedelta(self): - result = to_timedelta(["", ""]) - assert isna(result).all() + with pytest.raises(ValueError): + to_timedelta(["", ""]) # pass thru result = to_timedelta(np.array([np.timedelta64(1, "s")]))