# Patch notes (free text ahead of the first "diff --git" header).
#
# GH 43901: performance improvement for to_datetime(..., infer_datetime_format=True).
#
# Files touched:
#   * asv_bench/benchmarks/inference.py
#       Adds the ToDatetimeInferDatetimeFormat benchmark: 100000 hourly
#       timestamps rendered as "%Y-%m-%d %H:%M:%S" strings and parsed with
#       infer_datetime_format=True.
#   * doc/source/whatsnew/v1.4.0.rst
#       Adds the matching "Performance improvements" entry.
#   * pandas/_libs/tslibs/parsing.pyx (guess_datetime_format)
#       - Adds ('day_of_week',) -> '%a' / '%A' and ('meridiem',) -> '%p'
#         entries to the attribute/format table.
#       - Drops the "all(getattr(parsed_datetime, attr) is not None ...)" guard
#         and instead skips '%Z' / '%z' when parsed_datetime.tzinfo is None.
#       - Calls parsed_datetime.strftime(attr_format) once per table entry,
#         before the token loop, instead of inside the loop condition.
#   * pandas/tests/tslibs/test_parsing.py
#       Adds two expected-format cases with a weekday name and an AM marker.
#
# NOTE(review): the new expected formats combine '%H' with '%p'. Python's
#   strptime only lets '%p' affect the hour when the hour is parsed with '%I',
#   so a "PM" string parsed with such a format would presumably keep the
#   12-hour value unchanged -- confirm this is intended.
# NOTE(review): the whatsnew context line "(:issue:43370`)" is missing a
#   backtick. It is a context line, so it is left verbatim to keep the hunk
#   matching the target file; fix it in the target file, not here.
# NOTE(review): line breaks and indentation below were reconstructed from a
#   flattened copy of this patch. Hunk line counts agree with the @@ headers,
#   but leading whitespace is inferred from the code structure -- verify
#   against the target tree (e.g. a dry-run apply) before relying on it.

diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
index 769889dfbe75d..a5a7bc5b5c8bd 100644
--- a/asv_bench/benchmarks/inference.py
+++ b/asv_bench/benchmarks/inference.py
@@ -277,6 +277,16 @@ def time_dup_string_tzoffset_dates(self, cache):
         to_datetime(self.dup_string_with_tz, cache=cache)
 
 
+# GH 43901
+class ToDatetimeInferDatetimeFormat:
+    def setup(self):
+        rng = date_range(start="1/1/2000", periods=100000, freq="H")
+        self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
+
+    def time_infer_datetime_format(self):
+        to_datetime(self.strings, infer_datetime_format=True)
+
+
 class ToTimedelta:
     def setup(self):
         self.ints = np.random.randint(0, 60, size=10000)
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index daf0d0d000079..dc967c775ea2b 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -358,6 +358,7 @@ Performance improvements
 - Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`)
 - Performance improvement in :func:`read_stata` (:issue:`43059`)
 - Performance improvement in :meth:`to_datetime` with ``uint`` dtypes (:issue:`42606`)
+- Performance improvement in :meth:`to_datetime` with ``infer_datetime_format`` set to ``True`` (:issue:`43901`)
 - Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`)
 - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`)
 - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
index afbb63ecbd2d7..7298b42610a13 100644
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -911,6 +911,9 @@ def guess_datetime_format(
         (('second', 'microsecond'), '%S.%f', 0),
         (('tzinfo',), '%z', 0),
         (('tzinfo',), '%Z', 0),
+        (('day_of_week',), '%a', 0),
+        (('day_of_week',), '%A', 0),
+        (('meridiem',), '%p', 0),
     ]
 
     if dayfirst:
@@ -967,15 +970,17 @@ def guess_datetime_format(
         if set(attrs) & found_attrs:
             continue
 
-        if all(getattr(parsed_datetime, attr) is not None for attr in attrs):
-            for i, token_format in enumerate(format_guess):
-                token_filled = tokens[i].zfill(padding)
-                if (token_format is None and
-                        token_filled == parsed_datetime.strftime(attr_format)):
-                    format_guess[i] = attr_format
-                    tokens[i] = token_filled
-                    found_attrs.update(attrs)
-                    break
+        if parsed_datetime.tzinfo is None and attr_format in ("%Z", "%z"):
+            continue
+
+        parsed_formatted = parsed_datetime.strftime(attr_format)
+        for i, token_format in enumerate(format_guess):
+            token_filled = tokens[i].zfill(padding)
+            if token_format is None and token_filled == parsed_formatted:
+                format_guess[i] = attr_format
+                tokens[i] = token_filled
+                found_attrs.update(attrs)
+                break
 
     # Only consider it a valid guess if we have a year, month and day
     if len({'year', 'month', 'day'} & found_attrs) != 3:
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
index 3992457c9c361..3d2daec442c38 100644
--- a/pandas/tests/tslibs/test_parsing.py
+++ b/pandas/tests/tslibs/test_parsing.py
@@ -167,6 +167,8 @@ def test_parsers_month_freq(date_str, expected):
         ("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"),
         ("2011-12-30T00:00:00.000000+09:", None),
         ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"),
+        ("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %H:%M:%S %p"),
+        ("Tuesday 24 Aug 2021 01:30:48 AM", "%A %d %b %Y %H:%M:%S %p"),
     ],
 )
 def test_guess_datetime_format_with_parseable_formats(string, fmt):