Skip to content

Commit 78bb467

Browse files
committed
PERF: Speed up pd.to_datetime() by optionally inferring dt format pandas-dev#5490
Given an array of strings that represent datetimes, passing infer_datetime_format=True makes pd.to_datetime() attempt to guess the format of the datetimes; if the format can be inferred, a faster function is used to convert the strings. In cases where this speed-up can be used, the conversion should be about 10x faster.
1 parent 4dcecb0 commit 78bb467

File tree

2 files changed

+347
-14
lines changed

2 files changed

+347
-14
lines changed

pandas/tseries/tests/test_timeseries.py

+178-5
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from pandas.core.daterange import DateRange
1919
import pandas.core.datetools as datetools
2020
import pandas.tseries.offsets as offsets
21+
import pandas.tseries.tools as tools
2122
import pandas.tseries.frequencies as fmod
2223
import pandas as pd
2324

@@ -49,6 +50,11 @@ def _skip_if_no_pytz():
4950
except ImportError:
5051
raise nose.SkipTest("pytz not installed")
5152

53+
def _skip_if_has_locale():
    """Skip the calling test when ``locale.getlocale()`` reports a language.

    Raises
    ------
    nose.SkipTest
        If the language part of the current locale is not None.
    """
    import locale

    current_lang = locale.getlocale()[0]
    if current_lang is None:
        # No specific locale configured: let the test run.
        return
    raise nose.SkipTest("Specific locale is set {0}".format(current_lang))
5258

5359
class TestTimeSeriesDuplicates(tm.TestCase):
5460
_multiprocess_can_split_ = True
@@ -909,12 +915,8 @@ def test_to_datetime_on_datetime64_series(self):
909915
self.assertEquals(result[0], s[0])
910916

911917
def test_to_datetime_with_apply(self):
912-
913918
# this is only locale tested with US/None locales
914-
import locale
915-
(lang,encoding) = locale.getlocale()
916-
if lang is not None:
917-
raise nose.SkipTest("format codes cannot work with a locale of {0}".format(lang))
919+
_skip_if_has_locale()
918920

919921
# GH 5195
920922
# with a format and coerce a single item to_datetime fails
@@ -3124,6 +3126,177 @@ def test_date_range_fy5252(self):
31243126
self.assertEqual(dr[1], Timestamp('2014-01-30'))
31253127

31263128

3129+
class TestToDatetimeInferFormat(tm.TestCase):
    """Compare ``pd.to_datetime`` results with and without
    ``infer_datetime_format``.

    Every test converts the same series of strings with
    ``infer_datetime_format=False`` and ``infer_datetime_format=True`` and
    requires the two results to be equal.
    """

    def _assert_infer_matches_default(self, test_series):
        # Shared check: converting with and without format inference
        # must give equal results.
        self.assertTrue(np.array_equal(
            pd.to_datetime(test_series, infer_datetime_format=False),
            pd.to_datetime(test_series, infer_datetime_format=True)
        ))

    def test_to_datetime_infer_datetime_format_consistent_format(self):
        time_series = pd.Series(
            pd.date_range('20000101', periods=50, freq='H')
        )

        test_formats = [
            '%m-%d-%Y',
            '%m/%d/%Y %H:%M:%S.%f',
            '%Y-%m-%dT%H:%M:%S.%f',
        ]

        for test_format in test_formats:
            # Render every timestamp with the same strftime format.
            s_as_dt_strings = time_series.apply(
                lambda x: x.strftime(test_format)
            )

            with_format = pd.to_datetime(s_as_dt_strings, format=test_format)
            no_infer = pd.to_datetime(
                s_as_dt_strings, infer_datetime_format=False
            )
            yes_infer = pd.to_datetime(
                s_as_dt_strings, infer_datetime_format=True
            )

            # Whether the format is explicitly passed, it is inferred, or
            # it is not inferred, the results should all be the same
            self.assertTrue(np.array_equal(with_format, no_infer))
            self.assertTrue(np.array_equal(no_infer, yes_infer))

    def test_to_datetime_infer_datetime_format_inconsistent_format(self):
        test_series = pd.Series(
            np.array([
                '01/01/2011 00:00:00',
                '01-02-2011 00:00:00',
                '2011-01-03T00:00:00',
            ]))

        # When the format is inconsistent, infer_datetime_format should just
        # fall back to the default parsing
        self._assert_infer_matches_default(test_series)

        test_series = pd.Series(
            np.array([
                'Jan/01/2011',
                'Feb/01/2011',
                'Mar/01/2011',
            ]))

        self._assert_infer_matches_default(test_series)

    def test_to_datetime_infer_datetime_format_series_with_nans(self):
        # NaN entries are interleaved with the datetime strings.
        test_series = pd.Series(
            np.array([
                '01/01/2011 00:00:00',
                np.nan,
                '01/03/2011 00:00:00',
                np.nan,
            ]))

        self._assert_infer_matches_default(test_series)

    def test_to_datetime_infer_datetime_format_series_starting_with_nans(self):
        # The leading entries are NaN; the first datetime string comes later.
        test_series = pd.Series(
            np.array([
                np.nan,
                np.nan,
                '01/01/2011 00:00:00',
                '01/02/2011 00:00:00',
                '01/03/2011 00:00:00',
            ]))

        self._assert_infer_matches_default(test_series)
3214+
3215+
3216+
class TestGuessDatetimeFormat(tm.TestCase):
    """Tests for ``tools._guess_datetime_format`` and
    ``tools._guess_datetime_format_for_array``.

    Each test feeds datetime-like inputs to the guessing helpers and checks
    the returned strftime format string (or None when nothing is guessed).
    """

    def test_guess_datetime_format_with_parseable_formats(self):
        # Pairs of (input string, format expected to be guessed).
        dt_string_to_format = (
            ('20111230', '%Y%m%d'),
            ('2011-12-30', '%Y-%m-%d'),
            ('30-12-2011', '%d-%m-%Y'),
            ('2011-12-30 00:00:00', '%Y-%m-%d %H:%M:%S'),
            ('2011-12-30T00:00:00', '%Y-%m-%dT%H:%M:%S'),
            ('2011-12-30 00:00:00.000000', '%Y-%m-%d %H:%M:%S.%f'),
        )

        for dt_string, dt_format in dt_string_to_format:
            self.assertEqual(
                tools._guess_datetime_format(dt_string),
                dt_format
            )

    def test_guess_datetime_format_with_dayfirst(self):
        # The same string must resolve differently depending on dayfirst.
        ambiguous_string = '01/01/2011'
        self.assertEqual(
            tools._guess_datetime_format(ambiguous_string, dayfirst=True),
            '%d/%m/%Y'
        )
        self.assertEqual(
            tools._guess_datetime_format(ambiguous_string, dayfirst=False),
            '%m/%d/%Y'
        )

    def test_guess_datetime_format_with_locale_specific_formats(self):
        # The month names will vary depending on the locale, in which
        # case these won't be parsed properly (dateutil can't parse them)
        _skip_if_has_locale()

        dt_string_to_format = (
            ('30/Dec/2011', '%d/%b/%Y'),
            ('30/December/2011', '%d/%B/%Y'),
            ('30/Dec/2011 00:00:00', '%d/%b/%Y %H:%M:%S'),
        )

        for dt_string, dt_format in dt_string_to_format:
            self.assertEqual(
                tools._guess_datetime_format(dt_string),
                dt_format
            )

    def test_guess_datetime_format_invalid_inputs(self):
        # A datetime string must include a year, month and a day for it
        # to be guessable, in addition to being a string that looks like
        # a datetime
        invalid_dts = [
            '2013',
            '01/2013',
            '12:00:00',
            '1/1/1/1',
            'this_is_not_a_datetime',
            '51a',
            9,
            datetime(2011, 1, 1),
        ]

        for invalid_dt in invalid_dts:
            self.assertTrue(tools._guess_datetime_format(invalid_dt) is None)

    def test_guess_datetime_format_for_array(self):
        expected_format = '%Y-%m-%d %H:%M:%S.%f'
        dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format)

        # Each array is expected to yield expected_format, including the
        # ones with leading NaNs or a trailing non-datetime string.
        test_arrays = [
            np.array([dt_string, dt_string, dt_string], dtype='O'),
            np.array([np.nan, np.nan, dt_string], dtype='O'),
            np.array([dt_string, 'random_string'], dtype='O'),
        ]

        for test_array in test_arrays:
            self.assertEqual(
                tools._guess_datetime_format_for_array(test_array),
                expected_format
            )

        # An array holding only NaNs gives no format to guess.
        format_for_array_of_nans = tools._guess_datetime_format_for_array(
            np.array([np.nan, np.nan, np.nan], dtype='O')
        )
        self.assertTrue(format_for_array_of_nans is None)
3299+
31273300
if __name__ == '__main__':
    # When executed as a script, hand this module to the nose runner;
    # exit=False is passed so the runner call returns instead of exiting.
    nose.runmodule(
        argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
        exit=False
    )

0 commit comments

Comments
 (0)