
Commit 31c2e5f

sinhrks authored and jreback committed
PERF: improve DTI string parse
closes #11169
closes #11287

Author: sinhrks <[email protected]>

Closes #13692 from sinhrks/dti_perf and squashes the following commits:

8774772 [sinhrks] PERF: improve DTI string parse
1 parent 506520b commit 31c2e5f

File tree

9 files changed: +74 -178 lines changed


doc/source/whatsnew/v0.19.0.txt (+4)

@@ -566,6 +566,7 @@ Performance Improvements
 - Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`)
 - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
 - Improved performance of ``Index.difference`` (:issue:`12044`)
+- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)
 
 .. _whatsnew_0190.bug_fixes:
 
@@ -631,6 +632,7 @@ Bug Fixes
 - Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)
 
 
+
 - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`)
 
 
@@ -654,6 +656,8 @@ Bug Fixes
 
 - Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`)
 - Bug in invalid ``Timedelta`` arithmetic and comparison may raise ``ValueError`` rather than ``TypeError`` (:issue:`13624`)
+- Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`)
+- Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`)
 
 - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
 - Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`)
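
Note (editorial, not part of the commit): a minimal sketch of the parsing-error behaviour described by the new bug-fix entry, assuming pandas 0.19+ semantics; the invalid string is illustrative only.

    import pandas as pd

    # Unparseable datetime strings are expected to surface as ValueError
    # (previously this could escape as TypeError, see GH11169/GH11287).
    try:
        pd.to_datetime(['2011-01-01', 'not-a-date'], errors='raise')
    except ValueError as err:
        print('ValueError:', err)

    # The same is expected when constructing a DatetimeIndex directly.
    try:
        pd.DatetimeIndex(['2011-01-01', 'not-a-date'])
    except ValueError as err:
        print('ValueError:', err)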

pandas/io/parsers.py (+1 -1)

@@ -2440,7 +2440,7 @@ def converter(*date_cols):
         strs = _concat_date_cols(date_cols)
 
         try:
-            return tools._to_datetime(
+            return tools.to_datetime(
                 _ensure_object(strs),
                 utc=None,
                 box=False,

pandas/tests/indexes/test_datetimelike.py (+9 -22)

@@ -170,16 +170,6 @@ def test_construction_index_with_mixed_timezones(self):
         self.assert_index_equal(result, exp, exact=True)
         self.assertFalse(isinstance(result, DatetimeIndex))
 
-        # passing tz results in DatetimeIndex
-        result = Index([Timestamp('2011-01-01 10:00'),
-                        Timestamp('2011-01-02 10:00', tz='US/Eastern')],
-                       tz='Asia/Tokyo', name='idx')
-        exp = DatetimeIndex([Timestamp('2011-01-01 19:00'),
-                             Timestamp('2011-01-03 00:00')],
-                            tz='Asia/Tokyo', name='idx')
-        self.assert_index_equal(result, exp, exact=True)
-        self.assertTrue(isinstance(result, DatetimeIndex))
-
         # length = 1
         result = Index([Timestamp('2011-01-01')], name='idx')
         exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx')
@@ -253,17 +243,6 @@ def test_construction_index_with_mixed_timezones_with_NaT(self):
         self.assert_index_equal(result, exp, exact=True)
         self.assertFalse(isinstance(result, DatetimeIndex))
 
-        # passing tz results in DatetimeIndex
-        result = Index([pd.NaT, Timestamp('2011-01-01 10:00'),
-                        pd.NaT, Timestamp('2011-01-02 10:00',
-                                          tz='US/Eastern')],
-                       tz='Asia/Tokyo', name='idx')
-        exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 19:00'),
-                             pd.NaT, Timestamp('2011-01-03 00:00')],
-                            tz='Asia/Tokyo', name='idx')
-        self.assert_index_equal(result, exp, exact=True)
-        self.assertTrue(isinstance(result, DatetimeIndex))
-
         # all NaT
         result = Index([pd.NaT, pd.NaT], name='idx')
         exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx')
@@ -323,12 +302,13 @@ def test_construction_dti_with_mixed_timezones(self):
         self.assertTrue(isinstance(result, DatetimeIndex))
 
         # tz mismatch affecting to tz-aware raises TypeError/ValueError
+
         with tm.assertRaises(ValueError):
             DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
                            Timestamp('2011-01-02 10:00', tz='US/Eastern')],
                           name='idx')
 
-        with tm.assertRaises(TypeError):
+        with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'):
             DatetimeIndex([Timestamp('2011-01-01 10:00'),
                            Timestamp('2011-01-02 10:00', tz='US/Eastern')],
                           tz='Asia/Tokyo', name='idx')
@@ -338,6 +318,13 @@ def test_construction_dti_with_mixed_timezones(self):
                            Timestamp('2011-01-02 10:00', tz='US/Eastern')],
                           tz='US/Eastern', name='idx')
 
+        with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'):
+            # passing tz should results in DatetimeIndex, then mismatch raises
+            # TypeError
+            Index([pd.NaT, Timestamp('2011-01-01 10:00'),
+                   pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')],
+                  tz='Asia/Tokyo', name='idx')
+
     def test_construction_base_constructor(self):
         arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]
         tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr))
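
Note (editorial, not part of the commit): the new assertion above corresponds to behaviour along these lines; the error message comes from the updated pandas/tseries/index.py.

    import pandas as pd
    from pandas import Timestamp, DatetimeIndex

    # Forcing a tz onto data that is already tz-aware with a different tz
    # is expected to raise the new, more descriptive TypeError instead of
    # silently coercing the timezone.
    try:
        DatetimeIndex([Timestamp('2011-01-01 10:00'),
                       Timestamp('2011-01-02 10:00', tz='US/Eastern')],
                      tz='Asia/Tokyo', name='idx')
    except TypeError as err:
        # e.g. "data is already tz-aware US/Eastern, unable to set specified tz: Asia/Tokyo"
        print(err)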

pandas/tseries/index.py (+17 -76)

@@ -292,55 +292,32 @@ def __new__(cls, data=None,
                 raise ValueError('DatetimeIndex() must be called with a '
                                  'collection of some kind, %s was passed'
                                  % repr(data))
-
             # other iterable of some kind
             if not isinstance(data, (list, tuple)):
                 data = list(data)
-
             data = np.asarray(data, dtype='O')
+        elif isinstance(data, ABCSeries):
+            data = data._values
 
-            # try a few ways to make it datetime64
-            if lib.is_string_array(data):
-                data = tslib.parse_str_array_to_datetime(data, freq=freq,
-                                                         dayfirst=dayfirst,
-                                                         yearfirst=yearfirst)
-            else:
-                data = tools.to_datetime(data, errors='raise')
-                data.offset = freq
-                if isinstance(data, DatetimeIndex):
-                    if name is not None:
-                        data.name = name
-
-                    if tz is not None:
-
-                        # we might already be localized to this tz
-                        # so passing the same tz is ok
-                        # however any other tz is a no-no
-                        if data.tz is None:
-                            return data.tz_localize(tz, ambiguous=ambiguous)
-                        elif str(tz) != str(data.tz):
-                            raise TypeError("Already tz-aware, use tz_convert "
-                                            "to convert.")
-
-                    return data._deepcopy_if_needed(ref_to_data, copy)
-
-        if issubclass(data.dtype.type, compat.string_types):
-            data = tslib.parse_str_array_to_datetime(data, freq=freq,
-                                                     dayfirst=dayfirst,
-                                                     yearfirst=yearfirst)
+        # data must be Index or np.ndarray here
+        if not (is_datetime64_dtype(data) or is_datetimetz(data) or
+                is_integer_dtype(data)):
+            data = tools.to_datetime(data, dayfirst=dayfirst,
+                                     yearfirst=yearfirst)
 
         if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data):
-            if isinstance(data, ABCSeries):
-                data = data._values
+
             if isinstance(data, DatetimeIndex):
                 if tz is None:
                     tz = data.tz
-
+                elif data.tz is None:
+                    data = data.tz_localize(tz, ambiguous=ambiguous)
                 else:
                     # the tz's must match
                     if str(tz) != str(data.tz):
-                        raise TypeError("Already tz-aware, use tz_convert "
-                                        "to convert.")
+                        msg = ('data is already tz-aware {0}, unable to '
+                               'set specified tz: {1}')
+                        raise TypeError(msg.format(data.tz, tz))
 
                 subarr = data.values
 
@@ -356,35 +333,6 @@ def __new__(cls, data=None,
             if isinstance(data, Int64Index):
                 raise TypeError('cannot convert Int64Index->DatetimeIndex')
             subarr = data.view(_NS_DTYPE)
-        else:
-            if isinstance(data, (ABCSeries, Index)):
-                values = data._values
-            else:
-                values = data
-
-            if lib.is_string_array(values):
-                subarr = tslib.parse_str_array_to_datetime(
-                    values, freq=freq, dayfirst=dayfirst, yearfirst=yearfirst)
-            else:
-                try:
-                    subarr = tools.to_datetime(data, box=False)
-
-                    # make sure that we have a index/ndarray like (and not a
-                    # Series)
-                    if isinstance(subarr, ABCSeries):
-                        subarr = subarr._values
-                        if subarr.dtype == np.object_:
-                            subarr = tools._to_datetime(subarr, box=False)
-
-                except ValueError:
-                    # tz aware
-                    subarr = tools._to_datetime(data, box=False, utc=True)
-
-                # we may not have been able to convert
-                if not (is_datetimetz(subarr) or
-                        np.issubdtype(subarr.dtype, np.datetime64)):
-                    raise ValueError('Unable to convert %s to datetime dtype'
-                                     % str(data))
 
         if isinstance(subarr, DatetimeIndex):
             if tz is None:
@@ -399,27 +347,21 @@ def __new__(cls, data=None,
                     ints = subarr.view('i8')
                     subarr = tslib.tz_localize_to_utc(ints, tz,
                                                       ambiguous=ambiguous)
-
         subarr = subarr.view(_NS_DTYPE)
 
         subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz)
-
-        # if dtype is provided, coerce here
         if dtype is not None:
-
             if not is_dtype_equal(subarr.dtype, dtype):
-
+                # dtype must be coerced to DatetimeTZDtype above
                 if subarr.tz is not None:
                     raise ValueError("cannot localize from non-UTC data")
-                dtype = DatetimeTZDtype.construct_from_string(dtype)
-                subarr = subarr.tz_localize(dtype.tz)
 
         if verify_integrity and len(subarr) > 0:
             if freq is not None and not freq_infer:
                 inferred = subarr.inferred_freq
                 if inferred != freq.freqstr:
-                    on_freq = cls._generate(subarr[0], None, len(
-                        subarr), None, freq, tz=tz, ambiguous=ambiguous)
+                    on_freq = cls._generate(subarr[0], None, len(subarr), None,
+                                            freq, tz=tz, ambiguous=ambiguous)
                     if not np.array_equal(subarr.asi8, on_freq.asi8):
                         raise ValueError('Inferred frequency {0} from passed '
                                          'dates does not conform to passed '
@@ -563,7 +505,6 @@ def _generate(cls, start, end, periods, name, offset,
             index = index[1:]
         if not right_closed and len(index) and index[-1] == end:
             index = index[:-1]
-
         index = cls._simple_new(index, name=name, freq=offset, tz=tz)
         return index
 
@@ -669,7 +610,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None,
                 xdr = generate_range(offset=offset, start=_CACHE_START,
                                      end=_CACHE_END)
 
-                arr = tools._to_datetime(list(xdr), box=False)
+                arr = tools.to_datetime(list(xdr), box=False)
 
                 cachedRange = DatetimeIndex._simple_new(arr)
                 cachedRange.offset = offset
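
Note (editorial, not part of the commit): with the simplified dispatch above, string inputs now funnel through tools.to_datetime regardless of container type; a small sketch of spellings that are expected to be equivalent (mirroring the updated test_tslib cases).

    import numpy as np
    import pandas as pd

    strings = ['2011-01-01', '2011-01-02']

    # list, object ndarray, Index and Series inputs should all yield the
    # same DatetimeIndex via the to_datetime path.
    idx1 = pd.DatetimeIndex(strings)
    idx2 = pd.DatetimeIndex(np.array(strings, dtype=object))
    idx3 = pd.DatetimeIndex(pd.Index(strings))
    idx4 = pd.DatetimeIndex(pd.Series(strings))
    assert idx1.equals(idx2) and idx1.equals(idx3) and idx1.equals(idx4)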

pandas/tseries/resample.py (+6 -1)

@@ -1046,7 +1046,12 @@ def _get_binner_for_grouping(self, obj):
         l = []
         for key, group in grouper.get_iterator(self.ax):
             l.extend([key] * len(group))
-        grouper = binner.__class__(l, freq=binner.freq, name=binner.name)
+
+        if isinstance(self.ax, PeriodIndex):
+            grouper = binner.__class__(l, freq=binner.freq, name=binner.name)
+        else:
+            # resampling causes duplicated values, specifying freq is invalid
+            grouper = binner.__class__(l, name=binner.name)
 
         # since we may have had to sort
         # may need to reorder groups here
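
Note (editorial, not part of the commit): a rough illustration of why freq is dropped when the axis is not a PeriodIndex -- the grouper labels repeat once per member of each group, and a DatetimeIndex with duplicated values cannot satisfy a fixed freq.

    import pandas as pd

    labels = ['2000-01-01', '2000-01-01', '2000-01-02']

    pd.DatetimeIndex(labels)                # fine; freq is simply left as None
    try:
        pd.DatetimeIndex(labels, freq='D')  # expected to fail the freq check
    except ValueError as err:
        print(err)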

pandas/tseries/tests/test_timeseries.py (+3 -2)

@@ -4087,8 +4087,9 @@ def test_dti_set_index_reindex(self):
 
         # 11314
         # with tz
-        index = date_range(datetime(2015, 10, 1), datetime(
-            2015, 10, 1, 23), freq='H', tz='US/Eastern')
+        index = date_range(datetime(2015, 10, 1),
+                           datetime(2015, 10, 1, 23),
+                           freq='H', tz='US/Eastern')
         df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index)
         new_index = date_range(datetime(2015, 10, 2),
                                datetime(2015, 10, 2, 23),

pandas/tseries/tests/test_tslib.py (+13 -9)

@@ -7,7 +7,8 @@
 import datetime
 
 import pandas as pd
-from pandas.core.api import Timestamp, Series, Timedelta, Period, to_datetime
+from pandas.core.api import (Timestamp, Index, Series, Timedelta, Period,
+                             to_datetime)
 from pandas.tslib import get_timezone
 from pandas._period import period_asfreq, period_ordinal
 from pandas.tseries.index import date_range, DatetimeIndex
@@ -698,14 +699,19 @@ def test_parsers(self):
                                                    yearfirst=yearfirst)
             result2 = to_datetime(date_str, yearfirst=yearfirst)
             result3 = to_datetime([date_str], yearfirst=yearfirst)
+            # result5 is used below
             result4 = to_datetime(np.array([date_str], dtype=object),
                                   yearfirst=yearfirst)
-            result6 = DatetimeIndex([date_str], yearfirst=yearfirst)[0]
-            self.assertEqual(result1, expected)
-            self.assertEqual(result2, expected)
-            self.assertEqual(result3, expected)
-            self.assertEqual(result4, expected)
-            self.assertEqual(result6, expected)
+            result6 = DatetimeIndex([date_str], yearfirst=yearfirst)
+            # result7 is used below
+            result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst)
+            result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst)
+
+            for res in [result1, result2]:
+                self.assertEqual(res, expected)
+            for res in [result3, result4, result6, result8, result9]:
+                exp = DatetimeIndex([pd.Timestamp(expected)])
+                tm.assert_index_equal(res, exp)
 
             # these really need to have yearfist, but we don't support
             if not yearfirst:
@@ -893,9 +899,7 @@ def test_parsers_monthfreq(self):
 
         for date_str, expected in compat.iteritems(cases):
             result1, _, _ = tools.parse_time_string(date_str, freq='M')
-            result2 = tools._to_datetime(date_str, freq='M')
             self.assertEqual(result1, expected)
-            self.assertEqual(result2, expected)
 
     def test_parsers_quarterly_with_freq(self):
         msg = ('Incorrect quarterly string is given, quarter '
