Skip to content

Commit 7dc9406

Browse files
committed
ENH: allow construction of datetimes from columns in a DataFrame
closes #8158
1 parent 2fd0a06 commit 7dc9406

File tree

5 files changed

+308
-16
lines changed

5 files changed

+308
-16
lines changed

doc/source/timeseries.rst

+24-2
Original file line numberDiff line numberDiff line change
@@ -189,9 +189,31 @@ or ``format``, use ``to_datetime`` if these are required.
189189

190190
.. ipython:: python
191191
192-
to_datetime('2010/11/12')
192+
pd.to_datetime('2010/11/12')
193193
194-
Timestamp('2010/11/12')
194+
pd.Timestamp('2010/11/12')
195+
196+
.. versionadded:: 0.18.1
197+
198+
You can also pass a ``DataFrame`` of integer or string columns to assemble into a ``Series``
199+
of ``Timestamps``.
200+
201+
.. ipython:: python
202+
203+
df = pd.DataFrame({'year': [2015, 2016],
204+
'month': [2, 3],
205+
'day': [4, 5],
206+
'hour': [2, 3]})
207+
pd.to_datetime(df)
208+
209+
210+
You can pass only the columns that you need to assemble.
211+
212+
.. ipython:: python
213+
214+
pd.to_datetime(df[['year', 'month', 'day']])
215+
216+
.. _whatsnew_0181.other:
195217

196218

197219
Invalid Data

doc/source/whatsnew/v0.18.1.txt

+18
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,24 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiI
6969
dft2 = dft2.swaplevel(0, 1).sort_index()
7070
dft2.loc[idx[:, '2013-01-05'], :]
7171

72+
.. _whatsnew_0181.enhancements.assembling:
73+
74+
Assembling Datetimes
75+
^^^^^^^^^^^^^^^^^^^^
76+
77+
``pd.to_datetime()`` has gained the ability to assemble datetimes from a passed-in ``DataFrame`` or a dict (:issue:`8158`).
78+
79+
.. ipython:: python
80+
81+
df = pd.DataFrame({'year': [2015, 2016],
82+
'month': [2, 3],
83+
'day': [4, 5],
84+
'hour': [2, 3]})
85+
pd.to_datetime(df)
86+
87+
# pass only the columns that you need to assemble
88+
pd.to_datetime(df[['year', 'month', 'day']])
89+
7290
.. _whatsnew_0181.other:
7391

7492
Other Enhancements

pandas/tseries/tests/test_period.py

-10
Original file line numberDiff line numberDiff line change
@@ -3144,16 +3144,6 @@ def test_to_datetime_1703(self):
31443144
result = index.to_datetime()
31453145
self.assertEqual(result[0], Timestamp('1/1/2012'))
31463146

3147-
def test_to_datetime_dimensions(self):
3148-
# GH 11776
3149-
df = DataFrame({'a': ['1/1/2012', '1/2/2012'],
3150-
'b': ['12/30/2012', '12/31/2012']})
3151-
with tm.assertRaisesRegexp(TypeError, "1-d array"):
3152-
to_datetime(df)
3153-
for errors in ['ignore', 'raise', 'coerce']:
3154-
with tm.assertRaisesRegexp(TypeError, "1-d array"):
3155-
to_datetime(df, errors=errors)
3156-
31573147
def test_get_loc_msg(self):
31583148
idx = period_range('2000-1-1', freq='A', periods=10)
31593149
bad_period = Period('2012', 'A')

pandas/tseries/tests/test_timeseries.py

+125
Original file line numberDiff line numberDiff line change
@@ -2283,6 +2283,131 @@ def _simple_ts(start, end, freq='D'):
22832283
return Series(np.random.randn(len(rng)), index=rng)
22842284

22852285

2286+
class TestToDatetime(tm.TestCase):
    """Tests for ``to_datetime`` when given a DataFrame or dict of columns."""
    _multiprocess_can_split_ = True

    # TODO: move all to_datetime tests not covered in other
    # classes here

    def test_dataframe(self):
        """Assemble datetimes from unit-named columns of a DataFrame/dict.

        Covers the plain year/month/day case, dict input, the accepted
        column-name aliases, sub-second units, string-valued columns,
        ``errors='coerce'`` and the ValueError cases (unrecognised columns,
        missing required columns, duplicate columns).
        """

        df = DataFrame({'year': [2015, 2016],
                        'month': [2, 3],
                        'day': [4, 5],
                        'hour': [6, 7],
                        'minute': [58, 59],
                        'second': [10, 11],
                        'ms': [1, 1],
                        'us': [2, 2],
                        'ns': [3, 3]})

        # dict of Series
        result = to_datetime({'year': df['year'],
                              'month': df['month'],
                              'day': df['day']})
        expected = Series([Timestamp('20150204 00:00:00'),
                           Timestamp('20160305 00:0:00')])
        assert_series_equal(result, expected)

        # dict-like
        result = to_datetime(df[['year', 'month', 'day']].to_dict())
        assert_series_equal(result, expected)

        # dict but with constructable
        df2 = df[['year', 'month', 'day']].to_dict()
        df2['month'] = 2
        result = to_datetime(df2)
        expected2 = Series([Timestamp('20150204 00:00:00'),
                            Timestamp('20160205 00:0:00')])
        assert_series_equal(result, expected2)

        # unit mappings: each dict renames the canonical columns to one
        # family of accepted aliases
        units = [{'year': 'year',
                  'month': 'month',
                  'day': 'day',
                  'hour': 'HH',
                  'minute': 'MM',
                  'second': 'SS'},
                 {'year': '%Y',
                  'month': '%m',
                  'day': '%d',
                  'hour': '%H',
                  'minute': '%M',
                  'second': '%S'},
                 {'year': 'y',
                  'month': 'month',
                  'day': 'd',
                  'hour': 'h',
                  'minute': 'm',
                  'second': 's'},
                 ]

        # the expected result is the same for every alias family, so it is
        # built once outside the loop
        expected = Series([Timestamp('20150204 06:58:10'),
                           Timestamp('20160305 07:59:11')])
        for d in units:
            result = to_datetime(df[list(d.keys())].rename(columns=d))
            assert_series_equal(result, expected)

        d = {'year': 'y',
             'month': 'month',
             'day': 'd',
             'hour': 'h',
             'minute': 'm',
             'second': 's',
             'ms': 'ms',
             'us': 'us',
             'ns': 'ns'}

        result = to_datetime(df.rename(columns=d))
        expected = Series([Timestamp('20150204 06:58:10.001002003'),
                           Timestamp('20160305 07:59:11.001002003')])
        assert_series_equal(result, expected)

        # coerce back to int
        # NOTE(review): ``unit=d`` looks unused on the DataFrame path (the
        # assembler is only handed ``errors``) -- confirm before relying on it
        result = to_datetime(df.astype(str), unit=d)
        assert_series_equal(result, expected)

        # passing coerce
        df2 = DataFrame({'year': [2015, 2016],
                         'month': [2, 20],
                         'day': [4, 5]})
        with self.assertRaises(ValueError):
            to_datetime(df2)
        result = to_datetime(df2, errors='coerce')
        expected = Series([Timestamp('20150204 00:00:00'),
                           pd.NaT])
        assert_series_equal(result, expected)

        # extra columns
        # build the fixture outside the context manager so that only the
        # call under test can satisfy the assertion
        df2 = df.copy()
        df2['foo'] = 1
        with self.assertRaises(ValueError):
            to_datetime(df2)

        # not enough
        for c in [['year'],
                  ['year', 'month'],
                  ['year', 'month', 'second'],
                  ['month', 'day'],
                  ['year', 'day', 'second']]:
            with self.assertRaises(ValueError):
                to_datetime(df[c])

        # duplicates
        df2 = DataFrame({'year': [2015, 2016],
                         'month': [2, 20],
                         'day': [4, 5]})
        df2.columns = ['year', 'year', 'day']
        with self.assertRaises(ValueError):
            to_datetime(df2)

        df2 = DataFrame({'year': [2015, 2016],
                         'month': [2, 20],
                         'day': [4, 5],
                         'hour': [4, 5]})
        df2.columns = ['year', 'month', 'day', 'day']
        with self.assertRaises(ValueError):
            to_datetime(df2)
2410+
22862411
class TestDatetimeIndex(tm.TestCase):
22872412
_multiprocess_can_split_ = True
22882413

pandas/tseries/tools.py

+141-4
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
from datetime import datetime, timedelta, time
22
import numpy as np
3+
from collections import MutableMapping
34

45
import pandas.lib as lib
56
import pandas.tslib as tslib
67
import pandas.core.common as com
7-
from pandas.core.common import ABCIndexClass
8+
from pandas.core.common import ABCIndexClass, ABCSeries, ABCDataFrame
89
import pandas.compat as compat
910
from pandas.util.decorators import deprecate_kwarg
1011

@@ -175,7 +176,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
175176
176177
Parameters
177178
----------
178-
arg : string, datetime, list, tuple, 1-d array, or Series
179+
arg : string, datetime, list, tuple, 1-d array, Series
180+
181+
.. versionadded:: 0.18.1
182+
183+
or DataFrame/dict-like
184+
179185
errors : {'ignore', 'raise', 'coerce'}, default 'raise'
180186
181187
- If 'raise', then invalid parsing will raise an exception
@@ -282,6 +288,18 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
282288
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
283289
NaT
284290
291+
292+
Assembling a datetime from multiple columns of a DataFrame. The keys can be
293+
strptime-like (%Y, %m) or common abbreviations like ('year', 'month')
294+
295+
>>> df = pd.DataFrame({'year': [2015, 2016],
296+
'month': [2, 3],
297+
'day': [4, 5]})
298+
>>> pd.to_datetime(df)
299+
0 2015-02-04
300+
1 2016-03-05
301+
dtype: datetime64[ns]
302+
285303
"""
286304
return _to_datetime(arg, errors=errors, dayfirst=dayfirst,
287305
yearfirst=yearfirst,
@@ -296,7 +314,6 @@ def _to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
296314
Same as to_datetime, but accept freq for
297315
DatetimeIndex internal construction
298316
"""
299-
from pandas.core.series import Series
300317
from pandas.tseries.index import DatetimeIndex
301318

302319
def _convert_listlike(arg, box, format, name=None):
@@ -407,16 +424,136 @@ def _convert_listlike(arg, box, format, name=None):
407424
return arg
408425
elif isinstance(arg, tslib.Timestamp):
409426
return arg
410-
elif isinstance(arg, Series):
427+
elif isinstance(arg, ABCSeries):
428+
from pandas import Series
411429
values = _convert_listlike(arg._values, False, format)
412430
return Series(values, index=arg.index, name=arg.name)
431+
elif isinstance(arg, (ABCDataFrame, MutableMapping)):
432+
return _assemble_from_unit_mappings(arg, errors=errors)
413433
elif isinstance(arg, ABCIndexClass):
414434
return _convert_listlike(arg, box, format, name=arg.name)
415435
elif com.is_list_like(arg):
416436
return _convert_listlike(arg, box, format)
417437

418438
return _convert_listlike(np.array([arg]), box, format)[0]
419439

440+
# Aliases accepted as column names / dict keys when assembling datetimes,
# mapped to the canonical unit each alias stands for.  Date parts keep their
# long names ('year', 'month', 'day'); time parts use the short unit codes
# that are passed on to ``to_timedelta``.
_unit_map = {'year': 'year',
             'y': 'year',
             '%Y': 'year',
             'month': 'month',
             'M': 'month',
             '%m': 'month',
             'day': 'day',
             'days': 'day',
             'd': 'day',
             '%d': 'day',
             'h': 'h',
             'hour': 'h',
             'hh': 'h',
             '%H': 'h',
             'minute': 'm',
             't': 'm',
             'min': 'm',
             '%M': 'm',
             'mm': 'm',
             'MM': 'm',
             's': 's',
             'seconds': 's',
             'second': 's',
             '%S': 's',
             'ss': 's',
             'ms': 'ms',
             'millisecond': 'ms',
             'milliseconds': 'ms',
             'us': 'us',
             'microsecond': 'us',
             'microseconds': 'us',
             'ns': 'ns',
             'nanosecond': 'ns',
             'nanoseconds': 'ns'
             }


def _assemble_from_unit_mappings(arg, errors):
    """
    Assemble datetimes from the unit-named columns of ``arg``.

    The year/month/day columns are combined into a ``YYYYMMDD`` number and
    parsed with ``to_datetime``; any of the h/m/s/ms/us/ns columns present
    are then added on as timedeltas.

    Parameters
    ----------
    arg : DataFrame or dict-like
        Passed through the ``DataFrame`` constructor; its column labels must
        be keys of ``_unit_map`` (or case variants of them)
    errors : {'ignore', 'raise', 'coerce'}
        Forwarded unchanged to ``to_numeric``, ``to_datetime`` and
        ``to_timedelta``

    Returns
    -------
    Series

    Raises
    ------
    ValueError
        If the column labels are not unique, if two labels refer to the same
        unit, if any of year/month/day is missing, if a label is not a
        recognised unit, or if the values cannot be assembled.
    """
    from pandas import to_timedelta, to_numeric, DataFrame

    arg = DataFrame(arg)
    if not arg.columns.is_unique:
        raise ValueError("cannot assemble with duplicate keys")

    # replace passed unit with _unit_map
    def f(value):
        if value in _unit_map:
            return _unit_map[value]

        # non-string labels (e.g. integers) can never name a unit; hand them
        # back untouched so they are reported as extra keys below instead of
        # failing here with an AttributeError
        try:
            lowered = value.lower()
            leading_m = value.startswith('m')
        except AttributeError:
            return value

        # m is case significant
        if lowered in _unit_map and not leading_m:
            return _unit_map[lowered]

        return value

    unit = {k: f(k) for k in arg.keys()}

    # invert label -> unit, refusing two labels that alias the same unit
    # (e.g. 'year' and 'y'): silently keeping one would drop a column
    unit_rev = {}
    for label, canonical in unit.items():
        if canonical in unit_rev:
            clash = sorted([unit_rev[canonical], label], key=str)
            raise ValueError("cannot assemble with duplicate keys: "
                             "[{0}] refer to the same unit "
                             "'{1}'".format(','.join(str(c) for c in clash),
                                            canonical))
        unit_rev[canonical] = label

    # we require at least Ymd
    required = ['year', 'month', 'day']
    req = sorted(set(required) - set(unit_rev))
    if req:
        raise ValueError("to assemble mappings with a dict of "
                         "units, requires year, month, day: "
                         "[{0}] is missing".format(','.join(req)))

    # keys we don't recognize; labels may be of mixed types, so order and
    # render them via str()
    excess = sorted(set(unit_rev) - set(_unit_map.values()), key=str)
    if excess:
        raise ValueError("extra keys have been passed "
                         "to the datetime assemblage: "
                         "[{0}]".format(','.join(str(e) for e in excess)))

    def coerce(values):
        # convert to numeric; invalid entries are handled per ``errors``
        return to_numeric(values, errors=errors)

    # pack the date parts into a single YYYYMMDD number per row
    values = (coerce(arg[unit_rev['year']]) * 10000 +
              coerce(arg[unit_rev['month']]) * 100 +
              coerce(arg[unit_rev['day']]))
    try:
        values = to_datetime(values, format='%Y%m%d', errors=errors)
    except (TypeError, ValueError) as e:
        raise ValueError("cannot assemble the "
                         "datetimes: {0}".format(e))

    # add whichever optional time components were supplied
    for u in ['h', 'm', 's', 'ms', 'us', 'ns']:
        value = unit_rev.get(u)
        if value is None:
            continue
        try:
            values = values + to_timedelta(coerce(arg[value]),
                                           unit=u,
                                           errors=errors)
        except (TypeError, ValueError) as e:
            raise ValueError("cannot assemble the datetimes "
                             "[{0}]: {1}".format(value, e))

    return values
556+
420557

421558
def _attempt_YYYYMMDD(arg, errors):
422559
""" try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,

0 commit comments

Comments
 (0)