Skip to content

Commit e0b60c0

Browse files
committed
BUG: Bug in DataFrame construction with nulls and datetimes in a list-like
closes #15869

Author: Jeff Reback <[email protected]>

Closes #15892 from jreback/construct and squashes the following commits:

6bf2148 [Jeff Reback] fix perf
7fcd4e5 [Jeff Reback] BUG: Bug in DataFrame construction with nulls and datetimes in a list-like
1 parent e50d397 commit e0b60c0

File tree

6 files changed

+149
-62
lines changed

6 files changed

+149
-62
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,7 @@ Conversion
997997
- Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`)
998998
- Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`)
999999
- Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)
1000+
- Bug in ``DataFrame`` construction with nulls and datetimes in a list-like (:issue:`15869`)
10001001

10011002
Indexing
10021003
^^^^^^^^

pandas/_libs/src/inference.pyx

+67-12
Original file line numberDiff line numberDiff line change
@@ -439,31 +439,86 @@ def infer_dtype(object value):
439439
return 'mixed'
440440

441441

442-
cpdef bint is_possible_datetimelike_array(object arr):
443-
# determine if we have a possible datetimelike (or null-like) array
442+
cpdef object infer_datetimelike_array(object arr):
443+
"""
444+
infer if we have a datetime or timedelta array
445+
- date: we have *only* date and maybe strings, nulls
446+
- datetime: we have *only* datetimes and maybe strings, nulls
447+
- timedelta: we have *only* timedeltas and maybe strings, nulls
448+
- nat: we do not have *any* date, datetimes or timedeltas, but do have
449+
at least a NaT
450+
- mixed: other objects (strings or actual objects)
451+
452+
Parameters
453+
----------
454+
arr : object array
455+
456+
Returns
457+
-------
458+
string: {datetime, timedelta, date, nat, mixed}
459+
460+
"""
461+
444462
cdef:
445463
Py_ssize_t i, n = len(arr)
446-
bint seen_timedelta = 0, seen_datetime = 0
464+
bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0
465+
bint seen_nat = 0
466+
list objs = []
447467
object v
448468

449469
for i in range(n):
450470
v = arr[i]
451471
if util.is_string_object(v):
452-
continue
472+
objs.append(v)
473+
474+
if len(objs) == 3:
475+
break
476+
453477
elif util._checknull(v):
454-
continue
455-
elif is_datetime(v):
456-
seen_datetime=1
457-
elif is_timedelta(v):
458-
seen_timedelta=1
478+
# nan or None
479+
pass
480+
elif v is NaT:
481+
seen_nat = 1
482+
elif is_datetime(v) or util.is_datetime64_object(v):
483+
# datetime, or np.datetime64
484+
seen_datetime = 1
485+
elif is_date(v):
486+
seen_date = 1
487+
elif is_timedelta(v) or util.is_timedelta64_object(v):
488+
# timedelta, or timedelta64
489+
seen_timedelta = 1
459490
else:
460-
return False
461-
return seen_datetime or seen_timedelta
491+
return 'mixed'
492+
493+
if seen_date and not (seen_datetime or seen_timedelta):
494+
return 'date'
495+
elif seen_datetime and not seen_timedelta:
496+
return 'datetime'
497+
elif seen_timedelta and not seen_datetime:
498+
return 'timedelta'
499+
elif seen_nat:
500+
return 'nat'
501+
502+
# short-circuit by trying to
503+
# actually convert these strings
504+
# this is for performance as we don't need to try
505+
# convert *every* string array
506+
if len(objs):
507+
try:
508+
tslib.array_to_datetime(objs, errors='raise')
509+
return 'datetime'
510+
except:
511+
pass
512+
513+
# we are *not* going to infer from strings
514+
# for timedelta as too much ambiguity
515+
516+
return 'mixed'
462517

463518

464519
cdef inline bint is_null_datetimelike(v):
465520
# determine if we have a null for a timedelta/datetime (or integer
466-
# versions)x
521+
# versions)
467522
if util._checknull(v):
468523
return True
469524
elif v is NaT:

pandas/tests/frame/test_constructors.py

+9
Original file line numberDiff line numberDiff line change
@@ -1366,6 +1366,15 @@ def test_constructor_with_datetimes(self):
13661366
.reset_index(drop=True), 'b': i_no_tz})
13671367
tm.assert_frame_equal(df, expected)
13681368

1369+
def test_constructor_datetimes_with_nulls(self):
1370+
# gh-15869
1371+
for arr in [np.array([None, None, None, None,
1372+
datetime.now(), None]),
1373+
np.array([None, None, datetime.now(), None])]:
1374+
result = DataFrame(arr).get_dtype_counts()
1375+
expected = Series({'datetime64[ns]': 1})
1376+
tm.assert_series_equal(result, expected)
1377+
13691378
def test_constructor_for_list_with_dtypes(self):
13701379
# TODO(wesm): unused
13711380
intname = np.dtype(np.int_).name # noqa

pandas/tests/frame/test_misc_api.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from numpy.random import randn
1313
import numpy as np
1414

15-
from pandas import DataFrame, Series
15+
from pandas import DataFrame, Series, date_range, timedelta_range
1616
import pandas as pd
1717

1818
from pandas.util.testing import (assert_almost_equal,
@@ -328,6 +328,16 @@ def test_empty_nonzero(self):
328328
self.assertTrue(df.empty)
329329
self.assertTrue(df.T.empty)
330330

331+
def test_with_datetimelikes(self):
332+
333+
df = DataFrame({'A': date_range('20130101', periods=10),
334+
'B': timedelta_range('1 day', periods=10)})
335+
t = df.T
336+
337+
result = t.get_dtype_counts()
338+
expected = Series({'object': 10})
339+
tm.assert_series_equal(result, expected)
340+
331341
def test_inplace_return_self(self):
332342
# re #1893
333343

pandas/tests/series/test_constructors.py

+8
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,14 @@ def test_constructor_datelike_coercion(self):
327327
result = df.loc['216']
328328
self.assertTrue(result.dtype == object)
329329

330+
def test_constructor_datetimes_with_nulls(self):
331+
# gh-15869
332+
for arr in [np.array([None, None, None, None,
333+
datetime.now(), None]),
334+
np.array([None, None, datetime.now(), None])]:
335+
result = Series(arr)
336+
assert result.dtype == 'M8[ns]'
337+
330338
def test_constructor_dtype_datetime64(self):
331339

332340
s = Series(iNaT, dtype='M8[ns]', index=lrange(5))

pandas/types/cast.py

+53-49
Original file line numberDiff line numberDiff line change
@@ -748,8 +748,6 @@ def maybe_infer_to_datetimelike(value, convert_dates=False):
748748
this is pretty strict in that a datetime/timedelta is REQUIRED
749749
in addition to possible nulls/string likes
750750
751-
ONLY strings are NOT datetimelike
752-
753751
Parameters
754752
----------
755753
value : np.array / Series / Index / list-like
@@ -770,64 +768,70 @@ def maybe_infer_to_datetimelike(value, convert_dates=False):
770768
if not is_list_like(v):
771769
v = [v]
772770
v = np.array(v, copy=False)
771+
772+
# we only care about object dtypes
773+
if not is_object_dtype(v):
774+
return value
775+
773776
shape = v.shape
774777
if not v.ndim == 1:
775778
v = v.ravel()
776779

777-
if len(v):
778-
779-
def _try_datetime(v):
780-
# safe coerce to datetime64
781-
try:
782-
v = tslib.array_to_datetime(v, errors='raise')
783-
except ValueError:
780+
if not len(v):
781+
return value
784782

785-
# we might have a sequence of the same-datetimes with tz's
786-
# if so coerce to a DatetimeIndex; if they are not the same,
787-
# then these stay as object dtype
788-
try:
789-
from pandas import to_datetime
790-
return to_datetime(v)
791-
except:
792-
pass
783+
def try_datetime(v):
784+
# safe coerce to datetime64
785+
try:
786+
v = tslib.array_to_datetime(v, errors='raise')
787+
except ValueError:
793788

789+
# we might have a sequence of the same-datetimes with tz's
790+
# if so coerce to a DatetimeIndex; if they are not the same,
791+
# then these stay as object dtype
792+
try:
793+
from pandas import to_datetime
794+
return to_datetime(v)
794795
except:
795796
pass
796797

797-
return v.reshape(shape)
798+
except:
799+
pass
798800

799-
def _try_timedelta(v):
800-
# safe coerce to timedelta64
801+
return v.reshape(shape)
801802

802-
# will try first with a string & object conversion
803-
from pandas import to_timedelta
804-
try:
805-
return to_timedelta(v)._values.reshape(shape)
806-
except:
807-
return v
808-
809-
# do a quick inference for perf
810-
sample = v[:min(3, len(v))]
811-
inferred_type = lib.infer_dtype(sample)
812-
813-
if (inferred_type in ['datetime', 'datetime64'] or
814-
(convert_dates and inferred_type in ['date'])):
815-
value = _try_datetime(v)
816-
elif inferred_type in ['timedelta', 'timedelta64']:
817-
value = _try_timedelta(v)
818-
819-
# It's possible to have nulls intermixed within the datetime or
820-
# timedelta. These will in general have an inferred_type of 'mixed',
821-
# so have to try both datetime and timedelta.
822-
823-
# try timedelta first to avoid spurious datetime conversions
824-
# e.g. '00:00:01' is a timedelta but technically is also a datetime
825-
elif inferred_type in ['mixed']:
826-
827-
if lib.is_possible_datetimelike_array(_ensure_object(v)):
828-
value = _try_timedelta(v)
829-
if lib.infer_dtype(value) in ['mixed']:
830-
value = _try_datetime(v)
803+
def try_timedelta(v):
804+
# safe coerce to timedelta64
805+
806+
# will try first with a string & object conversion
807+
from pandas import to_timedelta
808+
try:
809+
return to_timedelta(v)._values.reshape(shape)
810+
except:
811+
return v
812+
813+
inferred_type = lib.infer_datetimelike_array(_ensure_object(v))
814+
815+
if inferred_type == 'date' and convert_dates:
816+
value = try_datetime(v)
817+
elif inferred_type == 'datetime':
818+
value = try_datetime(v)
819+
elif inferred_type == 'timedelta':
820+
value = try_timedelta(v)
821+
elif inferred_type == 'nat':
822+
823+
# if all NaT, return as datetime
824+
if isnull(v).all():
825+
value = try_datetime(v)
826+
else:
827+
828+
# We have at least a NaT and a string
829+
# try timedelta first to avoid spurious datetime conversions
830+
# e.g. '00:00:01' is a timedelta but
831+
# technically is also a datetime
832+
value = try_timedelta(v)
833+
if lib.infer_dtype(value) in ['mixed']:
834+
value = try_datetime(v)
831835

832836
return value
833837

0 commit comments

Comments
 (0)