Skip to content

Commit 7fcd4e5

Browse files
committed
BUG: Bug in DataFrame construction with nulls and datetimes in a list-like
closes #15869
1 parent e50d397 commit 7fcd4e5

File tree

6 files changed

+111
-27
lines changed

6 files changed

+111
-27
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,7 @@ Conversion
997997
- Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`)
998998
- Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`)
999999
- Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)
1000+
- Bug in ``DataFrame`` construction with nulls and datetimes in a list-like (:issue:`15869`)
10001001

10011002
Indexing
10021003
^^^^^^^^

pandas/_libs/src/inference.pyx

+68-12
Original file line numberDiff line numberDiff line change
@@ -439,31 +439,87 @@ def infer_dtype(object value):
439439
return 'mixed'
440440

441441

442-
cpdef bint is_possible_datetimelike_array(object arr):
443-
# determine if we have a possible datetimelike (or null-like) array
442+
cpdef object infer_datetimelike_array(object arr):
443+
"""
444+
infer if we have a datetime or timedelta array
445+
- date: we have *only* date and maybe strings, nulls
446+
- datetime: we have *only* datetimes and maybe strings, nulls
447+
- timedelta: we have *only* timedeltas and maybe strings, nulls
448+
- nat: we do not have *any* date, datetimes or timedeltas, but do have
449+
at least a NaT
450+
- mixed: other objects (strings or actual objects)
451+
452+
Parameters
453+
----------
454+
arr : object array
455+
456+
Returns
457+
-------
458+
string: {datetime, timedelta, date, nat, mixed}
459+
460+
"""
461+
444462
cdef:
445463
Py_ssize_t i, n = len(arr)
446-
bint seen_timedelta = 0, seen_datetime = 0
464+
bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0
465+
bint seen_nat = 0
466+
list objs = []
447467
object v
448468

449469
for i in range(n):
450470
v = arr[i]
451471
if util.is_string_object(v):
452-
continue
472+
objs.append(v)
473+
474+
if len(objs) == 3:
475+
break
476+
453477
elif util._checknull(v):
454-
continue
455-
elif is_datetime(v):
456-
seen_datetime=1
457-
elif is_timedelta(v):
458-
seen_timedelta=1
478+
# nan or None
479+
seen_null = 1
480+
elif v is NaT:
481+
seen_nat = 1
482+
elif is_datetime(v) or util.is_datetime64_object(v):
483+
# datetime, or np.datetime64
484+
seen_datetime = 1
485+
elif is_date(v):
486+
seen_date = 1
487+
elif is_timedelta(v) or util.is_timedelta64_object(v):
488+
# timedelta, or timedelta64
489+
seen_timedelta = 1
459490
else:
460-
return False
461-
return seen_datetime or seen_timedelta
491+
return 'mixed'
492+
493+
if seen_date and not (seen_datetime or seen_timedelta):
494+
return 'date'
495+
elif seen_datetime and not seen_timedelta:
496+
return 'datetime'
497+
elif seen_timedelta and not seen_datetime:
498+
return 'timedelta'
499+
elif seen_nat:
500+
return 'nat'
501+
502+
# short-circuit by trying to
503+
# actually convert these strings
504+
# this is for performance as we don't need to try to
505+
# convert *every* string array
506+
if len(objs) == 3:
507+
try:
508+
tslib.array_to_datetime(objs, errors='raise')
509+
return 'datetime'
510+
except:
511+
pass
512+
513+
# we are *not* going to infer from strings
514+
# for timedelta as too much ambiguity
515+
516+
return 'mixed'
517+
462518

463519

464520
cdef inline bint is_null_datetimelike(v):
465521
# determine if we have a null for a timedelta/datetime (or integer
466-
# versions)x
522+
# versions)
467523
if util._checknull(v):
468524
return True
469525
elif v is NaT:

pandas/tests/frame/test_constructors.py

+9
Original file line numberDiff line numberDiff line change
@@ -1366,6 +1366,15 @@ def test_constructor_with_datetimes(self):
13661366
.reset_index(drop=True), 'b': i_no_tz})
13671367
tm.assert_frame_equal(df, expected)
13681368

1369+
def test_constructor_datetimes_with_nulls(self):
1370+
# gh-15869
1371+
for arr in [np.array([None, None, None, None,
1372+
datetime.now(), None]),
1373+
np.array([None, None, datetime.now(), None])]:
1374+
result = DataFrame(arr).get_dtype_counts()
1375+
expected = Series({'datetime64[ns]': 1})
1376+
tm.assert_series_equal(result, expected)
1377+
13691378
def test_constructor_for_list_with_dtypes(self):
13701379
# TODO(wesm): unused
13711380
intname = np.dtype(np.int_).name # noqa

pandas/tests/frame/test_misc_api.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from numpy.random import randn
1313
import numpy as np
1414

15-
from pandas import DataFrame, Series
15+
from pandas import DataFrame, Series, date_range, timedelta_range
1616
import pandas as pd
1717

1818
from pandas.util.testing import (assert_almost_equal,
@@ -328,6 +328,16 @@ def test_empty_nonzero(self):
328328
self.assertTrue(df.empty)
329329
self.assertTrue(df.T.empty)
330330

331+
def test_with_datetimelikes(self):
332+
333+
df = DataFrame({'A': date_range('20130101', periods=2),
334+
'B': timedelta_range('1 day', periods=2)})
335+
t = df.T
336+
337+
result = t.get_dtype_counts()
338+
expected = Series({'object': 2})
339+
tm.assert_series_equal(result, expected)
340+
331341
def test_inplace_return_self(self):
332342
# re #1893
333343

pandas/tests/series/test_constructors.py

+8
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,14 @@ def test_constructor_datelike_coercion(self):
327327
result = df.loc['216']
328328
self.assertTrue(result.dtype == object)
329329

330+
def test_constructor_datetimes_with_nulls(self):
331+
# gh-15869
332+
for arr in [np.array([None, None, None, None,
333+
datetime.now(), None]),
334+
np.array([None, None, datetime.now(), None])]:
335+
result = Series(arr)
336+
assert result.dtype == 'M8[ns]'
337+
330338
def test_constructor_dtype_datetime64(self):
331339

332340
s = Series(iNaT, dtype='M8[ns]', index=lrange(5))

pandas/types/cast.py

+14-14
Original file line numberDiff line numberDiff line change
@@ -806,25 +806,25 @@ def _try_timedelta(v):
806806
except:
807807
return v
808808

809-
# do a quick inference for perf
810-
sample = v[:min(3, len(v))]
811-
inferred_type = lib.infer_dtype(sample)
809+
inferred_type = lib.infer_datetimelike_array(_ensure_object(v))
812810

813-
if (inferred_type in ['datetime', 'datetime64'] or
814-
(convert_dates and inferred_type in ['date'])):
811+
if inferred_type == 'date' and convert_dates:
815812
value = _try_datetime(v)
816-
elif inferred_type in ['timedelta', 'timedelta64']:
813+
elif inferred_type == 'datetime':
814+
value = _try_datetime(v)
815+
elif inferred_type == 'timedelta':
817816
value = _try_timedelta(v)
817+
elif inferred_type == 'nat':
818818

819-
# It's possible to have nulls intermixed within the datetime or
820-
# timedelta. These will in general have an inferred_type of 'mixed',
821-
# so have to try both datetime and timedelta.
822-
823-
# try timedelta first to avoid spurious datetime conversions
824-
# e.g. '00:00:01' is a timedelta but technically is also a datetime
825-
elif inferred_type in ['mixed']:
819+
# if all NaT, return as datetime
820+
if isnull(v).all():
821+
value = _try_datetime(v)
822+
else:
826823

827-
if lib.is_possible_datetimelike_array(_ensure_object(v)):
824+
# We have at least a NaT and a string
825+
# try timedelta first to avoid spurious datetime conversions
826+
# e.g. '00:00:01' is a timedelta but
827+
# technically is also a datetime
828828
value = _try_timedelta(v)
829829
if lib.infer_dtype(value) in ['mixed']:
830830
value = _try_datetime(v)

0 commit comments

Comments
 (0)