Skip to content

BUG: Bug in DataFrame construction with nulls and datetimes in a list-like #15892

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,7 @@ Conversion
- Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`)
- Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`)
- Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)
- Bug in ``DataFrame`` construction with nulls and datetimes in a list-like (:issue:`15869`)

Indexing
^^^^^^^^
Expand Down
79 changes: 67 additions & 12 deletions pandas/_libs/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -439,31 +439,86 @@ def infer_dtype(object value):
return 'mixed'


# NOTE(review): this span is a diff hunk whose +/- markers and indentation
# were lost. Lines of what appears to be the removed
# `is_possible_datetimelike_array` (bint result) and the added
# `infer_datetimelike_array` (string result) are interleaved below, so the
# text cannot be read as a single function as-is.
cpdef bint is_possible_datetimelike_array(object arr):
# determine if we have a possible datetimelike (or null-like) array
cpdef object infer_datetimelike_array(object arr):
"""
infer if we have a datetime or timedelta array
- date: we have *only* date and maybe strings, nulls
- datetime: we have *only* datetimes and maybe strings, nulls
- timedelta: we have *only* timedeltas and maybe strings, nulls
- nat: we do not have *any* date, datetimes or timedeltas, but do have
at least a NaT
- mixed: other objects (strings or actual objects)

Parameters
----------
arr : object array

Returns
-------
string: {datetime, timedelta, date, nat, mixed}

"""

cdef:
Py_ssize_t i, n = len(arr)
bint seen_timedelta = 0, seen_datetime = 0
bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0
bint seen_nat = 0
list objs = []
object v

for i in range(n):
v = arr[i]
if util.is_string_object(v):
continue
objs.append(v)

# only the first three strings are collected; the loop stops there and the
# sample is tried against tslib.array_to_datetime further down
if len(objs) == 3:
break

elif util._checknull(v):
continue
elif is_datetime(v):
seen_datetime=1
elif is_timedelta(v):
seen_timedelta=1
# nan or None
pass
elif v is NaT:
seen_nat = 1
elif is_datetime(v) or util.is_datetime64_object(v):
# datetime, or np.datetime64
seen_datetime = 1
elif is_date(v):
seen_date = 1
elif is_timedelta(v) or util.is_timedelta64_object(v):
# timedelta, or timedelta64
seen_timedelta = 1
else:
return False
return seen_datetime or seen_timedelta
return 'mixed'

if seen_date and not (seen_datetime or seen_timedelta):
return 'date'
elif seen_datetime and not seen_timedelta:
return 'datetime'
elif seen_timedelta and not seen_datetime:
return 'timedelta'
elif seen_nat:
return 'nat'

# short-circuit by trying to
# actually convert these strings
# this is for performance as we don't need to try
# convert *every* string array
if len(objs) == 3:
try:
tslib.array_to_datetime(objs, errors='raise')
return 'datetime'
# NOTE(review): bare except also traps KeyboardInterrupt/SystemExit;
# any failure here simply falls through to the final 'mixed' result
except:
pass

# we are *not* going to infer from strings
# for timedelta as too much ambiguity

return 'mixed'


cdef inline bint is_null_datetimelike(v):
# determine if we have a null for a timedelta/datetime (or integer
# versions)x
# versions)
if util._checknull(v):
return True
elif v is NaT:
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1366,6 +1366,15 @@ def test_constructor_with_datetimes(self):
.reset_index(drop=True), 'b': i_no_tz})
tm.assert_frame_equal(df, expected)

def test_constructor_datetimes_with_nulls(self):
    # gh-15869
    # a single datetime among None entries: the frame built from each
    # array is expected to report exactly one datetime64[ns] column
    candidates = (
        np.array([None, None, None, None, datetime.now(), None]),
        np.array([None, None, datetime.now(), None]),
    )
    expected = Series({'datetime64[ns]': 1})
    for values in candidates:
        counts = DataFrame(values).get_dtype_counts()
        tm.assert_series_equal(counts, expected)

def test_constructor_for_list_with_dtypes(self):
# TODO(wesm): unused
intname = np.dtype(np.int_).name # noqa
Expand Down
12 changes: 11 additions & 1 deletion pandas/tests/frame/test_misc_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from numpy.random import randn
import numpy as np

from pandas import DataFrame, Series
from pandas import DataFrame, Series, date_range, timedelta_range
import pandas as pd

from pandas.util.testing import (assert_almost_equal,
Expand Down Expand Up @@ -328,6 +328,16 @@ def test_empty_nonzero(self):
self.assertTrue(df.empty)
self.assertTrue(df.T.empty)

def test_with_datetimelikes(self):
    # one datetime column and one timedelta column, ten rows each
    frame = DataFrame({'A': date_range('20130101', periods=10),
                       'B': timedelta_range('1 day', periods=10)})

    # after transposing, the dtype counts are expected to show ten
    # object columns
    dtype_counts = frame.T.get_dtype_counts()
    tm.assert_series_equal(dtype_counts, Series({'object': 10}))

def test_inplace_return_self(self):
# re #1893

Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,14 @@ def test_constructor_datelike_coercion(self):
result = df.loc['216']
self.assertTrue(result.dtype == object)

def test_constructor_datetimes_with_nulls(self):
    # gh-15869
    # a single datetime among None entries: each Series is expected to
    # come out with dtype M8[ns]
    candidates = (
        np.array([None, None, None, None, datetime.now(), None]),
        np.array([None, None, datetime.now(), None]),
    )
    for values in candidates:
        built = Series(values)
        assert built.dtype == 'M8[ns]'

def test_constructor_dtype_datetime64(self):

s = Series(iNaT, dtype='M8[ns]', index=lrange(5))
Expand Down
32 changes: 18 additions & 14 deletions pandas/types/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,10 @@ def maybe_infer_to_datetimelike(value, convert_dates=False):
if not v.ndim == 1:
v = v.ravel()

# we only care about object dtypes
if not is_object_dtype(v):
return value

if len(v):

def _try_datetime(v):
Expand Down Expand Up @@ -806,25 +810,25 @@ def _try_timedelta(v):
except:
return v

# do a quick inference for perf
sample = v[:min(3, len(v))]
inferred_type = lib.infer_dtype(sample)
inferred_type = lib.infer_datetimelike_array(_ensure_object(v))

if (inferred_type in ['datetime', 'datetime64'] or
(convert_dates and inferred_type in ['date'])):
if inferred_type == 'date' and convert_dates:
value = _try_datetime(v)
elif inferred_type == 'datetime':
value = _try_datetime(v)
elif inferred_type in ['timedelta', 'timedelta64']:
elif inferred_type == 'timedelta':
value = _try_timedelta(v)
elif inferred_type == 'nat':

# It's possible to have nulls intermixed within the datetime or
# timedelta. These will in general have an inferred_type of 'mixed',
# so have to try both datetime and timedelta.

# try timedelta first to avoid spurious datetime conversions
# e.g. '00:00:01' is a timedelta but technically is also a datetime
elif inferred_type in ['mixed']:
# if all NaT, return as datetime
if isnull(v).all():
value = _try_datetime(v)
else:

if lib.is_possible_datetimelike_array(_ensure_object(v)):
# We have at least a NaT and a string
# try timedelta first to avoid spurious datetime conversions
# e.g. '00:00:01' is a timedelta but
# technically is also a datetime
value = _try_timedelta(v)
if lib.infer_dtype(value) in ['mixed']:
value = _try_datetime(v)
Expand Down