From 7fcd4e5f7d9c7cacb80be2775566b187dd53de19 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 4 Apr 2017 16:00:34 -0400 Subject: [PATCH 1/2] BUG: Bug in DataFrame construction with nulls and datetimes in a list-like closes #15869 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/_libs/src/inference.pyx | 80 ++++++++++++++++++++---- pandas/tests/frame/test_constructors.py | 9 +++ pandas/tests/frame/test_misc_api.py | 12 +++- pandas/tests/series/test_constructors.py | 8 +++ pandas/types/cast.py | 28 ++++----- 6 files changed, 111 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 355dceba1b953..2e1cc396287ce 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -997,6 +997,7 @@ Conversion - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`) - Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) +- Bug in ``DataFrame`` construction with nulls and datetimes in a list-like (:issue:`15869`) Indexing ^^^^^^^^ diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index b0fb7048f154c..e9e2eab92f964 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -439,31 +439,87 @@ def infer_dtype(object value): return 'mixed' -cpdef bint is_possible_datetimelike_array(object arr): - # determine if we have a possible datetimelike (or null-like) array +cpdef object infer_datetimelike_array(object arr): + """ + infer if we have a datetime or timedelta array + - date: we have *only* date and myabe strings, nulls + - datetime: we have *only* datetimes and maybe strings, nulls + - timedelta: we have *only* timedeltas and maybe strings, nulls + - nat: we do not have *any* date, datetimes or timedeltas, but do have + at least a NaT + - mixed: other objects (strings or actual objects) + + Parameters + ---------- + arr : object array + + Returns + ------- + string: {datetime, timedelta, date, nat, mixed} + + """ + cdef: Py_ssize_t i, n = len(arr) - bint seen_timedelta = 0, seen_datetime = 0 + bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0 + bint seen_nat = 0 + list objs = [] object v for i in range(n): v = arr[i] if util.is_string_object(v): - continue + objs.append(v) + + if len(objs) == 3: + break + elif util._checknull(v): - continue - elif is_datetime(v): - seen_datetime=1 - elif is_timedelta(v): - seen_timedelta=1 + # nan or None + seen_null = 1 + elif v is NaT: + seen_nat = 1 + elif is_datetime(v) or util.is_datetime64_object(v): + # datetime, or np.datetime64 + seen_datetime = 1 + elif is_date(v): + seen_date = 1 + elif is_timedelta(v) or util.is_timedelta64_object(v): + # timedelta, or timedelta64 + seen_timedelta = 1 else: - return False - return seen_datetime or seen_timedelta + return 'mixed' + + if seen_date and not (seen_datetime or seen_timedelta): + return 'date' + elif seen_datetime and not seen_timedelta: + return 'datetime' + elif seen_timedelta and not seen_datetime: + return 'timedelta' + elif seen_nat: + return 'nat' + + # short-circuit by trying to + # actually convert these strings + # this is for performance as we don't need to try + # convert *every* string array + if len(objs) == 3: + try: + tslib.array_to_datetime(objs, errors='raise') + return 'datetime' + except: + pass + + # we are *not* going to infer from strings + # for timedelta as too much ambiguity + + return 'mixed' + cdef inline bint is_null_datetimelike(v): # determine if we have a null for a timedelta/datetime (or integer - # versions)x + # versions) if util._checknull(v): return True elif v is NaT: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1ab292649a973..6d28d3b4dfcd5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1366,6 +1366,15 @@ def test_constructor_with_datetimes(self): .reset_index(drop=True), 'b': i_no_tz}) tm.assert_frame_equal(df, expected) + def test_constructor_datetimes_with_nulls(self): + # gh-15869 + for arr in [np.array([None, None, None, None, + datetime.now(), None]), + np.array([None, None, datetime.now(), None])]: + result = DataFrame(arr).get_dtype_counts() + expected = Series({'datetime64[ns]': 1}) + tm.assert_series_equal(result, expected) + def test_constructor_for_list_with_dtypes(self): # TODO(wesm): unused intname = np.dtype(np.int_).name # noqa diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 42427df90401d..cf88c0206442d 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -12,7 +12,7 @@ from numpy.random import randn import numpy as np -from pandas import DataFrame, Series +from pandas import DataFrame, Series, date_range, timedelta_range import pandas as pd from pandas.util.testing import (assert_almost_equal, @@ -328,6 +328,16 @@ def test_empty_nonzero(self): self.assertTrue(df.empty) self.assertTrue(df.T.empty) + def test_with_datetimelikes(self): + + df = DataFrame({'A': date_range('20130101', periods=2), + 'B': timedelta_range('1 day', periods=2)}) + t = df.T + + result = t.get_dtype_counts() + expected = Series({'object': 2}) + tm.assert_series_equal(result, expected) + def test_inplace_return_self(self): # re #1893 diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 24e4355fa9f9a..dbe2db67359f3 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -327,6 +327,14 @@ def test_constructor_datelike_coercion(self): result = df.loc['216'] self.assertTrue(result.dtype == object) + def test_constructor_datetimes_with_nulls(self): + # gh-15869 + for arr in [np.array([None, None, None, None, + datetime.now(), None]), + np.array([None, None, datetime.now(), None])]: + result = Series(arr) + assert result.dtype == 'M8[ns]' + def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype='M8[ns]', index=lrange(5)) diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 985e5b9f95831..dd3b700406e0b 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -806,25 +806,25 @@ def _try_timedelta(v): except: return v - # do a quick inference for perf - sample = v[:min(3, len(v))] - inferred_type = lib.infer_dtype(sample) + inferred_type = lib.infer_datetimelike_array(_ensure_object(v)) - if (inferred_type in ['datetime', 'datetime64'] or - (convert_dates and inferred_type in ['date'])): + if inferred_type == 'date' and convert_dates: value = _try_datetime(v) - elif inferred_type in ['timedelta', 'timedelta64']: + elif inferred_type == 'datetime': + value = _try_datetime(v) + elif inferred_type == 'timedelta': value = _try_timedelta(v) + elif inferred_type == 'nat': - # It's possible to have nulls intermixed within the datetime or - # timedelta. These will in general have an inferred_type of 'mixed', - # so have to try both datetime and timedelta. - - # try timedelta first to avoid spurious datetime conversions - # e.g. '00:00:01' is a timedelta but technically is also a datetime - elif inferred_type in ['mixed']: + # if all NaT, return as datetime + if isnull(v).all(): + value = _try_datetime(v) + else: - if lib.is_possible_datetimelike_array(_ensure_object(v)): + # We have at least a NaT and a string + # try timedelta first to avoid spurious datetime conversions + # e.g. '00:00:01' is a timedelta but + # technically is also a datetime value = _try_timedelta(v) if lib.infer_dtype(value) in ['mixed']: value = _try_datetime(v) From 6bf2148645dcf64e0efa170745cb255de8a6a0f2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 4 Apr 2017 17:20:35 -0400 Subject: [PATCH 2/2] fix perf --- pandas/_libs/src/inference.pyx | 5 ++--- pandas/tests/frame/test_misc_api.py | 6 +++--- pandas/types/cast.py | 4 ++++ 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index e9e2eab92f964..905f5278bcfd8 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -442,7 +442,7 @@ def infer_dtype(object value): cpdef object infer_datetimelike_array(object arr): """ infer if we have a datetime or timedelta array - - date: we have *only* date and myabe strings, nulls + - date: we have *only* date and maybe strings, nulls - datetime: we have *only* datetimes and maybe strings, nulls - timedelta: we have *only* timedeltas and maybe strings, nulls - nat: we do not have *any* date, datetimes or timedeltas, but do have @@ -476,7 +476,7 @@ cpdef object infer_datetimelike_array(object arr): elif util._checknull(v): # nan or None - seen_null = 1 + pass elif v is NaT: seen_nat = 1 elif is_datetime(v) or util.is_datetime64_object(v): @@ -516,7 +516,6 @@ cpdef object infer_datetimelike_array(object arr): return 'mixed' - cdef inline bint is_null_datetimelike(v): # determine if we have a null for a timedelta/datetime (or integer # versions) diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index cf88c0206442d..50fa0dca6bf04 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -330,12 +330,12 @@ def test_empty_nonzero(self): def test_with_datetimelikes(self): - df = DataFrame({'A': date_range('20130101', periods=2), - 'B': timedelta_range('1 day', periods=2)}) + df = DataFrame({'A': date_range('20130101', periods=10), + 'B': timedelta_range('1 day', periods=10)}) t = df.T result = t.get_dtype_counts() - expected = Series({'object': 2}) + expected = Series({'object': 10}) tm.assert_series_equal(result, expected) def test_inplace_return_self(self): diff --git a/pandas/types/cast.py b/pandas/types/cast.py index dd3b700406e0b..4180ad1919315 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -774,6 +774,10 @@ def maybe_infer_to_datetimelike(value, convert_dates=False): if not v.ndim == 1: v = v.ravel() + # we only care about object dtypes + if not is_object_dtype(v): + return value + if len(v): def _try_datetime(v):