diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 70c466ed51681..39cb4a26d413c 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -484,8 +484,9 @@ Bug Fixes - Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) - Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) - - Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`) +- Bug in ``Index`` and ``Series`` created with ``NaN`` and ``NaT`` mixed data may not have ``datetime64`` dtype (:issue:`13324`) +- Bug in ``Index`` and ``Series`` may ignore ``np.datetime64('nat')`` and ``np.timdelta64('nat')`` to infer dtype (:issue:`13324`) - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) - Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) - Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index ad27010714f63..2f0691f1d1bb0 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -242,8 +242,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # don't support boolean explicity ATM pass elif inferred != 'string': - if (inferred.startswith('datetime') or - tslib.is_timestamp_array(subarr)): + if inferred.startswith('datetime'): if (lib.is_datetime_with_singletz_array(subarr) or 'tz' in kwargs): diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 234ac7ea2c60c..9f96037c97c62 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -103,6 +103,7 @@ def infer_dtype(object _values): Py_ssize_t i, n object val ndarray values + bint seen_pdnat = False, seen_val = False if isinstance(_values, np.ndarray): values = _values @@ -141,17 +142,34 @@ def infer_dtype(object _values): values = values.ravel() # try to use a valid value - for i in range(n): - val = util.get_value_1d(values, i) - if not is_null_datetimelike(val): - break + for i from 0 <= i < n: + val = util.get_value_1d(values, i) - if util.is_datetime64_object(val) or val is NaT: + # do not use is_nul_datetimelike to keep + # np.datetime64('nat') and np.timedelta64('nat') + if util._checknull(val): + pass + elif val is NaT: + seen_pdnat = True + else: + seen_val = True + break + + # if all values are nan/NaT + if seen_val is False and seen_pdnat is True: + return 'datetime' + # float/object nan is handled in latter logic + + if util.is_datetime64_object(val): if is_datetime64_array(values): return 'datetime64' elif is_timedelta_or_timedelta64_array(values): return 'timedelta' + elif is_timedelta(val): + if is_timedelta_or_timedelta64_array(values): + return 'timedelta' + elif util.is_integer_object(val): # a timedelta will show true here as well if is_timedelta(val): @@ -200,17 +218,15 @@ def infer_dtype(object _values): if is_bytes_array(values): return 'bytes' - elif is_timedelta(val): - if is_timedelta_or_timedelta64_array(values): - return 'timedelta' - elif is_period(val): if is_period_array(values): return 'period' for i in range(n): val = util.get_value_1d(values, i) - if util.is_integer_object(val): + if (util.is_integer_object(val) and + not util.is_timedelta64_object(val) and + not util.is_datetime64_object(val)): return 'mixed-integer' return 'mixed' @@ -237,20 +253,46 @@ def is_possible_datetimelike_array(object arr): return False return seen_datetime or seen_timedelta + cdef inline bint is_null_datetimelike(v): # determine if we have a null for a timedelta/datetime (or integer versions)x if util._checknull(v): return True + elif v is NaT: + return True elif util.is_timedelta64_object(v): return v.view('int64') == iNaT elif util.is_datetime64_object(v): return v.view('int64') == iNaT elif util.is_integer_object(v): return v == iNaT + return False + + +cdef inline bint is_null_datetime64(v): + # determine if we have a null for a datetime (or integer versions)x, + # excluding np.timedelta64('nat') + if util._checknull(v): + return True + elif v is NaT: + return True + elif util.is_datetime64_object(v): + return v.view('int64') == iNaT + return False + + +cdef inline bint is_null_timedelta64(v): + # determine if we have a null for a timedelta (or integer versions)x, + # excluding np.datetime64('nat') + if util._checknull(v): + return True elif v is NaT: return True + elif util.is_timedelta64_object(v): + return v.view('int64') == iNaT return False + cdef inline bint is_datetime(object o): return PyDateTime_Check(o) @@ -420,7 +462,7 @@ def is_datetime_array(ndarray[object] values): # return False for all nulls for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_datetime64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -437,7 +479,7 @@ def is_datetime64_array(ndarray values): # return False for all nulls for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_datetime64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -481,7 +523,7 @@ def is_timedelta_array(ndarray values): return False for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_timedelta64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -496,7 +538,7 @@ def is_timedelta64_array(ndarray values): return False for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_timedelta64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -512,7 +554,7 @@ def is_timedelta_or_timedelta64_array(ndarray values): return False for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_timedelta64(v): # we are a regular null if util._checknull(v): null_count += 1 diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd index 96a23a91cc7c2..fcb5583a0a6e7 100644 --- a/pandas/src/util.pxd +++ b/pandas/src/util.pxd @@ -98,4 +98,4 @@ cdef inline bint _checknan(object val): return not cnp.PyArray_Check(val) and val != val cdef inline bint is_period_object(object val): - return getattr(val,'_typ','_typ') == 'period' + return getattr(val, '_typ', '_typ') == 'period' diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index d535eaa238567..67869901b068e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -203,6 +203,49 @@ def __array__(self, dtype=None): result = pd.Index(ArrayLike(array)) self.assert_index_equal(result, expected) + def test_index_ctor_infer_nan_nat(self): + # GH 13467 + exp = pd.Float64Index([np.nan, np.nan]) + self.assertEqual(exp.dtype, np.float64) + tm.assert_index_equal(Index([np.nan, np.nan]), exp) + tm.assert_index_equal(Index(np.array([np.nan, np.nan])), exp) + + exp = pd.DatetimeIndex([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'datetime64[ns]') + tm.assert_index_equal(Index([pd.NaT, pd.NaT]), exp) + tm.assert_index_equal(Index(np.array([pd.NaT, pd.NaT])), exp) + + exp = pd.DatetimeIndex([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'datetime64[ns]') + + for data in [[pd.NaT, np.nan], [np.nan, pd.NaT], + [np.nan, np.datetime64('nat')], + [np.datetime64('nat'), np.nan]]: + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + + exp = pd.TimedeltaIndex([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'timedelta64[ns]') + + for data in [[np.nan, np.timedelta64('nat')], + [np.timedelta64('nat'), np.nan], + [pd.NaT, np.timedelta64('nat')], + [np.timedelta64('nat'), pd.NaT]]: + + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + + # mixed np.datetime64/timedelta64 nat results in object + data = [np.datetime64('nat'), np.timedelta64('nat')] + exp = pd.Index(data, dtype=object) + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + + data = [np.timedelta64('nat'), np.datetime64('nat')] + exp = pd.Index(data, dtype=object) + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + def test_index_ctor_infer_periodindex(self): xp = period_range('2012-1-1', freq='M', periods=3) rs = Index(xp) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index c632704b7c5eb..2a7e8a957977f 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -252,6 +252,24 @@ def test_constructor_pass_none(self): expected = Series(index=Index([None])) assert_series_equal(s, expected) + def test_constructor_pass_nan_nat(self): + # GH 13467 + exp = Series([np.nan, np.nan], dtype=np.float64) + self.assertEqual(exp.dtype, np.float64) + tm.assert_series_equal(Series([np.nan, np.nan]), exp) + tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) + + exp = Series([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'datetime64[ns]') + tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) + tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) + + tm.assert_series_equal(Series([pd.NaT, np.nan]), exp) + tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp) + + tm.assert_series_equal(Series([np.nan, pd.NaT]), exp) + tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) + def test_constructor_cast(self): self.assertRaises(ValueError, Series, ['a', 'b', 'c'], dtype=float) @@ -688,8 +706,9 @@ def test_constructor_dtype_timedelta64(self): td = Series([np.timedelta64(300000000), pd.NaT]) self.assertEqual(td.dtype, 'timedelta64[ns]') + # because iNaT is int, not coerced to timedelta td = Series([np.timedelta64(300000000), tslib.iNaT]) - self.assertEqual(td.dtype, 'timedelta64[ns]') + self.assertEqual(td.dtype, 'object') td = Series([np.timedelta64(300000000), np.nan]) self.assertEqual(td.dtype, 'timedelta64[ns]') diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py index a6941369b35be..5f016322f101f 100644 --- a/pandas/tests/test_infer_and_convert.py +++ b/pandas/tests/test_infer_and_convert.py @@ -180,6 +180,207 @@ def test_datetime(self): index = Index(dates) self.assertEqual(index.inferred_type, 'datetime64') + def test_infer_dtype_datetime(self): + + arr = np.array([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.datetime64('2011-01-01'), + np.datetime64('2011-01-01')], dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Timestamp('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, np.datetime64('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + arr = np.array([n, datetime(2011, 1, 1)]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, pd.Timestamp('2011-01-02'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, np.datetime64('2011-01-02'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + arr = np.array([n, datetime(2011, 1, 1), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + # different type of nat + arr = np.array([np.timedelta64('nat'), + np.datetime64('2011-01-02')], dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.datetime64('2011-01-02'), + np.timedelta64('nat')], dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + # mixed datetime + arr = np.array([datetime(2011, 1, 1), + pd.Timestamp('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + # should be datetime? + arr = np.array([np.datetime64('2011-01-01'), + pd.Timestamp('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([pd.Timestamp('2011-01-02'), + np.datetime64('2011-01-01')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed-integer') + + arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_timedelta(self): + + arr = np.array([pd.Timedelta('1 days'), + pd.Timedelta('2 days')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([np.timedelta64(1, 'D'), + np.timedelta64(2, 'D')], dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([timedelta(1), timedelta(2)]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Timedelta('1 days')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, np.timedelta64(1, 'D')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, timedelta(1)]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, pd.Timedelta('1 days'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, np.timedelta64(1, 'D'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, timedelta(1), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + # different type of nat + arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_all_nan_nat_like(self): + arr = np.array([np.nan, np.nan]) + self.assertEqual(pd.lib.infer_dtype(arr), 'floating') + + # nan and None mix are result in mixed + arr = np.array([np.nan, np.nan, None]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([None, np.nan, np.nan]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + # pd.NaT + arr = np.array([pd.NaT]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([pd.NaT, np.nan]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.nan, pd.NaT]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.nan, pd.NaT, np.nan]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + arr = np.array([None, pd.NaT, None]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime') + + # np.datetime64(nat) + arr = np.array([np.datetime64('nat')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.datetime64('nat'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + arr = np.array([pd.NaT, n, np.datetime64('nat'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'datetime64') + + arr = np.array([np.timedelta64('nat')], dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.timedelta64('nat'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + arr = np.array([pd.NaT, n, np.timedelta64('nat'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'timedelta') + + # datetime / timedelta mixed + arr = np.array([pd.NaT, np.datetime64('nat'), + np.timedelta64('nat'), np.nan]) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.timedelta64('nat'), np.datetime64('nat')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + def test_is_datetimelike_array_all_nan_nat_like(self): + arr = np.array([np.nan, pd.NaT, np.datetime64('nat')]) + self.assertTrue(pd.lib.is_datetime_array(arr)) + self.assertTrue(pd.lib.is_datetime64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_array(arr)) + self.assertFalse(pd.lib.is_timedelta64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')]) + self.assertFalse(pd.lib.is_datetime_array(arr)) + self.assertFalse(pd.lib.is_datetime64_array(arr)) + self.assertTrue(pd.lib.is_timedelta_array(arr)) + self.assertTrue(pd.lib.is_timedelta64_array(arr)) + self.assertTrue(pd.lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT, np.datetime64('nat'), + np.timedelta64('nat')]) + self.assertFalse(pd.lib.is_datetime_array(arr)) + self.assertFalse(pd.lib.is_datetime64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_array(arr)) + self.assertFalse(pd.lib.is_timedelta64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT]) + self.assertTrue(pd.lib.is_datetime_array(arr)) + self.assertTrue(pd.lib.is_datetime64_array(arr)) + self.assertTrue(pd.lib.is_timedelta_array(arr)) + self.assertTrue(pd.lib.is_timedelta64_array(arr)) + self.assertTrue(pd.lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, np.nan], dtype=object) + self.assertFalse(pd.lib.is_datetime_array(arr)) + self.assertFalse(pd.lib.is_datetime64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_array(arr)) + self.assertFalse(pd.lib.is_timedelta64_array(arr)) + self.assertFalse(pd.lib.is_timedelta_or_timedelta64_array(arr)) + def test_date(self): dates = [date(2012, 1, x) for x in range(1, 20)] @@ -244,6 +445,13 @@ def test_categorical(self): result = lib.infer_dtype(Series(arr)) self.assertEqual(result, 'categorical') + def test_is_period(self): + self.assertTrue(lib.is_period(pd.Period('2011-01', freq='M'))) + self.assertFalse(lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))) + self.assertFalse(lib.is_period(pd.Timestamp('2011-01'))) + self.assertFalse(lib.is_period(1)) + self.assertFalse(lib.is_period(np.nan)) + class TestConvert(tm.TestCase): @@ -437,6 +645,7 @@ def test_convert_downcast_int64(self): result = lib.downcast_int64(arr, na_values) self.assert_numpy_array_equal(result, expected) + if __name__ == '__main__': import nose diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 62f8b10e3eea2..fe4de11864522 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -843,15 +843,6 @@ cdef _tz_format(object obj, object zone): except: return ', tz=%s' % zone -def is_timestamp_array(ndarray[object] values): - cdef int i, n = len(values) - if n == 0: - return False - for i in range(n): - if not is_timestamp(values[i]): - return False - return True - cpdef object get_value_box(ndarray arr, object loc): cdef: @@ -957,6 +948,7 @@ cdef str _NDIM_STRING = "ndim" # (see Timestamp class above). This will serve as a C extension type that # shadows the python class, where we do any heavy lifting. cdef class _Timestamp(datetime): + cdef readonly: int64_t value, nanosecond object freq # frequency reference