Skip to content

BUG: Series/Index results in datetime/timedelta incorrectly if inputs are all nan/nat like #13477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 11, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -484,8 +484,9 @@ Bug Fixes

- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`)
- Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`)

- Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`)
- Bug in ``Index`` and ``Series`` created with ``NaN`` and ``NaT`` mixed data may not have ``datetime64`` dtype (:issue:`13324`)
- Bug in ``Index`` and ``Series`` may ignore ``np.datetime64('nat')`` and ``np.timedelta64('nat')`` to infer dtype (:issue:`13324`)
- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`)
- Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`)
- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`)
Expand Down
3 changes: 1 addition & 2 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
# don't support boolean explicitly ATM
pass
elif inferred != 'string':
if (inferred.startswith('datetime') or
tslib.is_timestamp_array(subarr)):
if inferred.startswith('datetime'):

if (lib.is_datetime_with_singletz_array(subarr) or
'tz' in kwargs):
Expand Down
72 changes: 57 additions & 15 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def infer_dtype(object _values):
Py_ssize_t i, n
object val
ndarray values
bint seen_pdnat = False, seen_val = False

if isinstance(_values, np.ndarray):
values = _values
Expand Down Expand Up @@ -141,17 +142,34 @@ def infer_dtype(object _values):
values = values.ravel()

# try to use a valid value
for i in range(n):
val = util.get_value_1d(values, i)
if not is_null_datetimelike(val):
break
for i from 0 <= i < n:
val = util.get_value_1d(values, i)

if util.is_datetime64_object(val) or val is NaT:
# do not use is_null_datetimelike to keep
# np.datetime64('nat') and np.timedelta64('nat')
if util._checknull(val):
pass
elif val is NaT:
seen_pdnat = True
else:
seen_val = True
break

# if all values are nan/NaT
if seen_val is False and seen_pdnat is True:
return 'datetime'
# float/object nan is handled in later logic

if util.is_datetime64_object(val):
if is_datetime64_array(values):
return 'datetime64'
elif is_timedelta_or_timedelta64_array(values):
return 'timedelta'

elif is_timedelta(val):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This must be moved here. Otherwise, timedelta and object mixed data is regarded as "mixed-integer".

if is_timedelta_or_timedelta64_array(values):
return 'timedelta'

elif util.is_integer_object(val):
# a timedelta will show true here as well
if is_timedelta(val):
Expand Down Expand Up @@ -200,17 +218,15 @@ def infer_dtype(object _values):
if is_bytes_array(values):
return 'bytes'

elif is_timedelta(val):
if is_timedelta_or_timedelta64_array(values):
return 'timedelta'

elif is_period(val):
if is_period_array(values):
return 'period'

for i in range(n):
val = util.get_value_1d(values, i)
if util.is_integer_object(val):
if (util.is_integer_object(val) and
not util.is_timedelta64_object(val) and
not util.is_datetime64_object(val)):
return 'mixed-integer'

return 'mixed'
Expand All @@ -237,20 +253,46 @@ def is_possible_datetimelike_array(object arr):
return False
return seen_datetime or seen_timedelta


cdef inline bint is_null_datetimelike(v):
# Determine whether v is a null for a timedelta/datetime (or the integer
# iNaT sentinel). Returns True when any of the following holds:
#   - util._checknull(v) is true,
#   - v is the NaT singleton,
#   - v is an np.timedelta64 / np.datetime64 whose int64 view equals iNaT,
#   - v is an integer object equal to iNaT.
# Returns False for everything else.
if util._checknull(v):
return True
elif v is NaT:
return True
elif util.is_timedelta64_object(v):
return v.view('int64') == iNaT
elif util.is_datetime64_object(v):
return v.view('int64') == iNaT
elif util.is_integer_object(v):
return v == iNaT
return False


cdef inline bint is_null_datetime64(v):
# determine if we have a null for a datetime (integer iNaT is not checked here),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need to export these (as in tslib.pxd)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not needed ATM because tslib doesn't use is_null_datetimelike.

# excluding np.timedelta64('nat')
if util._checknull(v):
return True
elif v is NaT:
return True
elif util.is_datetime64_object(v):
return v.view('int64') == iNaT
return False


cdef inline bint is_null_timedelta64(v):
# Determine whether v is a null for a timedelta,
# excluding np.datetime64('nat').
# Returns True when util._checknull(v) is true, when v is the NaT
# singleton, or when v is an np.timedelta64 whose int64 view equals iNaT.
# Unlike is_null_datetimelike there is no datetime64 branch and no
# integer (iNaT) branch here.
if util._checknull(v):
return True
elif v is NaT:
return True
elif util.is_timedelta64_object(v):
return v.view('int64') == iNaT
return False


cdef inline bint is_datetime(object o):
# Thin wrapper: report whether PyDateTime_Check accepts o.
return PyDateTime_Check(o)

Expand Down Expand Up @@ -420,7 +462,7 @@ def is_datetime_array(ndarray[object] values):
# return False for all nulls
for i in range(n):
v = values[i]
if is_null_datetimelike(v):
if is_null_datetime64(v):
# we are a regular null
if util._checknull(v):
null_count += 1
Expand All @@ -437,7 +479,7 @@ def is_datetime64_array(ndarray values):
# return False for all nulls
for i in range(n):
v = values[i]
if is_null_datetimelike(v):
if is_null_datetime64(v):
# we are a regular null
if util._checknull(v):
null_count += 1
Expand Down Expand Up @@ -481,7 +523,7 @@ def is_timedelta_array(ndarray values):
return False
for i in range(n):
v = values[i]
if is_null_datetimelike(v):
if is_null_timedelta64(v):
# we are a regular null
if util._checknull(v):
null_count += 1
Expand All @@ -496,7 +538,7 @@ def is_timedelta64_array(ndarray values):
return False
for i in range(n):
v = values[i]
if is_null_datetimelike(v):
if is_null_timedelta64(v):
# we are a regular null
if util._checknull(v):
null_count += 1
Expand All @@ -512,7 +554,7 @@ def is_timedelta_or_timedelta64_array(ndarray values):
return False
for i in range(n):
v = values[i]
if is_null_datetimelike(v):
if is_null_timedelta64(v):
# we are a regular null
if util._checknull(v):
null_count += 1
Expand Down
2 changes: 1 addition & 1 deletion pandas/src/util.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,4 @@ cdef inline bint _checknan(object val):
return not cnp.PyArray_Check(val) and val != val

cdef inline bint is_period_object(object val):
return getattr(val,'_typ','_typ') == 'period'
return getattr(val, '_typ', '_typ') == 'period'
43 changes: 43 additions & 0 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,49 @@ def __array__(self, dtype=None):
result = pd.Index(ArrayLike(array))
self.assert_index_equal(result, expected)

def test_index_ctor_infer_nan_nat(self):
# GH 13467
# Checks the dtype the Index constructor infers when every element is
# NaN / NaT-like, for both list and ndarray input.

# all float NaN -> Float64Index
exp = pd.Float64Index([np.nan, np.nan])
self.assertEqual(exp.dtype, np.float64)
tm.assert_index_equal(Index([np.nan, np.nan]), exp)
tm.assert_index_equal(Index(np.array([np.nan, np.nan])), exp)

# all pd.NaT -> DatetimeIndex
exp = pd.DatetimeIndex([pd.NaT, pd.NaT])
self.assertEqual(exp.dtype, 'datetime64[ns]')
tm.assert_index_equal(Index([pd.NaT, pd.NaT]), exp)
tm.assert_index_equal(Index(np.array([pd.NaT, pd.NaT])), exp)

# NaN mixed with pd.NaT or np.datetime64('nat'), in either order,
# is expected to give a DatetimeIndex
exp = pd.DatetimeIndex([pd.NaT, pd.NaT])
self.assertEqual(exp.dtype, 'datetime64[ns]')

for data in [[pd.NaT, np.nan], [np.nan, pd.NaT],
[np.nan, np.datetime64('nat')],
[np.datetime64('nat'), np.nan]]:
tm.assert_index_equal(Index(data), exp)
tm.assert_index_equal(Index(np.array(data, dtype=object)), exp)

# np.timedelta64('nat') mixed with NaN or pd.NaT, in either order,
# is expected to give a TimedeltaIndex
exp = pd.TimedeltaIndex([pd.NaT, pd.NaT])
self.assertEqual(exp.dtype, 'timedelta64[ns]')

for data in [[np.nan, np.timedelta64('nat')],
[np.timedelta64('nat'), np.nan],
[pd.NaT, np.timedelta64('nat')],
[np.timedelta64('nat'), pd.NaT]]:

tm.assert_index_equal(Index(data), exp)
tm.assert_index_equal(Index(np.array(data, dtype=object)), exp)

# mixed np.datetime64/timedelta64 nat results in object
data = [np.datetime64('nat'), np.timedelta64('nat')]
exp = pd.Index(data, dtype=object)
tm.assert_index_equal(Index(data), exp)
tm.assert_index_equal(Index(np.array(data, dtype=object)), exp)

# same check with the two NaT flavours in the opposite order
data = [np.timedelta64('nat'), np.datetime64('nat')]
exp = pd.Index(data, dtype=object)
tm.assert_index_equal(Index(data), exp)
tm.assert_index_equal(Index(np.array(data, dtype=object)), exp)

def test_index_ctor_infer_periodindex(self):
xp = period_range('2012-1-1', freq='M', periods=3)
rs = Index(xp)
Expand Down
21 changes: 20 additions & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,24 @@ def test_constructor_pass_none(self):
expected = Series(index=Index([None]))
assert_series_equal(s, expected)

def test_constructor_pass_nan_nat(self):
# GH 13467
# Checks the dtype the Series constructor infers when every element is
# NaN or pd.NaT, for both list and ndarray input.

# all float NaN -> float64
exp = Series([np.nan, np.nan], dtype=np.float64)
self.assertEqual(exp.dtype, np.float64)
tm.assert_series_equal(Series([np.nan, np.nan]), exp)
tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp)

# all pd.NaT -> datetime64[ns]
exp = Series([pd.NaT, pd.NaT])
self.assertEqual(exp.dtype, 'datetime64[ns]')
tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp)
tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp)

# pd.NaT followed by NaN is expected to match the all-NaT result
tm.assert_series_equal(Series([pd.NaT, np.nan]), exp)
tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp)

# the same holds with NaN first
tm.assert_series_equal(Series([np.nan, pd.NaT]), exp)
tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp)

def test_constructor_cast(self):
self.assertRaises(ValueError, Series, ['a', 'b', 'c'], dtype=float)

Expand Down Expand Up @@ -688,8 +706,9 @@ def test_constructor_dtype_timedelta64(self):
td = Series([np.timedelta64(300000000), pd.NaT])
self.assertEqual(td.dtype, 'timedelta64[ns]')

# because iNaT is int, not coerced to timedelta
td = Series([np.timedelta64(300000000), tslib.iNaT])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh I c, this is essentially an invalid test because of the integer. oh then.

self.assertEqual(td.dtype, 'timedelta64[ns]')
self.assertEqual(td.dtype, 'object')

td = Series([np.timedelta64(300000000), np.nan])
self.assertEqual(td.dtype, 'timedelta64[ns]')
Expand Down
Loading