Skip to content

Commit 5605f99

Browse files
sinhrks authored and jorisvandenbossche committed
CLN/TST: Add tests for nan/nat mixed input (#13477)
closes #13467
1 parent 2e8c993 commit 5605f99

File tree

8 files changed

+334
-29
lines changed

8 files changed

+334
-29
lines changed

doc/source/whatsnew/v0.19.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -484,8 +484,9 @@ Bug Fixes
484484

485485
- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`)
486486
- Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`)
487-
488487
- Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`)
488+
- Bug in ``Index`` and ``Series`` created with ``NaN`` and ``NaT`` mixed data may not have ``datetime64`` dtype (:issue:`13324`)
489+
- Bug in ``Index`` and ``Series`` may ignore ``np.datetime64('nat')`` and ``np.timedelta64('nat')`` when inferring dtype (:issue:`13324`)
489490
- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`)
490491
- Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`)
491492
- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`)

pandas/indexes/base.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -243,8 +243,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
243243
# don't support boolean explicitly ATM
244244
pass
245245
elif inferred != 'string':
246-
if (inferred.startswith('datetime') or
247-
tslib.is_timestamp_array(subarr)):
246+
if inferred.startswith('datetime'):
248247

249248
if (lib.is_datetime_with_singletz_array(subarr) or
250249
'tz' in kwargs):

pandas/src/inference.pyx

+57-15
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def infer_dtype(object _values):
103103
Py_ssize_t i, n
104104
object val
105105
ndarray values
106+
bint seen_pdnat = False, seen_val = False
106107

107108
if isinstance(_values, np.ndarray):
108109
values = _values
@@ -141,17 +142,34 @@ def infer_dtype(object _values):
141142
values = values.ravel()
142143

143144
# try to use a valid value
144-
for i in range(n):
145-
val = util.get_value_1d(values, i)
146-
if not is_null_datetimelike(val):
147-
break
145+
for i from 0 <= i < n:
146+
val = util.get_value_1d(values, i)
148147

149-
if util.is_datetime64_object(val) or val is NaT:
148+
# do not use is_null_datetimelike to keep
149+
# np.datetime64('nat') and np.timedelta64('nat')
150+
if util._checknull(val):
151+
pass
152+
elif val is NaT:
153+
seen_pdnat = True
154+
else:
155+
seen_val = True
156+
break
157+
158+
# if all values are nan/NaT
159+
if seen_val is False and seen_pdnat is True:
160+
return 'datetime'
161+
# float/object nan is handled in later logic
162+
163+
if util.is_datetime64_object(val):
150164
if is_datetime64_array(values):
151165
return 'datetime64'
152166
elif is_timedelta_or_timedelta64_array(values):
153167
return 'timedelta'
154168

169+
elif is_timedelta(val):
170+
if is_timedelta_or_timedelta64_array(values):
171+
return 'timedelta'
172+
155173
elif util.is_integer_object(val):
156174
# a timedelta will show true here as well
157175
if is_timedelta(val):
@@ -200,17 +218,15 @@ def infer_dtype(object _values):
200218
if is_bytes_array(values):
201219
return 'bytes'
202220

203-
elif is_timedelta(val):
204-
if is_timedelta_or_timedelta64_array(values):
205-
return 'timedelta'
206-
207221
elif is_period(val):
208222
if is_period_array(values):
209223
return 'period'
210224

211225
for i in range(n):
212226
val = util.get_value_1d(values, i)
213-
if util.is_integer_object(val):
227+
if (util.is_integer_object(val) and
228+
not util.is_timedelta64_object(val) and
229+
not util.is_datetime64_object(val)):
214230
return 'mixed-integer'
215231

216232
return 'mixed'
@@ -237,20 +253,46 @@ def is_possible_datetimelike_array(object arr):
237253
return False
238254
return seen_datetime or seen_timedelta
239255

256+
240257
cdef inline bint is_null_datetimelike(v):
241258
# determine if we have a null for a timedelta/datetime (or integer versions)
242259
if util._checknull(v):
243260
return True
261+
elif v is NaT:
262+
return True
244263
elif util.is_timedelta64_object(v):
245264
return v.view('int64') == iNaT
246265
elif util.is_datetime64_object(v):
247266
return v.view('int64') == iNaT
248267
elif util.is_integer_object(v):
249268
return v == iNaT
269+
return False
270+
271+
272+
cdef inline bint is_null_datetime64(v):
273+
# determine if we have a null for a datetime (or integer versions),
274+
# excluding np.timedelta64('nat')
275+
if util._checknull(v):
276+
return True
277+
elif v is NaT:
278+
return True
279+
elif util.is_datetime64_object(v):
280+
return v.view('int64') == iNaT
281+
return False
282+
283+
284+
cdef inline bint is_null_timedelta64(v):
285+
# determine if we have a null for a timedelta (or integer versions),
286+
# excluding np.datetime64('nat')
287+
if util._checknull(v):
288+
return True
250289
elif v is NaT:
251290
return True
291+
elif util.is_timedelta64_object(v):
292+
return v.view('int64') == iNaT
252293
return False
253294

295+
254296
cdef inline bint is_datetime(object o):
255297
return PyDateTime_Check(o)
256298

@@ -420,7 +462,7 @@ def is_datetime_array(ndarray[object] values):
420462
# return False for all nulls
421463
for i in range(n):
422464
v = values[i]
423-
if is_null_datetimelike(v):
465+
if is_null_datetime64(v):
424466
# we are a regular null
425467
if util._checknull(v):
426468
null_count += 1
@@ -437,7 +479,7 @@ def is_datetime64_array(ndarray values):
437479
# return False for all nulls
438480
for i in range(n):
439481
v = values[i]
440-
if is_null_datetimelike(v):
482+
if is_null_datetime64(v):
441483
# we are a regular null
442484
if util._checknull(v):
443485
null_count += 1
@@ -481,7 +523,7 @@ def is_timedelta_array(ndarray values):
481523
return False
482524
for i in range(n):
483525
v = values[i]
484-
if is_null_datetimelike(v):
526+
if is_null_timedelta64(v):
485527
# we are a regular null
486528
if util._checknull(v):
487529
null_count += 1
@@ -496,7 +538,7 @@ def is_timedelta64_array(ndarray values):
496538
return False
497539
for i in range(n):
498540
v = values[i]
499-
if is_null_datetimelike(v):
541+
if is_null_timedelta64(v):
500542
# we are a regular null
501543
if util._checknull(v):
502544
null_count += 1
@@ -512,7 +554,7 @@ def is_timedelta_or_timedelta64_array(ndarray values):
512554
return False
513555
for i in range(n):
514556
v = values[i]
515-
if is_null_datetimelike(v):
557+
if is_null_timedelta64(v):
516558
# we are a regular null
517559
if util._checknull(v):
518560
null_count += 1

pandas/src/util.pxd

+1-1
Original file line numberDiff line numberDiff line change
@@ -98,4 +98,4 @@ cdef inline bint _checknan(object val):
9898
return not cnp.PyArray_Check(val) and val != val
9999

100100
cdef inline bint is_period_object(object val):
101-
return getattr(val,'_typ','_typ') == 'period'
101+
return getattr(val, '_typ', '_typ') == 'period'

pandas/tests/indexes/test_base.py

+43
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,49 @@ def __array__(self, dtype=None):
203203
result = pd.Index(ArrayLike(array))
204204
self.assert_index_equal(result, expected)
205205

206+
def test_index_ctor_infer_nan_nat(self):
207+
# GH 13467
208+
exp = pd.Float64Index([np.nan, np.nan])
209+
self.assertEqual(exp.dtype, np.float64)
210+
tm.assert_index_equal(Index([np.nan, np.nan]), exp)
211+
tm.assert_index_equal(Index(np.array([np.nan, np.nan])), exp)
212+
213+
exp = pd.DatetimeIndex([pd.NaT, pd.NaT])
214+
self.assertEqual(exp.dtype, 'datetime64[ns]')
215+
tm.assert_index_equal(Index([pd.NaT, pd.NaT]), exp)
216+
tm.assert_index_equal(Index(np.array([pd.NaT, pd.NaT])), exp)
217+
218+
exp = pd.DatetimeIndex([pd.NaT, pd.NaT])
219+
self.assertEqual(exp.dtype, 'datetime64[ns]')
220+
221+
for data in [[pd.NaT, np.nan], [np.nan, pd.NaT],
222+
[np.nan, np.datetime64('nat')],
223+
[np.datetime64('nat'), np.nan]]:
224+
tm.assert_index_equal(Index(data), exp)
225+
tm.assert_index_equal(Index(np.array(data, dtype=object)), exp)
226+
227+
exp = pd.TimedeltaIndex([pd.NaT, pd.NaT])
228+
self.assertEqual(exp.dtype, 'timedelta64[ns]')
229+
230+
for data in [[np.nan, np.timedelta64('nat')],
231+
[np.timedelta64('nat'), np.nan],
232+
[pd.NaT, np.timedelta64('nat')],
233+
[np.timedelta64('nat'), pd.NaT]]:
234+
235+
tm.assert_index_equal(Index(data), exp)
236+
tm.assert_index_equal(Index(np.array(data, dtype=object)), exp)
237+
238+
# mixed np.datetime64/timedelta64 nat results in object
239+
data = [np.datetime64('nat'), np.timedelta64('nat')]
240+
exp = pd.Index(data, dtype=object)
241+
tm.assert_index_equal(Index(data), exp)
242+
tm.assert_index_equal(Index(np.array(data, dtype=object)), exp)
243+
244+
data = [np.timedelta64('nat'), np.datetime64('nat')]
245+
exp = pd.Index(data, dtype=object)
246+
tm.assert_index_equal(Index(data), exp)
247+
tm.assert_index_equal(Index(np.array(data, dtype=object)), exp)
248+
206249
def test_index_ctor_infer_periodindex(self):
207250
xp = period_range('2012-1-1', freq='M', periods=3)
208251
rs = Index(xp)

pandas/tests/series/test_constructors.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,24 @@ def test_constructor_pass_none(self):
252252
expected = Series(index=Index([None]))
253253
assert_series_equal(s, expected)
254254

255+
def test_constructor_pass_nan_nat(self):
256+
# GH 13467
257+
exp = Series([np.nan, np.nan], dtype=np.float64)
258+
self.assertEqual(exp.dtype, np.float64)
259+
tm.assert_series_equal(Series([np.nan, np.nan]), exp)
260+
tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp)
261+
262+
exp = Series([pd.NaT, pd.NaT])
263+
self.assertEqual(exp.dtype, 'datetime64[ns]')
264+
tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp)
265+
tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp)
266+
267+
tm.assert_series_equal(Series([pd.NaT, np.nan]), exp)
268+
tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp)
269+
270+
tm.assert_series_equal(Series([np.nan, pd.NaT]), exp)
271+
tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp)
272+
255273
def test_constructor_cast(self):
256274
self.assertRaises(ValueError, Series, ['a', 'b', 'c'], dtype=float)
257275

@@ -688,8 +706,9 @@ def test_constructor_dtype_timedelta64(self):
688706
td = Series([np.timedelta64(300000000), pd.NaT])
689707
self.assertEqual(td.dtype, 'timedelta64[ns]')
690708

709+
# because iNaT is int, not coerced to timedelta
691710
td = Series([np.timedelta64(300000000), tslib.iNaT])
692-
self.assertEqual(td.dtype, 'timedelta64[ns]')
711+
self.assertEqual(td.dtype, 'object')
693712

694713
td = Series([np.timedelta64(300000000), np.nan])
695714
self.assertEqual(td.dtype, 'timedelta64[ns]')

0 commit comments

Comments
 (0)