Skip to content

Commit d39249a

Browse files
realead authored and jreback committed
BUG: don't mangle NaN-float-values and pd.NaT (GH 22295) (#22296)
1 parent c994e80 commit d39249a

File tree

7 files changed

+79
-53
lines changed

7 files changed

+79
-53
lines changed

doc/source/whatsnew/v0.24.0.txt

+4-1
Original file line numberDiff line numberDiff line change
@@ -721,13 +721,16 @@ Indexing
721721
- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`)
722722
- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`)
723723
- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`)
724+
- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`)
724725

725726
Missing
726727
^^^^^^^
727728

728729
- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`)
729730
- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`)
730-
- :func:`Series.isin` now treats all nans as equal also for ``np.object``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`)
731+
- :func:`Series.isin` now treats all NaN-floats as equal also for ``np.object``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`)
732+
- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for ``np.object``-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`)
733+
731734

732735
MultiIndex
733736
^^^^^^^^^^

pandas/_libs/hashtable_class_helper.pxi.in

+7-45
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,6 @@ cdef class {{name}}HashTable(HashTable):
470470
int ret = 0
471471
{{dtype}}_t val
472472
khiter_t k
473-
bint seen_na = 0
474473
{{name}}Vector uniques = {{name}}Vector()
475474
{{name}}VectorData *ud
476475

@@ -479,30 +478,13 @@ cdef class {{name}}HashTable(HashTable):
479478
with nogil:
480479
for i in range(n):
481480
val = values[i]
482-
{{if float_group}}
483-
if val == val:
484-
k = kh_get_{{dtype}}(self.table, val)
485-
if k == self.table.n_buckets:
486-
kh_put_{{dtype}}(self.table, val, &ret)
487-
if needs_resize(ud):
488-
with gil:
489-
uniques.resize()
490-
append_data_{{dtype}}(ud, val)
491-
elif not seen_na:
492-
seen_na = 1
493-
if needs_resize(ud):
494-
with gil:
495-
uniques.resize()
496-
append_data_{{dtype}}(ud, NAN)
497-
{{else}}
498481
k = kh_get_{{dtype}}(self.table, val)
499482
if k == self.table.n_buckets:
500483
kh_put_{{dtype}}(self.table, val, &ret)
501484
if needs_resize(ud):
502485
with gil:
503486
uniques.resize()
504487
append_data_{{dtype}}(ud, val)
505-
{{endif}}
506488
return uniques.to_array()
507489

508490
{{endfor}}
@@ -747,9 +729,6 @@ cdef class StringHashTable(HashTable):
747729
return np.asarray(labels)
748730

749731

750-
na_sentinel = object
751-
752-
753732
cdef class PyObjectHashTable(HashTable):
754733

755734
def __init__(self, size_hint=1):
@@ -767,8 +746,7 @@ cdef class PyObjectHashTable(HashTable):
767746
def __contains__(self, object key):
768747
cdef khiter_t k
769748
hash(key)
770-
if key != key or key is None:
771-
key = na_sentinel
749+
772750
k = kh_get_pymap(self.table, <PyObject*>key)
773751
return k != self.table.n_buckets
774752

@@ -780,8 +758,7 @@ cdef class PyObjectHashTable(HashTable):
780758

781759
cpdef get_item(self, object val):
782760
cdef khiter_t k
783-
if val != val or val is None:
784-
val = na_sentinel
761+
785762
k = kh_get_pymap(self.table, <PyObject*>val)
786763
if k != self.table.n_buckets:
787764
return self.table.vals[k]
@@ -795,8 +772,7 @@ cdef class PyObjectHashTable(HashTable):
795772
char* buf
796773

797774
hash(key)
798-
if key != key or key is None:
799-
key = na_sentinel
775+
800776
k = kh_put_pymap(self.table, <PyObject*>key, &ret)
801777
# self.table.keys[k] = key
802778
if kh_exist_pymap(self.table, k):
@@ -814,8 +790,6 @@ cdef class PyObjectHashTable(HashTable):
814790
for i in range(n):
815791
val = values[i]
816792
hash(val)
817-
if val != val or val is None:
818-
val = na_sentinel
819793

820794
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
821795
self.table.vals[k] = i
@@ -831,8 +805,6 @@ cdef class PyObjectHashTable(HashTable):
831805
for i in range(n):
832806
val = values[i]
833807
hash(val)
834-
if val != val or val is None:
835-
val = na_sentinel
836808

837809
k = kh_get_pymap(self.table, <PyObject*>val)
838810
if k != self.table.n_buckets:
@@ -849,24 +821,14 @@ cdef class PyObjectHashTable(HashTable):
849821
object val
850822
khiter_t k
851823
ObjectVector uniques = ObjectVector()
852-
bint seen_na = 0
853824

854825
for i in range(n):
855826
val = values[i]
856827
hash(val)
857-
858-
# `val is None` below is exception to prevent mangling of None and
859-
# other NA values; note however that other NA values (ex: pd.NaT
860-
# and np.nan) will still get mangled, so many not be a permanent
861-
# solution; see GH 20866
862-
if not checknull(val) or val is None:
863-
k = kh_get_pymap(self.table, <PyObject*>val)
864-
if k == self.table.n_buckets:
865-
kh_put_pymap(self.table, <PyObject*>val, &ret)
866-
uniques.append(val)
867-
elif not seen_na:
868-
seen_na = 1
869-
uniques.append(nan)
828+
k = kh_get_pymap(self.table, <PyObject*>val)
829+
if k == self.table.n_buckets:
830+
kh_put_pymap(self.table, <PyObject*>val, &ret)
831+
uniques.append(val)
870832

871833
return uniques.to_array()
872834

pandas/conftest.py

+12
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,18 @@ def nulls_fixture(request):
286286
nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture
287287

288288

289+
@pytest.fixture(params=[None, np.nan, pd.NaT])
def unique_nulls_fixture(request):
    """
    Parametrized fixture over the distinct null objects ``None``,
    ``np.nan`` and ``pd.NaT``; every one of them is supplied exactly once.
    """
    null_value = request.param
    return null_value
295+
296+
297+
# Generate cartesian product of unique_nulls_fixture:
298+
unique_nulls_fixture2 = unique_nulls_fixture
299+
300+
289301
TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific',
290302
'dateutil/Asia/Singapore']
291303

pandas/core/indexes/base.py

-5
Original file line numberDiff line numberDiff line change
@@ -3109,7 +3109,6 @@ def get_loc(self, key, method=None, tolerance=None):
31093109
return self._engine.get_loc(key)
31103110
except KeyError:
31113111
return self._engine.get_loc(self._maybe_cast_indexer(key))
3112-
31133112
indexer = self.get_indexer([key], method=method, tolerance=tolerance)
31143113
if indexer.ndim > 1 or indexer.size > 1:
31153114
raise TypeError('get_loc requires scalar valued input')
@@ -4475,10 +4474,6 @@ def insert(self, loc, item):
44754474
-------
44764475
new_index : Index
44774476
"""
4478-
if is_scalar(item) and isna(item):
4479-
# GH 18295
4480-
item = self._na_value
4481-
44824477
_self = np.asarray(self)
44834478
item = self._coerce_scalar_to_index(item)._ndarray_values
44844479
idx = np.concatenate((_self[:loc], item, _self[loc:]))

pandas/core/indexes/numeric.py

+8
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
is_bool,
1010
is_bool_dtype,
1111
is_scalar)
12+
from pandas.core.dtypes.missing import isna
1213

1314
from pandas import compat
1415
from pandas.core import algorithms
@@ -114,6 +115,13 @@ def is_all_dates(self):
114115
"""
115116
return False
116117

118+
@Appender(Index.insert.__doc__)
def insert(self, loc, item):
    # A numeric index has a single missing-value representation, so every
    # scalar NA that is passed in is replaced by ``self._na_value`` before
    # the generic implementation of the parent class takes over.
    # (No docstring here on purpose: ``Appender`` supplies ``__doc__``.)
    item_is_missing = is_scalar(item) and isna(item)
    new_item = self._na_value if item_is_missing else item
    return super(NumericIndex, self).insert(loc, new_item)
124+
117125

118126
_num_index_shared_docs['class_descr'] = """
119127
Immutable ndarray implementing an ordered, sliceable set. The basic object

pandas/tests/indexes/test_base.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -560,8 +560,9 @@ def test_insert(self):
560560
tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a'))
561561

562562
def test_insert_missing(self, nulls_fixture):
563-
# GH 18295 (test missing)
564-
expected = Index(['a', np.nan, 'b', 'c'])
563+
# GH 22295
564+
# test there is no mangling of NA values
565+
expected = Index(['a', nulls_fixture, 'b', 'c'])
565566
result = Index(list('abc')).insert(1, nulls_fixture)
566567
tm.assert_index_equal(result, expected)
567568

@@ -1364,6 +1365,21 @@ def test_get_indexer_numeric_index_boolean_target(self):
13641365
expected = np.array([-1, -1, -1], dtype=np.intp)
13651366
tm.assert_numpy_array_equal(result, expected)
13661367

1368+
def test_get_indexer_with_NA_values(self, unique_nulls_fixture,
                                    unique_nulls_fixture2):
    """
    Check pairwise that an object-dtype Index keeps two different NA
    values apart: each one must be found at its own position and an
    unrelated key must not be found at all.
    """
    # GH 22332
    if unique_nulls_fixture is unique_nulls_fixture2:
        return  # skip it, values are not unique
    # builtin ``object`` rather than the ``np.object`` alias, which newer
    # NumPy releases no longer provide
    arr = np.array([unique_nulls_fixture,
                    unique_nulls_fixture2], dtype=object)
    index = pd.Index(arr, dtype=object)
    result = index.get_indexer([unique_nulls_fixture,
                                unique_nulls_fixture2, 'Unknown'])
    # indexers are platform-sized integers, so compare against ``np.intp``
    # (a hard-coded ``np.int64`` breaks where ``intp`` is 32 bit)
    expected = np.array([0, 1, -1], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)
1382+
13671383
@pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest'])
13681384
def test_get_loc(self, method):
13691385
index = pd.Index([0, 1, 2])

pandas/tests/test_algos.py

+30
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,36 @@ def test_different_nans(self):
520520
expected = np.array([np.nan])
521521
tm.assert_numpy_array_equal(result, expected)
522522

523+
def test_first_nan_kept(self):
    """
    Check that ``pd.unique`` collapses NaNs with different bit patterns
    into a single entry and that the entry kept is the first NaN seen.
    """
    # GH 22295
    # create different nans from bit-patterns:
    bits_for_nan1 = 0xfff8000000000001
    bits_for_nan2 = 0x7ff8000000000001
    # "=" on both format strings keeps the 8-byte double and the 8-byte
    # unsigned integer in the same standard-size, native-order layout
    NAN1 = struct.unpack("=d", struct.pack("=Q", bits_for_nan1))[0]
    NAN2 = struct.unpack("=d", struct.pack("=Q", bits_for_nan2))[0]
    assert NAN1 != NAN1
    assert NAN2 != NAN2
    # builtin ``object`` rather than the ``np.object`` alias, which newer
    # NumPy releases no longer provide
    for el_type in [np.float64, object]:
        a = np.array([NAN1, NAN2], dtype=el_type)
        result = pd.unique(a)
        assert result.size == 1
        # use bit patterns to identify which nan was kept:
        result_nan_bits = struct.unpack("=Q",
                                        struct.pack("=d", result[0]))[0]
        assert result_nan_bits == bits_for_nan1
540+
541+
def test_do_not_mangle_na_values(self, unique_nulls_fixture,
                                 unique_nulls_fixture2):
    """
    Check pairwise that ``pd.unique`` on an object array keeps two
    different NA values as two separate entries, in input order, and
    returns the very same objects that were passed in.
    """
    # GH 22295
    if unique_nulls_fixture is unique_nulls_fixture2:
        return  # skip it, values not unique
    # builtin ``object`` rather than the ``np.object`` alias, which newer
    # NumPy releases no longer provide
    a = np.array([unique_nulls_fixture,
                  unique_nulls_fixture2], dtype=object)
    result = pd.unique(a)
    assert result.size == 2
    # inspect the *result*: asserting on the input ``a`` would hold
    # trivially and could never detect a mangled NA value
    assert result[0] is unique_nulls_fixture
    assert result[1] is unique_nulls_fixture2
552+
523553

524554
class TestIsin(object):
525555

0 commit comments

Comments
 (0)