
BUG: don't mangle NaN-float-values and pd.NaT (GH 22295) #22296


Merged: 15 commits, Sep 18, 2018
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
@@ -720,13 +720,16 @@ Indexing
- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`)
- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`)
- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`)
- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`)

Missing
^^^^^^^

- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`)
- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`)
- :func:`Series.isin` now treats all nans as equal also for ``np.object``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`)
- :func:`Series.isin` now treats all NaN-floats as equal also for ``np.object``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`)
- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for ``np.object``-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`)


MultiIndex
^^^^^^^^^^
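Illustrative only, not part of the diff: a small sketch of the behaviour the whatsnew entries above describe, assuming post-merge pandas 0.24 semantics. The expected outputs in the comments are inferred from those entries and the tests further down, not captured output.

import numpy as np
import pandas as pd

# object-dtype Index: None, np.nan and pd.NaT now act as three distinct keys
idx = pd.Index(np.array([None, np.nan, pd.NaT], dtype=object), dtype=object)
print(idx.get_indexer([None, np.nan, pd.NaT]))  # expected: [0 1 2]

# object-dtype unique: pd.NaT is no longer coerced to a float NaN
arr = np.array([np.nan, pd.NaT, np.nan, pd.NaT], dtype=object)
print(pd.unique(arr))  # expected: [nan NaT]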
52 changes: 7 additions & 45 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -470,7 +470,6 @@ cdef class {{name}}HashTable(HashTable):
int ret = 0
{{dtype}}_t val
khiter_t k
bint seen_na = 0
{{name}}Vector uniques = {{name}}Vector()
{{name}}VectorData *ud

@@ -479,30 +478,13 @@ cdef class {{name}}HashTable(HashTable):
with nogil:
for i in range(n):
val = values[i]
{{if float_group}}
if val == val:
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
elif not seen_na:
seen_na = 1
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, NAN)
{{else}}
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
{{endif}}
return uniques.to_array()

{{endfor}}
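For context, a loose pure-Python model of the simplified unique() loop above (illustrative names only, not the actual Cython/khash API): every value now goes through the same lookup-then-insert path, and NaN deduplication is left to the hash table itself, which is what lets the seen_na branch be dropped. The dict below stands in for the hash table and assumes, as the table does, that all NaNs count as a single key.

import numpy as np

def unique_model(values):
    seen = {}
    uniques = []
    for val in values:
        # stand-in for NaN-aware hashing/equality in the real table
        key = "NaN" if val != val else val
        if key not in seen:
            seen[key] = True
            uniques.append(val)
    return np.array(uniques)

print(unique_model([1.0, np.nan, 2.0, np.nan, 1.0]))  # keeps a single NaN, the first one encountered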
@@ -747,9 +729,6 @@ cdef class StringHashTable(HashTable):
return np.asarray(labels)


na_sentinel = object


cdef class PyObjectHashTable(HashTable):

def __init__(self, size_hint=1):
@@ -767,8 +746,7 @@ cdef class PyObjectHashTable(HashTable):
def __contains__(self, object key):
cdef khiter_t k
hash(key)
if key != key or key is None:
key = na_sentinel

k = kh_get_pymap(self.table, <PyObject*>key)
return k != self.table.n_buckets

@@ -780,8 +758,7 @@

cpdef get_item(self, object val):
cdef khiter_t k
if val != val or val is None:
val = na_sentinel

k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
return self.table.vals[k]
@@ -795,8 +772,7 @@
char* buf

hash(key)
if key != key or key is None:
key = na_sentinel

k = kh_put_pymap(self.table, <PyObject*>key, &ret)
# self.table.keys[k] = key
if kh_exist_pymap(self.table, k):
@@ -814,8 +790,6 @@
for i in range(n):
val = values[i]
hash(val)
if val != val or val is None:
val = na_sentinel

k = kh_put_pymap(self.table, <PyObject*>val, &ret)
self.table.vals[k] = i
@@ -831,8 +805,6 @@
for i in range(n):
val = values[i]
hash(val)
if val != val or val is None:
val = na_sentinel

k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
@@ -849,24 +821,14 @@
object val
khiter_t k
ObjectVector uniques = ObjectVector()
bint seen_na = 0

for i in range(n):
val = values[i]
hash(val)

# `val is None` below is exception to prevent mangling of None and
# other NA values; note however that other NA values (ex: pd.NaT
and np.nan) will still get mangled, so may not be a permanent
# solution; see GH 20866
if not checknull(val) or val is None:
k = kh_get_pymap(self.table, <PyObject*>val)
if k == self.table.n_buckets:
kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)
elif not seen_na:
seen_na = 1
uniques.append(nan)
k = kh_get_pymap(self.table, <PyObject*>val)
if k == self.table.n_buckets:
kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)

return uniques.to_array()

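Not part of the diff: a rough pure-Python analogy for the PyObjectHashTable change above (the names here are illustrative, not the Cython API). Before this PR, lookups and inserts funnelled None, float NaNs and pd.NaT into one shared na_sentinel bucket; after it, every object is hashed as itself, so distinct NA objects stay distinct keys.

import numpy as np
import pandas as pd

_sentinel = object()

def old_key(key):
    # pre-PR behaviour: any "NA-ish" key collapsed onto one sentinel
    return _sentinel if (key != key or key is None) else key

def new_key(key):
    # post-PR behaviour: the object itself is the key
    return key

table = {}
for i, val in enumerate([None, np.nan, pd.NaT]):
    table[new_key(val)] = i
print(len(table))  # 3 entries; with old_key all three would collide into 1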
12 changes: 12 additions & 0 deletions pandas/conftest.py
@@ -286,6 +286,18 @@ def nulls_fixture(request):
nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture


@pytest.fixture(params=[None, np.nan, pd.NaT])
def unique_nulls_fixture(request):
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can't you use the existing nulls_fixture? (ahh or that has the float NaN which are the same as np.nan)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback I could have used nulls_fixture, but then as you already have pointed out, I would have to filter out [np.nan, float('nan')] & Co. (which are not list of unique elements) at several places in tests, so I assumed adding unique_nulls_fixture would be cleaner.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes this is ok (i see how you are using it below).

Fixture for each null type in pandas, each null type exactly once
"""
return request.param


# Generate cartesian product of unique_nulls_fixture:
unique_nulls_fixture2 = unique_nulls_fixture


TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific',
'dateutil/Asia/Singapore']

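Not part of the diff: a short sketch of how the aliasing above yields a cartesian product. unique_nulls_fixture and unique_nulls_fixture2 are two independently parametrized fixtures with the same three params, so a test that requests both is collected once per ordered pair (3 x 3 = 9 runs); the "if a is b: return" guard used in the tests below then skips the three diagonal cases.

import numpy as np
import pandas as pd
import pytest

@pytest.fixture(params=[None, np.nan, pd.NaT])
def unique_nulls_fixture(request):
    return request.param

# aliasing the fixture function under a second name makes pytest treat it as
# an independent fixture, hence the cartesian product when both are requested
unique_nulls_fixture2 = unique_nulls_fixture

def test_pairs(unique_nulls_fixture, unique_nulls_fixture2):
    # collected 9 times, once per ordered pair of the three null values
    if unique_nulls_fixture is unique_nulls_fixture2:
        return  # the 3 diagonal cases carry no pairwise information
    assert unique_nulls_fixture is not unique_nulls_fixture2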
5 changes: 0 additions & 5 deletions pandas/core/indexes/base.py
@@ -3109,7 +3109,6 @@ def get_loc(self, key, method=None, tolerance=None):
return self._engine.get_loc(key)
except KeyError:
return self._engine.get_loc(self._maybe_cast_indexer(key))

indexer = self.get_indexer([key], method=method, tolerance=tolerance)
if indexer.ndim > 1 or indexer.size > 1:
raise TypeError('get_loc requires scalar valued input')
@@ -4475,10 +4474,6 @@ def insert(self, loc, item):
-------
new_index : Index
"""
if is_scalar(item) and isna(item):
# GH 18295
item = self._na_value

_self = np.asarray(self)
item = self._coerce_scalar_to_index(item)._ndarray_values
idx = np.concatenate((_self[:loc], item, _self[loc:]))
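Illustrative only: with the isna coercion removed from the base Index.insert (the deleted lines above), an object-dtype Index keeps whichever NA object was inserted, as exercised by the updated test_insert_missing below. Expected values are inferred from that test, not captured output.

import pandas as pd

result = pd.Index(list('abc')).insert(1, pd.NaT)
print(result)               # expected: Index(['a', NaT, 'b', 'c'], dtype='object')
print(result[1] is pd.NaT)  # expected: True, NaT is no longer replaced by the index's _na_value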
8 changes: 8 additions & 0 deletions pandas/core/indexes/numeric.py
@@ -9,6 +9,7 @@
is_bool,
is_bool_dtype,
is_scalar)
from pandas.core.dtypes.missing import isna

from pandas import compat
from pandas.core import algorithms
@@ -114,6 +115,13 @@ def is_all_dates(self):
"""
return False

@Appender(Index.insert.__doc__)
def insert(self, loc, item):
# treat NA values as nans:
if is_scalar(item) and isna(item):
item = self._na_value
return super(NumericIndex, self).insert(loc, item)


_num_index_shared_docs['class_descr'] = """
Immutable ndarray implementing an ordered, sliceable set. The basic object
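Illustrative only: the new NumericIndex.insert override above keeps the old coercion for numeric indexes, so NA scalars are still normalised to the index's native missing value (np.nan for Float64Index). A sketch of the assumed behaviour:

import numpy as np
import pandas as pd

idx = pd.Float64Index([1.0, 2.0, 3.0])
result = idx.insert(1, pd.NaT)  # any NA scalar is coerced to self._na_value
print(result)  # expected: Float64Index([1.0, nan, 2.0, 3.0], dtype='float64')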
20 changes: 18 additions & 2 deletions pandas/tests/indexes/test_base.py
@@ -560,8 +560,9 @@ def test_insert(self):
tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a'))

def test_insert_missing(self, nulls_fixture):
# GH 18295 (test missing)
expected = Index(['a', np.nan, 'b', 'c'])
# GH 22295
# test there is no mangling of NA values
expected = Index(['a', nulls_fixture, 'b', 'c'])
result = Index(list('abc')).insert(1, nulls_fixture)
tm.assert_index_equal(result, expected)

@@ -1364,6 +1365,21 @@ def test_get_indexer_numeric_index_boolean_target(self):
expected = np.array([-1, -1, -1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)

def test_get_indexer_with_NA_values(self, unique_nulls_fixture,
unique_nulls_fixture2):
# GH 22332
# check pairwise, that no pair of na values
# is mangled
if unique_nulls_fixture is unique_nulls_fixture2:
return # skip it, values are not unique
arr = np.array([unique_nulls_fixture,
unique_nulls_fixture2], dtype=np.object)
index = pd.Index(arr, dtype=np.object)
result = index.get_indexer([unique_nulls_fixture,
unique_nulls_fixture2, 'Unknown'])
expected = np.array([0, 1, -1], dtype=np.int64)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest'])
def test_get_loc(self, method):
index = pd.Index([0, 1, 2])
30 changes: 30 additions & 0 deletions pandas/tests/test_algos.py
@@ -520,6 +520,36 @@ def test_different_nans(self):
expected = np.array([np.nan])
tm.assert_numpy_array_equal(result, expected)

def test_first_nan_kept(self):
# GH 22295
# create different nans from bit-patterns:
bits_for_nan1 = 0xfff8000000000001
bits_for_nan2 = 0x7ff8000000000001
NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
assert NAN1 != NAN1
assert NAN2 != NAN2
for el_type in [np.float64, np.object]:
a = np.array([NAN1, NAN2], dtype=el_type)
result = pd.unique(a)
assert result.size == 1
# use bit patterns to identify which nan was kept:
result_nan_bits = struct.unpack("=Q",
struct.pack("d", result[0]))[0]
assert result_nan_bits == bits_for_nan1

def test_do_not_mangle_na_values(self, unique_nulls_fixture,
unique_nulls_fixture2):
# GH 22295
if unique_nulls_fixture is unique_nulls_fixture2:
return # skip it, values not unique
a = np.array([unique_nulls_fixture,
unique_nulls_fixture2], dtype=np.object)
result = pd.unique(a)
assert result.size == 2
assert a[0] is unique_nulls_fixture
assert a[1] is unique_nulls_fixture2


class TestIsin(object):
