diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 649629714c3b1..ff9661bcd37fc 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -720,13 +720,16 @@ Indexing - Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) - ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`) - Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) +- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) Missing ^^^^^^^ - Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) - Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) -- :func:`Series.isin` now treats all nans as equal also for ``np.object``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) +- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) +- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) + MultiIndex ^^^^^^^^^^ diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 550cabd5e3192..f294fd141a9f1 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -470,7 +470,6 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{dtype}}_t val khiter_t k - bint seen_na = 0 {{name}}Vector uniques = {{name}}Vector() {{name}}VectorData *ud @@ -479,22 +478,6 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): val = values[i] - {{if float_group}} - if val == val: - k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - elif not seen_na: - seen_na = 1 - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, NAN) - {{else}} k = kh_get_{{dtype}}(self.table, val) if k == self.table.n_buckets: kh_put_{{dtype}}(self.table, val, &ret) @@ -502,7 +485,6 @@ cdef class {{name}}HashTable(HashTable): with gil: uniques.resize() append_data_{{dtype}}(ud, val) - {{endif}} return uniques.to_array() {{endfor}} @@ -747,9 +729,6 @@ cdef class StringHashTable(HashTable): return np.asarray(labels) -na_sentinel = object - - cdef class PyObjectHashTable(HashTable): def __init__(self, size_hint=1): @@ -767,8 +746,7 @@ cdef class PyObjectHashTable(HashTable): def __contains__(self, object key): cdef khiter_t k hash(key) - if key != key or key is None: - key = na_sentinel + k = kh_get_pymap(self.table, key) return k != self.table.n_buckets @@ -780,8 +758,7 @@ cdef class PyObjectHashTable(HashTable): cpdef get_item(self, object val): cdef khiter_t k - if val != val or val is None: - val = na_sentinel + k = kh_get_pymap(self.table, val) if k != self.table.n_buckets: return self.table.vals[k] @@ -795,8 +772,7 @@ cdef class PyObjectHashTable(HashTable): char* buf hash(key) - if key != key or key is None: - key = na_sentinel + k = kh_put_pymap(self.table, key, &ret) # self.table.keys[k] = key if kh_exist_pymap(self.table, k): @@ -814,8 +790,6 @@ cdef class PyObjectHashTable(HashTable): for i in range(n): val = values[i] hash(val) - if val != val or val is None: - val = na_sentinel k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = i @@ -831,8 +805,6 @@ cdef class PyObjectHashTable(HashTable): for i in range(n): val = values[i] hash(val) - if val != val or val is None: - val = na_sentinel k = kh_get_pymap(self.table, val) if k != self.table.n_buckets: @@ -849,24 +821,14 @@ cdef class PyObjectHashTable(HashTable): object val khiter_t k ObjectVector uniques = ObjectVector() - bint seen_na = 0 for i in range(n): val = values[i] hash(val) - - # `val is None` below is exception to prevent mangling of None and - # other NA values; note however that other NA values (ex: pd.NaT - # and np.nan) will still get mangled, so many not be a permanent - # solution; see GH 20866 - if not checknull(val) or val is None: - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - elif not seen_na: - seen_na = 1 - uniques.append(nan) + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) return uniques.to_array() diff --git a/pandas/conftest.py b/pandas/conftest.py index 28c24fc8c0640..621de3ffd4b12 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -286,6 +286,18 @@ def nulls_fixture(request): nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture +@pytest.fixture(params=[None, np.nan, pd.NaT]) +def unique_nulls_fixture(request): + """ + Fixture for each null type in pandas, each null type exactly once + """ + return request.param + + +# Generate cartesian product of unique_nulls_fixture: +unique_nulls_fixture2 = unique_nulls_fixture + + TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific', 'dateutil/Asia/Singapore'] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ca381160de0df..487d3975a6219 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3109,7 +3109,6 @@ def get_loc(self, key, method=None, tolerance=None): return self._engine.get_loc(key) except KeyError: return self._engine.get_loc(self._maybe_cast_indexer(key)) - indexer = self.get_indexer([key], method=method, tolerance=tolerance) if indexer.ndim > 1 or indexer.size > 1: raise TypeError('get_loc requires scalar valued input') @@ -4475,10 +4474,6 @@ def insert(self, loc, item): ------- new_index : Index """ - if is_scalar(item) and isna(item): - # GH 18295 - item = self._na_value - _self = np.asarray(self) item = self._coerce_scalar_to_index(item)._ndarray_values idx = np.concatenate((_self[:loc], item, _self[loc:])) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index e0627432cbc2e..8d616468a87d9 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -9,6 +9,7 @@ is_bool, is_bool_dtype, is_scalar) +from pandas.core.dtypes.missing import isna from pandas import compat from pandas.core import algorithms @@ -114,6 +115,13 @@ def is_all_dates(self): """ return False + @Appender(Index.insert.__doc__) + def insert(self, loc, item): + # treat NA values as nans: + if is_scalar(item) and isna(item): + item = self._na_value + return super(NumericIndex, self).insert(loc, item) + _num_index_shared_docs['class_descr'] = """ Immutable ndarray implementing an ordered, sliceable set. The basic object diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 755b3cc7f1dca..eab04419fe939 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -560,8 +560,9 @@ def test_insert(self): tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a')) def test_insert_missing(self, nulls_fixture): - # GH 18295 (test missing) - expected = Index(['a', np.nan, 'b', 'c']) + # GH 22295 + # test there is no mangling of NA values + expected = Index(['a', nulls_fixture, 'b', 'c']) result = Index(list('abc')).insert(1, nulls_fixture) tm.assert_index_equal(result, expected) @@ -1364,6 +1365,21 @@ def test_get_indexer_numeric_index_boolean_target(self): expected = np.array([-1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_with_NA_values(self, unique_nulls_fixture, + unique_nulls_fixture2): + # GH 22332 + # check pairwise, that no pair of na values + # is mangled + if unique_nulls_fixture is unique_nulls_fixture2: + return # skip it, values are not unique + arr = np.array([unique_nulls_fixture, + unique_nulls_fixture2], dtype=np.object) + index = pd.Index(arr, dtype=np.object) + result = index.get_indexer([unique_nulls_fixture, + unique_nulls_fixture2, 'Unknown']) + expected = np.array([0, 1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest']) def test_get_loc(self, method): index = pd.Index([0, 1, 2]) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 64d2e155aa9a9..b2ddbf715b480 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -520,6 +520,36 @@ def test_different_nans(self): expected = np.array([np.nan]) tm.assert_numpy_array_equal(result, expected) + def test_first_nan_kept(self): + # GH 22295 + # create different nans from bit-patterns: + bits_for_nan1 = 0xfff8000000000001 + bits_for_nan2 = 0x7ff8000000000001 + NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0] + assert NAN1 != NAN1 + assert NAN2 != NAN2 + for el_type in [np.float64, np.object]: + a = np.array([NAN1, NAN2], dtype=el_type) + result = pd.unique(a) + assert result.size == 1 + # use bit patterns to identify which nan was kept: + result_nan_bits = struct.unpack("=Q", + struct.pack("d", result[0]))[0] + assert result_nan_bits == bits_for_nan1 + + def test_do_not_mangle_na_values(self, unique_nulls_fixture, + unique_nulls_fixture2): + # GH 22295 + if unique_nulls_fixture is unique_nulls_fixture2: + return # skip it, values not unique + a = np.array([unique_nulls_fixture, + unique_nulls_fixture2], dtype=np.object) + result = pd.unique(a) + assert result.size == 2 + assert a[0] is unique_nulls_fixture + assert a[1] is unique_nulls_fixture2 + class TestIsin(object):