pandas-dev · hksonngan · Mar 12, 2019
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -199,6 +199,7 @@ Missing
 
 - Fixed misleading exception message in :meth:`Series.missing` if argument ``order`` is required, but omitted (:issue:`10633`, :issue:`24014`).
 - Fixed class type displayed in exception message in :meth:`DataFrame.dropna` if invalid ``axis`` parameter passed (:issue:`25555`)
+- Fixed bug in :class:`MultiIndex` where values were copied incorrectly when adding new values to the index if the index contained ``NaN`` (:issue:`22247`)
 -
 
 MultiIndex

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -591,7 +591,7 @@ cdef class BaseMultiIndexCodesEngine:
     level, then locating (the integer representation of) codes.
     """
     def __init__(self, object levels, object labels,
-                 ndarray[uint64_t, ndim=1] offsets):
+                 ndarray[uint64_t, ndim=1] offsets, hasnans):
         """
         Parameters
         ----------
@@ -605,6 +605,7 @@ cdef class BaseMultiIndexCodesEngine:
 
         self.levels = levels
         self.offsets = offsets
+        self.hasnans = hasnans
 
         # Transform labels in a single array, and add 1 so that we are working
         # with positive integers (-1 for NaN becomes 0):
@@ -657,6 +658,14 @@ cdef class BaseMultiIndexCodesEngine:
             indexer = indexer[order]
         else:
             indexer = self._base.get_indexer(self, lab_ints)
+            # The hash table returns the same value for 'NaN' and for a new
+            # value; as a simple fix, use the maximum of the array plus one.
+            len = indexer.size - 1
+            if len + 1 > 1 and self.hasnans:
+                check_dup = np.any(self._isin(indexer[0:len],
+                                   indexer[len:indexer.size]))
+                if check_dup and indexer[len]==-1:
+                    indexer[len] = np.max(indexer) + 1
 
         return indexer
 
@@ -673,8 +682,18 @@ cdef class BaseMultiIndexCodesEngine:
 
         # Transform indices into single integer:
         lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))
-
-        return self._base.get_loc(self, lab_int)
+        ret = []
+        try:
+            ret = self._base.get_loc(self, lab_int)
+        except KeyError:
+            if self.hasnans:
+                # NaN is encoded as 0 in the codes, so as a workaround retry
+                # the lookup after adding the length of the last level.
+                lab_int += len(self.levels[len(self.levels)-1])
+                ret = self._base.get_loc(self, np.uint64(lab_int))
+            else:
+                raise KeyError(lab_int)
+        return ret
 
     def get_indexer_non_unique(self, object target):
         # This needs to be overridden just because the default one works on

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -17,7 +17,7 @@
 from pandas.core.dtypes.common import (
     ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable,
     is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar,
-    pandas_dtype)
+    is_string_dtype, pandas_dtype)
 from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype
 from pandas.core.dtypes.generic import ABCDataFrame
 from pandas.core.dtypes.missing import array_equivalent, isna
@@ -74,8 +74,30 @@ def _codes_to_ints(self, codes):
             # Single key
             return np.bitwise_or.reduce(codes)
 
+        codes = np.bitwise_or.reduce(codes, axis=1)
+        if codes.size > 1 and self.hasnans:
+            check_dup = np.any(algos.isin(codes[0:codes.size - 1],
+                                          codes[codes.size - 1:codes.size]))
+            if check_dup:
+                codes[codes.size - 1] = np.max(codes) + 1
+
         # Multiple keys
-        return np.bitwise_or.reduce(codes, axis=1)
+        return codes
+
+    def _isin(self, comps, values):
+        """
+        Compute the isin boolean array.
+        Thin wrapper around ``algos.isin`` (added to avoid an isort failure).
+        Parameters
+        ----------
+        comps : array-like
+        values : array-like
+
+        Returns
+        -------
+        boolean array same length as comps
+        """
+        return algos.isin(comps, values)
 
 
 class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
@@ -116,8 +138,32 @@ def _codes_to_ints(self, codes):
             # Single key
             return np.bitwise_or.reduce(codes)
 
+        codes = np.bitwise_or.reduce(codes, axis=1)
+        # The shift returns the same value for 'NaN' and for a new value;
+        # as a simple fix, use the maximum of the array plus one.
+        if codes.size > 1 and self.hasnans:
+            check_dup = np.any(algos.isin(codes[0:codes.size - 1],
+                                          codes[codes.size - 1:codes.size]))
+            if check_dup:
+                codes[codes.size - 1] = np.max(codes) + 1
+
         # Multiple keys
-        return np.bitwise_or.reduce(codes, axis=1)
+        return codes
+
+    def _isin(self, comps, values):
+        """
+        Compute the isin boolean array.
+        Thin wrapper around ``algos.isin`` (added to avoid an isort failure).
+        Parameters
+        ----------
+        comps : array-like
+        values : array-like
+
+        Returns
+        -------
+        boolean array same length as comps
+        """
+        return algos.isin(comps, values)
 
 
 class MultiIndex(Index):
@@ -208,6 +254,7 @@ class MultiIndex(Index):
     _levels = FrozenList()
     _codes = FrozenList()
     _comparables = ['names']
+    _isna = False
     rename = Index.set_names
 
     # --------------------------------------------------------------------
@@ -702,6 +749,34 @@ def _set_codes(self, codes, level=None, copy=False, validate=True,
         self._codes = new_codes
         self._tuples = None
         self._reset_cache()
+        self._hasnans()
+
+    def _hasnans(self):
+        """
+        Update ``self._isna`` by scanning the values for 'nan'; returns None.
+        """
+        is_not_right_level = False
+        try:
+            self._verify_integrity()
+        except ValueError:
+            is_not_right_level = True
+
+        if is_not_right_level:
+            return  # integrity check raised: leave self._isna untouched
+
+        if (self.values.size > 0 and is_string_dtype(self.values)):
+            flat = []
+            # flatten the row tuples into one flat list to search for 'nan'
+            for row in self.values:
+                flat.extend(row)
+            # algorithms.isin cannot pass test_has_duplicates_overflow
+            with warnings.catch_warnings():
+                warnings.simplefilter(action='ignore', category=FutureWarning)
+                try:
+                    self._isna = np.array(np.where(
+                                          np.hstack(flat) == 'nan')).size > 0
+                except UnicodeDecodeError:
+                    self._isna = False
 
     def set_labels(self, labels, level=None, inplace=False,
                    verify_integrity=True):
@@ -1161,8 +1236,10 @@ def _engine(self):
         # Check the total number of bits needed for our representation:
         if lev_bits[0] > 64:
             # The levels would overflow a 64 bit uint - use Python integers:
-            return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
-        return MultiIndexUIntEngine(self.levels, self.codes, offsets)
+            return MultiIndexPyIntEngine(self.levels,
+                                         self.codes, offsets, self._isna)
+        return MultiIndexUIntEngine(self.levels,
+                                    self.codes, offsets, self._isna)
 
     @property
     def values(self):

diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py
@@ -127,3 +127,73 @@ def test_nan_stays_float():
     assert pd.isna(df0.index.get_level_values(1)).all()
     # the following failed in 0.14.1
     assert pd.isna(dfm.index.get_level_values(1)[:-1]).all()
+
+
+def test_nan_multi_index():
+    # GH 22247
+    # With an `np.nan` in a MultiIndex, adding new rows to the DataFrame
+    # used to fill the new rows' other columns with values copied from
+    # the `np.nan` row instead of leaving them as `np.nan`.
+    df = pd.DataFrame(
+        [
+            ['A', np.nan, 1.23, 4.56],
+            ['A', 'G', 1.23, 4.56],
+            ['A', 'D', 9.87, 10.54],
+        ],
+        columns=['pivot_0', 'pivot_1', 'col_1', 'col_2'],
+    )
+    df.set_index(['pivot_0', 'pivot_1'], inplace=True)
+    pivot_0 = 'A'
+    pivot_1_values = ['D', 'E', 'F']
+    for value in pivot_1_values:  # 'D' exists already; adds 'E' and 'F'
+        if value not in df.index.get_level_values('pivot_1').tolist():
+            df.at[(pivot_0, value), 'col_2'] = 0.0
+
+    assert df.loc[('A', 'F')]['col_2'] == 0.0  # passed even before the fix
+    # Before the fix this failed: 1.23 was copied from the first row.
+    # The bug is present in all v0.23.x releases; 0.22.0 is not affected.
+    assert pd.isna(df.loc[('A', 'F')]['col_1'])
+
+
+def test_nan_set_value_multi_index():
+    # GH 22247
+    # Setting a value at a new MultiIndex key that contains `np.nan`
+    # used to raise an exception; the new row must be created with
+    # the unset column left as `np.nan`.
+    df = pd.DataFrame(
+        [
+            ['A', 'G', 1.23, 4.56],
+            ['A', 'D', 9.87, 10.54],
+        ],
+        columns=['pivot_0', 'pivot_1', 'col_1', 'col_2'],
+    )
+    df.set_index(['pivot_0', 'pivot_1'], inplace=True)
+    df.at[('A', 'E'), 'col_2'] = 0.0
+    df.at[('A', 'F'), 'col_2'] = 0.0
+    # Before the fix this raised an exception.
+    # The bug is present in all v0.23.x releases; 0.22.0 is not affected.
+    df.at[('A', np.nan), 'col_2'] = 0.0
+
+    assert df.loc[('A', np.nan)]['col_2'] == 0.0
+    assert pd.isna(df.loc[('A', np.nan)]['col_1'])
+
+
+def test_nan_sigle_index():
+    # GH 22247: adding rows to a single-level index that contains NaN
+    df = pd.DataFrame(
+        [
+            [np.nan, 1.23, 4.56],
+            ['G', 1.23, 4.56],
+            ['D', 9.87, 10.54],
+        ],
+        columns=['pivot_0', 'col_1', 'col_2'],
+    )
+    df.set_index(['pivot_0'], inplace=True)
+
+    pivot_0_values = ['D', 'E', 'F']
+    for value in pivot_0_values:  # 'D' exists already; adds 'E' and 'F'
+        if value not in df.index.get_level_values('pivot_0').tolist():
+            df.at[(value), 'col_2'] = 0.0
+
+    assert df.loc[('F')]['col_2'] == 0.0  # the explicitly set column
+    assert pd.isna(df.loc[('F')]['col_1'])  # the unset column stays NaN