Skip to content

Fixed regression of Multi index with NaN #25424

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ Missing

- Fixed misleading exception message in :meth:`Series.interpolate` if argument ``order`` is required, but omitted (:issue:`10633`, :issue:`24014`).
- Fixed class type displayed in exception message in :meth:`DataFrame.dropna` if invalid ``axis`` parameter passed (:issue:`25555`)
- Fixed bug in :class:`MultiIndex` where adding new entries to an index that contains ``NaN`` incorrectly copied values from the ``NaN`` row into the new rows (:issue:`22247`)
-

MultiIndex
Expand Down
25 changes: 22 additions & 3 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,7 @@ cdef class BaseMultiIndexCodesEngine:
level, then locating (the integer representation of) codes.
"""
def __init__(self, object levels, object labels,
ndarray[uint64_t, ndim=1] offsets):
ndarray[uint64_t, ndim=1] offsets, hasnans):
"""
Parameters
----------
Expand All @@ -605,6 +605,7 @@ cdef class BaseMultiIndexCodesEngine:

self.levels = levels
self.offsets = offsets
self.hasnans = hasnans

# Transform labels in a single array, and add 1 so that we are working
# with positive integers (-1 for NaN becomes 0):
Expand Down Expand Up @@ -657,6 +658,14 @@ cdef class BaseMultiIndexCodesEngine:
indexer = indexer[order]
else:
indexer = self._base.get_indexer(self, lab_ints)
# The hash table returns the same value for a 'NaN' entry and for a new value;
# work around this by setting the last position to the maximum position plus one.
len = indexer.size - 1
if len + 1 > 1 and self.hasnans:
check_dup = np.any(self._isin(indexer[0:len],
indexer[len:indexer.size]))
if check_dup and indexer[len]==-1:
indexer[len] = np.max(indexer) + 1

return indexer

Expand All @@ -673,8 +682,18 @@ cdef class BaseMultiIndexCodesEngine:

# Transform indices into single integer:
lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))

return self._base.get_loc(self, lab_int)
ret = []
try:
ret = self._base.get_loc(self, lab_int)
except KeyError:
if self.hasnans:
# NaN is represented by code 0, so the first lookup can miss.
# HACK: retry after adding the size of the last level to the key.
lab_int += len(self.levels[len(self.levels)-1])
ret = self._base.get_loc(self, np.uint64(lab_int))
else:
raise KeyError(lab_int)
return ret

def get_indexer_non_unique(self, object target):
# This needs to be overridden just because the default one works on
Expand Down
87 changes: 82 additions & 5 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pandas.core.dtypes.common import (
ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable,
is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar,
pandas_dtype)
is_string_dtype, pandas_dtype)
from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype
from pandas.core.dtypes.generic import ABCDataFrame
from pandas.core.dtypes.missing import array_equivalent, isna
Expand Down Expand Up @@ -74,8 +74,30 @@ def _codes_to_ints(self, codes):
# Single key
return np.bitwise_or.reduce(codes)

codes = np.bitwise_or.reduce(codes, axis=1)
if codes.size > 1 and self.hasnans:
check_dup = np.any(algos.isin(codes[0:codes.size - 1],
codes[codes.size - 1:codes.size]))
if check_dup:
codes[codes.size - 1] = np.max(codes) + 1

# Multiple keys
return np.bitwise_or.reduce(codes, axis=1)
return codes

def _isin(self, comps, values):
    """
    Report, element by element, whether ``comps`` occurs in ``values``.

    This only delegates to ``algos.isin``; the wrapper was added to avoid
    an isort failure.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    boolean array same length as comps
    """
    membership = algos.isin(comps, values)
    return membership


class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
Expand Down Expand Up @@ -116,8 +138,32 @@ def _codes_to_ints(self, codes):
# Single key
return np.bitwise_or.reduce(codes)

codes = np.bitwise_or.reduce(codes, axis=1)
# The shift yields the same code for 'NaN' and for a new value;
# work around this by setting the last code to the maximum code plus one.
if codes.size > 1 and self.hasnans:
check_dup = np.any(algos.isin(codes[0:codes.size - 1],
codes[codes.size - 1:codes.size]))
if check_dup:
codes[codes.size - 1] = np.max(codes) + 1

# Multiple keys
return np.bitwise_or.reduce(codes, axis=1)
return codes

def _isin(self, comps, values):
    """
    Report, element by element, whether ``comps`` occurs in ``values``.

    This only delegates to ``algos.isin``; the wrapper was added to avoid
    an isort failure.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    boolean array same length as comps
    """
    membership = algos.isin(comps, values)
    return membership


class MultiIndex(Index):
Expand Down Expand Up @@ -208,6 +254,7 @@ class MultiIndex(Index):
_levels = FrozenList()
_codes = FrozenList()
_comparables = ['names']
_isna = False
rename = Index.set_names

# --------------------------------------------------------------------
Expand Down Expand Up @@ -702,6 +749,34 @@ def _set_codes(self, codes, level=None, copy=False, validate=True,
self._codes = new_codes
self._tuples = None
self._reset_cache()
self._hasnans()

def _hasnans(self):
    """
    Refresh the ``_isna`` flag from the current index values.

    Nothing is returned. ``self._isna`` is set to True when the flattened,
    string-typed values contain an element equal to ``'nan'`` and to False
    otherwise. The flag is left untouched when the integrity check fails,
    when there are no values, or when the values are not of string dtype.
    """
    try:
        self._verify_integrity()
    except ValueError:
        # the integrity check rejected the index: keep the previous flag
        return

    if not (self.values.size > 0 and is_string_dtype(self.values)):
        return

    # flatten the tuples into one list so that 'NaN' can be searched for
    flattened = [item for entry in self.values for item in entry]

    # algorithms.isin can not pass test_has_duplicates_overflow, so the
    # search is an elementwise comparison with FutureWarning silenced
    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=FutureWarning)
        try:
            matches = np.where(np.hstack(flattened) == 'nan')
            self._isna = np.array(matches).size > 0
        except UnicodeDecodeError:
            self._isna = False

def set_labels(self, labels, level=None, inplace=False,
verify_integrity=True):
Expand Down Expand Up @@ -1161,8 +1236,10 @@ def _engine(self):
# Check the total number of bits needed for our representation:
if lev_bits[0] > 64:
# The levels would overflow a 64 bit uint - use Python integers:
return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
return MultiIndexUIntEngine(self.levels, self.codes, offsets)
return MultiIndexPyIntEngine(self.levels,
self.codes, offsets, self._isna)
return MultiIndexUIntEngine(self.levels,
self.codes, offsets, self._isna)

@property
def values(self):
Expand Down
70 changes: 70 additions & 0 deletions pandas/tests/indexes/multi/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,73 @@ def test_nan_stays_float():
assert pd.isna(df0.index.get_level_values(1)).all()
# the following failed in 0.14.1
assert pd.isna(dfm.index.get_level_values(1)[:-1]).all()


def test_nan_multi_index():
    # GH 22247
    # Regression test: when a MultiIndex contains ``np.nan`` and new rows are
    # added to the DataFrame through ``df.at``, the columns that were not
    # assigned must be ``np.nan`` in the new rows, not values copied from the
    # ``np.nan`` row.
    df = pd.DataFrame(
        [
            ['A', np.nan, 1.23, 4.56],
            ['A', 'G', 1.23, 4.56],
            ['A', 'D', 9.87, 10.54],
        ],
        columns=['pivot_0', 'pivot_1', 'col_1', 'col_2'],
    )
    df.set_index(['pivot_0', 'pivot_1'], inplace=True)
    pivot_0 = 'A'
    pivot_1_values = ['D', 'E', 'F']
    # 'D' is already present in level 'pivot_1'; 'E' and 'F' are not, so only
    # those two are assigned (only 'col_2' is set for them).
    for value in pivot_1_values:
        if value not in df.index.get_level_values('pivot_1').tolist():
            df.at[(pivot_0, value), 'col_2'] = 0.0

    # The explicitly assigned column holds the assigned value.
    assert df.loc[('A', 'F')]['col_2'] == 0.0
    # The unassigned column must be NaN. Per the original report, 1.23 from
    # the first (NaN) row was copied here in all v0.23.x versions, while
    # 0.22.0 behaved correctly.
    assert pd.isna(df.loc[('A', 'F')]['col_1'])


def test_nan_set_value_multi_index():
    # GH 22247
    # Regression test: after new rows have been added to a MultiIndex-ed
    # DataFrame through ``df.at``, assigning to a key that contains ``np.nan``
    # must succeed, and the unassigned column of that row must be ``np.nan``.
    df = pd.DataFrame(
        [
            ['A', 'G', 1.23, 4.56],
            ['A', 'D', 9.87, 10.54],
        ],
        columns=['pivot_0', 'pivot_1', 'col_1', 'col_2'],
    )
    df.set_index(['pivot_0', 'pivot_1'], inplace=True)
    # Neither ('A', 'E') nor ('A', 'F') is in the initial data.
    df.at[('A', 'E'), 'col_2'] = 0.0
    df.at[('A', 'F'), 'col_2'] = 0.0
    # Per the original report, this assignment raised an exception in all
    # v0.23.x versions, while 0.22.0 behaved correctly.
    df.at[('A', np.nan), 'col_2'] = 0.0

    # The assigned column holds the value; the other column is NaN.
    assert df.loc[('A', np.nan)]['col_2'] == 0.0
    assert pd.isna(df.loc[('A', np.nan)]['col_1'])


def test_nan_sigle_index():
    # GH 22247
    # Counterpart of the MultiIndex regression tests using a single index
    # column that contains ``np.nan``: rows added through ``df.at`` must get
    # ``np.nan`` in the columns that were not assigned.
    # NOTE(review): "sigle" in the function name looks like a typo for
    # "single"; the name is kept so test selection by name is unaffected.
    df = pd.DataFrame(
        [
            [np.nan, 1.23, 4.56],
            ['G', 1.23, 4.56],
            ['D', 9.87, 10.54],
        ],
        columns=['pivot_0', 'col_1', 'col_2'],
    )
    df.set_index(['pivot_0'], inplace=True)

    pivot_0_values = ['D', 'E', 'F']
    # 'D' is already present in the index; 'E' and 'F' are not, so only those
    # two are assigned (only 'col_2' is set for them).
    for value in pivot_0_values:
        if value not in df.index.get_level_values('pivot_0').tolist():
            df.at[(value), 'col_2'] = 0.0

    # The assigned column holds the value; the other column is NaN.
    assert df.loc[('F')]['col_2'] == 0.0
    assert pd.isna(df.loc[('F')]['col_1'])