Skip to content

Commit 7f7a12a

Browse files
author
haison
committed
add test case set value with NaN
1 parent 9eec9b8 commit 7f7a12a

File tree

4 files changed

+175
-8
lines changed

4 files changed

+175
-8
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ Missing
199199

200200
- Fixed misleading exception message in :meth:`Series.missing` if argument ``order`` is required, but omitted (:issue:`10633`, :issue:`24014`).
201201
- Fixed class type displayed in exception message in :meth:`DataFrame.dropna` if invalid ``axis`` parameter passed (:issue:`25555`)
202+
- Fixed bug in :class:`MultiIndex` where values were copied incorrectly when adding new entries to an index that contains ``NaN`` (:issue:`22247`)
202203
-
203204

204205
MultiIndex

pandas/_libs/index.pyx

+22-3
Original file line numberDiff line numberDiff line change
@@ -591,7 +591,7 @@ cdef class BaseMultiIndexCodesEngine:
591591
level, then locating (the integer representation of) codes.
592592
"""
593593
def __init__(self, object levels, object labels,
594-
ndarray[uint64_t, ndim=1] offsets):
594+
ndarray[uint64_t, ndim=1] offsets, hasnans):
595595
"""
596596
Parameters
597597
----------
@@ -605,6 +605,7 @@ cdef class BaseMultiIndexCodesEngine:
605605

606606
self.levels = levels
607607
self.offsets = offsets
608+
self.hasnans = hasnans
608609

609610
# Transform labels in a single array, and add 1 so that we are working
610611
# with positive integers (-1 for NaN becomes 0):
@@ -657,6 +658,14 @@ cdef class BaseMultiIndexCodesEngine:
657658
indexer = indexer[order]
658659
else:
659660
indexer = self._base.get_indexer(self, lab_ints)
661+
# The hash table returns the same value for 'NaN' and for a new value;
662+
# simple fix: take the maximum value in the array and add one
663+
len = indexer.size - 1
664+
if len + 1 > 1 and self.hasnans:
665+
check_dup = np.any(self._isin(indexer[0:len],
666+
indexer[len:indexer.size]))
667+
if check_dup and indexer[len]==-1:
668+
indexer[len] = np.max(indexer) + 1
660669

661670
return indexer
662671

@@ -673,8 +682,18 @@ cdef class BaseMultiIndexCodesEngine:
673682

674683
# Transform indices into single integer:
675684
lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))
676-
677-
return self._base.get_loc(self, lab_int)
685+
ret = []
686+
try:
687+
ret = self._base.get_loc(self, lab_int)
688+
except KeyError:
689+
if self.hasnans:
690+
# a NaN value is represented by zero bits in the codes, so
691+
# as a workaround, add the length of the last level to the key.
692+
lab_int += len(self.levels[len(self.levels)-1])
693+
ret = self._base.get_loc(self, np.uint64(lab_int))
694+
else:
695+
raise KeyError(lab_int)
696+
return ret
678697

679698
def get_indexer_non_unique(self, object target):
680699
# This needs to be overridden just because the default one works on

pandas/core/indexes/multi.py

+82-5
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from pandas.core.dtypes.common import (
1818
ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable,
1919
is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar,
20-
pandas_dtype)
20+
is_string_dtype, pandas_dtype)
2121
from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype
2222
from pandas.core.dtypes.generic import ABCDataFrame
2323
from pandas.core.dtypes.missing import array_equivalent, isna
@@ -74,8 +74,30 @@ def _codes_to_ints(self, codes):
7474
# Single key
7575
return np.bitwise_or.reduce(codes)
7676

77+
codes = np.bitwise_or.reduce(codes, axis=1)
78+
if codes.size > 1 and self.hasnans:
79+
check_dup = np.any(algos.isin(codes[0:codes.size - 1],
80+
codes[codes.size - 1:codes.size]))
81+
if check_dup:
82+
codes[codes.size - 1] = np.max(codes) + 1
83+
7784
# Multiple keys
78-
return np.bitwise_or.reduce(codes, axis=1)
85+
return codes
86+
87+
def _isin(self, comps, values):
    """
    Compute a boolean membership array for ``comps`` against ``values``.

    This only forwards to ``algos.isin``; according to the original note
    the wrapper exists to avoid an isort failure.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    boolean array same length as comps
    """
    membership = algos.isin(comps, values)
    return membership
79101

80102

81103
class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
@@ -116,8 +138,32 @@ def _codes_to_ints(self, codes):
116138
# Single key
117139
return np.bitwise_or.reduce(codes)
118140

141+
codes = np.bitwise_or.reduce(codes, axis=1)
142+
# The shift returns the same value for 'NaN' and for a new value;
143+
# simple fix: take the maximum value in the array and add one
144+
if codes.size > 1 and self.hasnans:
145+
check_dup = np.any(algos.isin(codes[0:codes.size - 1],
146+
codes[codes.size - 1:codes.size]))
147+
if check_dup:
148+
codes[codes.size - 1] = np.max(codes) + 1
149+
119150
# Multiple keys
120-
return np.bitwise_or.reduce(codes, axis=1)
151+
return codes
152+
153+
def _isin(self, comps, values):
    """
    Compute a boolean membership array for ``comps`` against ``values``.

    This only forwards to ``algos.isin``; according to the original note
    the wrapper exists to avoid an isort failure.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    boolean array same length as comps
    """
    membership = algos.isin(comps, values)
    return membership
121167

122168

123169
class MultiIndex(Index):
@@ -208,6 +254,7 @@ class MultiIndex(Index):
208254
_levels = FrozenList()
209255
_codes = FrozenList()
210256
_comparables = ['names']
257+
_isna = False
211258
rename = Index.set_names
212259

213260
# --------------------------------------------------------------------
@@ -702,6 +749,34 @@ def _set_codes(self, codes, level=None, copy=False, validate=True,
702749
self._codes = new_codes
703750
self._tuples = None
704751
self._reset_cache()
752+
self._hasnans()
753+
754+
def _hasnans(self):
    """
    Refresh the ``_isna`` flag stored on this object.

    Despite its name, this method returns nothing: the result is recorded
    on ``self._isna``.

    If ``_verify_integrity`` raises ``ValueError`` the flag is left
    untouched. Otherwise the flag is recomputed from scratch: it becomes
    True only when ``self.values`` is non-empty, passes
    ``is_string_dtype`` and at least one flattened entry compares equal
    to the string ``'nan'``; in every other case it is reset to False so
    that a value from an earlier call cannot go stale.

    Returns
    -------
    None
    """
    try:
        self._verify_integrity()
    except ValueError:
        # The integrity check failed: keep the current flag.
        return

    found_nan = False
    values = self.values
    if values.size > 0 and is_string_dtype(values):
        # Flatten the rows into one list so every entry is searched.
        flat = []
        for row in values:
            flat.extend(row)
        # algorithms.isin is not used here: per the original note it
        # cannot pass test_has_duplicates_overflow.
        with warnings.catch_warnings():
            warnings.simplefilter(action='ignore', category=FutureWarning)
            try:
                # np.any also accepts the plain bool that the comparison
                # can return instead of an array.
                found_nan = bool(np.any(np.hstack(flat) == 'nan'))
            except UnicodeDecodeError:
                found_nan = False
    self._isna = found_nan
705780

706781
def set_labels(self, labels, level=None, inplace=False,
707782
verify_integrity=True):
@@ -1161,8 +1236,10 @@ def _engine(self):
11611236
# Check the total number of bits needed for our representation:
11621237
if lev_bits[0] > 64:
11631238
# The levels would overflow a 64 bit uint - use Python integers:
1164-
return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
1165-
return MultiIndexUIntEngine(self.levels, self.codes, offsets)
1239+
return MultiIndexPyIntEngine(self.levels,
1240+
self.codes, offsets, self._isna)
1241+
return MultiIndexUIntEngine(self.levels,
1242+
self.codes, offsets, self._isna)
11661243

11671244
@property
11681245
def values(self):

pandas/tests/indexes/multi/test_missing.py

+70
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,73 @@ def test_nan_stays_float():
127127
assert pd.isna(df0.index.get_level_values(1)).all()
128128
# the following failed in 0.14.1
129129
assert pd.isna(dfm.index.get_level_values(1)[:-1]).all()
130+
131+
132+
def test_nan_multi_index():
    # GH 22247
    # When rows are added to a frame whose MultiIndex contains ``np.nan``,
    # the columns that were not assigned must be ``np.nan`` in the new rows
    # instead of being copied from the row labelled with ``np.nan``.
    rows = [
        ['A', np.nan, 1.23, 4.56],
        ['A', 'G', 1.23, 4.56],
        ['A', 'D', 9.87, 10.54],
    ]
    df = pd.DataFrame(
        rows, columns=['pivot_0', 'pivot_1', 'col_1', 'col_2'])
    df.set_index(['pivot_0', 'pivot_1'], inplace=True)

    outer = 'A'
    for inner in ('D', 'E', 'F'):
        # Re-read the level on every pass: the index grows inside the loop.
        if inner in df.index.get_level_values('pivot_1').tolist():
            continue
        df.at[(outer, inner), 'col_2'] = 0.0

    added = df.loc[('A', 'F')]
    # The assigned column holds the new value ...
    assert added['col_2'] == 0.0
    # ... and the column that was never assigned is missing (the issue
    # reported 1.23 being copied from the first row here).
    assert pd.isna(added['col_1'])
156+
157+
158+
def test_nan_set_value_multi_index():
159+
# GH 22247
160+
# When using the MultiIndex features of pandas, when an `np.nan`
161+
# is in the index when new values are added to the DF then the
162+
# values are not `np.nan`, but copied from the `np.nan` row.
163+
df = pd.DataFrame(
164+
[
165+
['A', 'G', 1.23, 4.56],
166+
['A', 'D', 9.87, 10.54],
167+
],
168+
columns=['pivot_0', 'pivot_1', 'col_1', 'col_2'],
169+
)
170+
df.set_index(['pivot_0', 'pivot_1'], inplace=True)
171+
df.at[('A', 'E'), 'col_2'] = 0.0
172+
df.at[('A', 'F'), 'col_2'] = 0.0
173+
# Fails: raise exception
174+
# This behavior shows for all versions v0.23.x, however is fine for 0.22.0.
175+
df.at[('A', np.nan), 'col_2'] = 0.0
176+
177+
assert df.loc[('A', np.nan)]['col_2'] == 0.0
178+
assert pd.isna(df.loc[('A', np.nan)]['col_1'])
179+
180+
181+
def test_nan_sigle_index():
    # GH 22247
    # Same scenario as the MultiIndex case but with a single-level index
    # that contains ``np.nan``.
    rows = [
        [np.nan, 1.23, 4.56],
        ['G', 1.23, 4.56],
        ['D', 9.87, 10.54],
    ]
    df = pd.DataFrame(rows, columns=['pivot_0', 'col_1', 'col_2'])
    df.set_index(['pivot_0'], inplace=True)

    for label in ('D', 'E', 'F'):
        # Re-read the index on every pass: it grows inside the loop.
        if label in df.index.get_level_values('pivot_0').tolist():
            continue
        df.at[label, 'col_2'] = 0.0

    added = df.loc['F']
    assert added['col_2'] == 0.0
    assert pd.isna(added['col_1'])

0 commit comments

Comments
 (0)