|
17 | 17 | from pandas.core.dtypes.common import (
|
18 | 18 | ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable,
|
19 | 19 | is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar,
|
20 |
| - pandas_dtype) |
| 20 | + is_string_dtype, pandas_dtype) |
21 | 21 | from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype
|
22 | 22 | from pandas.core.dtypes.generic import ABCDataFrame
|
23 | 23 | from pandas.core.dtypes.missing import array_equivalent, isna
|
@@ -74,8 +74,30 @@ def _codes_to_ints(self, codes):
|
74 | 74 | # Single key
|
75 | 75 | return np.bitwise_or.reduce(codes)
|
76 | 76 |
|
| 77 | + codes = np.bitwise_or.reduce(codes, axis=1) |
| 78 | + if codes.size > 1 and self.hasnans: |
| 79 | + check_dup = np.any(algos.isin(codes[0:codes.size - 1], |
| 80 | + codes[codes.size - 1:codes.size])) |
| 81 | + if check_dup: |
| 82 | + codes[codes.size - 1] = np.max(codes) + 1 |
| 83 | + |
77 | 84 | # Multiple keys
|
78 |
| - return np.bitwise_or.reduce(codes, axis=1) |
| 85 | + return codes |
| 86 | + |
| 87 | + def _isin(self, comps, values): |
| 88 | + """ |
| 89 | + Compute the isin boolean array |
| 90 | + Note: this just wraps the algorithms.isin function to avoid an isort failure |
| 91 | + Parameters |
| 92 | + ---------- |
| 93 | + comps : array-like |
| 94 | + values : array-like |
| 95 | +
|
| 96 | + Returns |
| 97 | + ------- |
| 98 | + boolean array same length as comps |
| 99 | + """ |
| 100 | + return algos.isin(comps, values) |
79 | 101 |
|
80 | 102 |
|
81 | 103 | class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
|
@@ -116,8 +138,32 @@ def _codes_to_ints(self, codes):
|
116 | 138 | # Single key
|
117 | 139 | return np.bitwise_or.reduce(codes)
|
118 | 140 |
|
| 141 | + codes = np.bitwise_or.reduce(codes, axis=1) |
| 142 | + # The shift returns the same value for 'NaN' and for a new value; |
| 143 | + # simple fix: take the maximum value of the array and add one |
| 144 | + if codes.size > 1 and self.hasnans: |
| 145 | + check_dup = np.any(algos.isin(codes[0:codes.size - 1], |
| 146 | + codes[codes.size - 1:codes.size])) |
| 147 | + if check_dup: |
| 148 | + codes[codes.size - 1] = np.max(codes) + 1 |
| 149 | + |
119 | 150 | # Multiple keys
|
120 |
| - return np.bitwise_or.reduce(codes, axis=1) |
| 151 | + return codes |
| 152 | + |
| 153 | + def _isin(self, comps, values): |
| 154 | + """ |
| 155 | + Compute the isin boolean array |
| 156 | + Note: this just wraps the algorithms.isin function to avoid an isort failure |
| 157 | + Parameters |
| 158 | + ---------- |
| 159 | + comps : array-like |
| 160 | + values : array-like |
| 161 | +
|
| 162 | + Returns |
| 163 | + ------- |
| 164 | + boolean array same length as comps |
| 165 | + """ |
| 166 | + return algos.isin(comps, values) |
121 | 167 |
|
122 | 168 |
|
123 | 169 | class MultiIndex(Index):
|
@@ -208,6 +254,7 @@ class MultiIndex(Index):
|
208 | 254 | _levels = FrozenList()
|
209 | 255 | _codes = FrozenList()
|
210 | 256 | _comparables = ['names']
|
| 257 | + _isna = False |
211 | 258 | rename = Index.set_names
|
212 | 259 |
|
213 | 260 | # --------------------------------------------------------------------
|
@@ -702,6 +749,34 @@ def _set_codes(self, codes, level=None, copy=False, validate=True,
|
702 | 749 | self._codes = new_codes
|
703 | 750 | self._tuples = None
|
704 | 751 | self._reset_cache()
|
| 752 | + self._hasnans() |
| 753 | + |
| 754 | + def _hasnans(self): |
| 755 | + """ |
| 756 | + Return if I have any nans |
| 757 | + """ |
| 758 | + is_not_right_level = False |
| 759 | + try: |
| 760 | + self._verify_integrity() |
| 761 | + except ValueError: |
| 762 | + is_not_right_level = True |
| 763 | + |
| 764 | + if is_not_right_level: |
| 765 | + return |
| 766 | + |
| 767 | + if (self.values.size > 0 and is_string_dtype(self.values)): |
| 768 | + flat = [] |
| 769 | + # flatten tuple to 1-D array for searching 'NaN' |
| 770 | + for row in self.values: |
| 771 | + flat.extend(row) |
| 772 | + # algorithms.isin cannot pass test_has_duplicates_overflow |
| 773 | + with warnings.catch_warnings(): |
| 774 | + warnings.simplefilter(action='ignore', category=FutureWarning) |
| 775 | + try: |
| 776 | + self._isna = np.array(np.where( |
| 777 | + np.hstack(flat) == 'nan')).size > 0 |
| 778 | + except UnicodeDecodeError: |
| 779 | + self._isna = False |
705 | 780 |
|
706 | 781 | def set_labels(self, labels, level=None, inplace=False,
|
707 | 782 | verify_integrity=True):
|
@@ -1161,8 +1236,10 @@ def _engine(self):
|
1161 | 1236 | # Check the total number of bits needed for our representation:
|
1162 | 1237 | if lev_bits[0] > 64:
|
1163 | 1238 | # The levels would overflow a 64 bit uint - use Python integers:
|
1164 |
| - return MultiIndexPyIntEngine(self.levels, self.codes, offsets) |
1165 |
| - return MultiIndexUIntEngine(self.levels, self.codes, offsets) |
| 1239 | + return MultiIndexPyIntEngine(self.levels, |
| 1240 | + self.codes, offsets, self._isna) |
| 1241 | + return MultiIndexUIntEngine(self.levels, |
| 1242 | + self.codes, offsets, self._isna) |
1166 | 1243 |
|
1167 | 1244 | @property
|
1168 | 1245 | def values(self):
|
|
0 commit comments