Skip to content

Commit b539298

Browse files
committed
BUG: fix dtype of all-NaN categories and MultiIndex levels
1 parent 77b4bb3 commit b539298

File tree

6 files changed

+33
-14
lines changed

6 files changed

+33
-14
lines changed

asv_bench/benchmarks/categoricals.py

+9
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ def setup(self):
2626
self.datetimes = pd.Series(pd.date_range(
2727
'1995-01-01 00:00:00', periods=10000, freq='s'))
2828

29+
self.values_some_nan = list(np.tile(self.categories + [np.nan], N))
30+
self.values_all_nan = [np.nan] * len(self.values)
31+
2932
def time_concat(self):
3033
concat([self.s, self.s])
3134

@@ -46,6 +49,12 @@ def time_constructor_datetimes_with_nat(self):
4649
t.iloc[-1] = pd.NaT
4750
Categorical(t)
4851

52+
def time_constructor_with_nan(self):
53+
Categorical(self.values_some_nan)
54+
55+
def time_constructor_all_nan(self):
56+
Categorical(self.values_all_nan)
57+
4958

5059
class Categoricals2(object):
5160
goal_time = 0.2

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -964,6 +964,7 @@ Indexing
964964
- When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`).
965965
- Fixes regression in 0.20.3 when indexing with a string on a ``TimedeltaIndex`` (:issue:`16896`).
966966
- Fixed :func:`TimedeltaIndex.get_loc` handling of ``np.timedelta64`` inputs (:issue:`16909`).
967+
- Bug in ``MultiIndex`` which would assign object dtype to all-NaN levels (:issue:`17929`).
967968
- Fix :func:`MultiIndex.sort_index` ordering when ``ascending`` argument is a list, but not all levels are specified, or are in a different order (:issue:`16934`).
968969
- Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`).
969970
- Bug in reindexing on an empty ``CategoricalIndex`` (:issue:`16770`).

pandas/core/categorical.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
288288
self._dtype = dtype
289289
return
290290

291+
null_mask = np.array(False)
291292
# sanitize input
292293
if is_categorical_dtype(values):
293294

@@ -316,13 +317,14 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
316317
if not isinstance(values, np.ndarray):
317318
values = _convert_to_list_like(values)
318319
from pandas.core.series import _sanitize_array
319-
# On list with NaNs, int values will be converted to float. Use
320-
# "object" dtype to prevent this. In the end objects will be
321-
# cast to int/... in the category assignment step.
322-
if len(values) == 0 or isna(values).any():
320+
# By convention, empty lists result in object dtype:
321+
if len(values) == 0:
323322
sanitize_dtype = 'object'
324323
else:
325324
sanitize_dtype = None
325+
null_mask = isna(values)
326+
if null_mask.any():
327+
values = [values[idx] for idx in np.where(~null_mask)[0]]
326328
values = _sanitize_array(values, None, dtype=sanitize_dtype)
327329

328330
if dtype.categories is None:
@@ -370,6 +372,11 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
370372
"mean to use\n'Categorical.from_codes(codes, "
371373
"categories)'?", RuntimeWarning, stacklevel=2)
372374

375+
if null_mask.any():
376+
full_codes = - np.ones(null_mask.shape, dtype=codes.dtype)
377+
full_codes[~null_mask] = codes
378+
codes = full_codes
379+
373380
self._dtype = dtype
374381
self._codes = coerce_indexer_dtype(codes, dtype.categories)
375382

pandas/tests/indexes/test_multi.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -970,12 +970,13 @@ def test_get_level_values_na(self):
970970

971971
arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]]
972972
index = pd.MultiIndex.from_arrays(arrays)
973-
values = index.get_level_values(0)
974-
expected = np.array([np.nan, np.nan, np.nan])
975-
tm.assert_numpy_array_equal(values.values.astype(float), expected)
976-
values = index.get_level_values(1)
977-
expected = np.array(['a', np.nan, 1], dtype=object)
978-
tm.assert_numpy_array_equal(values.values, expected)
973+
result = index.get_level_values(0)
974+
expected = pd.Index([np.nan, np.nan, np.nan])
975+
tm.assert_index_equal(result, expected)
976+
977+
result = index.get_level_values(1)
978+
expected = pd.Index(['a', np.nan, 1])
979+
tm.assert_index_equal(result, expected)
979980

980981
arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])]
981982
index = pd.MultiIndex.from_arrays(arrays)

pandas/tests/reshape/test_concat.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -648,7 +648,7 @@ def test_concat_categorical_coercion_nan(self):
648648
s1 = pd.Series([np.nan, np.nan], dtype='category')
649649
s2 = pd.Series([np.nan, np.nan])
650650

651-
exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=object)
651+
exp = pd.Series([np.nan, np.nan, np.nan, np.nan])
652652
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
653653
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
654654
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)

pandas/tests/reshape/test_union_categoricals.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ def test_union_categoricals_nan(self):
9090
tm.assert_categorical_equal(res, exp)
9191

9292
# all NaN
93-
res = union_categoricals([pd.Categorical([np.nan, np.nan]),
93+
res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan],
94+
dtype=object)),
9495
pd.Categorical(['X'])])
9596
exp = Categorical([np.nan, np.nan, 'X'])
9697
tm.assert_categorical_equal(res, exp)
@@ -250,7 +251,7 @@ def test_union_categoricals_sort(self):
250251
c1 = Categorical([np.nan])
251252
c2 = Categorical([np.nan])
252253
result = union_categoricals([c1, c2], sort_categories=True)
253-
expected = Categorical([np.nan, np.nan], categories=[])
254+
expected = Categorical([np.nan, np.nan])
254255
tm.assert_categorical_equal(result, expected)
255256

256257
c1 = Categorical([])
@@ -299,7 +300,7 @@ def test_union_categoricals_sort_false(self):
299300
c1 = Categorical([np.nan])
300301
c2 = Categorical([np.nan])
301302
result = union_categoricals([c1, c2], sort_categories=False)
302-
expected = Categorical([np.nan, np.nan], categories=[])
303+
expected = Categorical([np.nan, np.nan])
303304
tm.assert_categorical_equal(result, expected)
304305

305306
c1 = Categorical([])

0 commit comments

Comments
 (0)