BUG: fix dtype of all-NaN categories and MultiIndex levels (pandas-dev#17934)

toobaz · No-Stream · commit b945703bff04 · 2017-11-28T15:15:14.000-08:00
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -26,6 +26,9 @@ def setup(self):
         self.datetimes = pd.Series(pd.date_range(
             '1995-01-01 00:00:00', periods=10000, freq='s'))
 
+        self.values_some_nan = list(np.tile(self.categories + [np.nan], N))
+        self.values_all_nan = [np.nan] * len(self.values)
+
     def time_concat(self):
         concat([self.s, self.s])
 
@@ -46,6 +49,12 @@ def time_constructor_datetimes_with_nat(self):
         t.iloc[-1] = pd.NaT
         Categorical(t)
 
+    def time_constructor_with_nan(self):
+        Categorical(self.values_some_nan)
+
+    def time_constructor_all_nan(self):
+        Categorical(self.values_all_nan)
+
 
 class Categoricals2(object):
     goal_time = 0.2
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -41,6 +41,7 @@ Other API Changes
 ^^^^^^^^^^^^^^^^^
 
 - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`)
+- All-NaN levels in ``MultiIndex`` are now assigned float rather than object dtype, coherently with flat indexes (:issue:`17929`).
 - :class:`Timestamp` will no longer silently ignore unused or invalid `tz` or `tzinfo` arguments (:issue:`17690`)
 - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the `tseries.offsets` module (:issue:`17830`)
 -
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -288,6 +288,10 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
             self._dtype = dtype
             return
 
+        # null_mask indicates missing values we want to exclude from inference.
+        # This means: only missing values in list-likes (not arrays/ndframes).
+        null_mask = np.array(False)
+
         # sanitize input
         if is_categorical_dtype(values):
 
@@ -316,13 +320,14 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
             if not isinstance(values, np.ndarray):
                 values = _convert_to_list_like(values)
                 from pandas.core.series import _sanitize_array
-                # On list with NaNs, int values will be converted to float. Use
-                # "object" dtype to prevent this. In the end objects will be
-                # casted to int/... in the category assignment step.
-                if len(values) == 0 or isna(values).any():
+                # By convention, empty lists result in object dtype:
+                if len(values) == 0:
                     sanitize_dtype = 'object'
                 else:
                     sanitize_dtype = None
+                null_mask = isna(values)
+                if null_mask.any():
+                    values = [values[idx] for idx in np.where(~null_mask)[0]]
                 values = _sanitize_array(values, None, dtype=sanitize_dtype)
 
         if dtype.categories is None:
@@ -370,6 +375,12 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
                      "mean to use\n'Categorical.from_codes(codes, "
                      "categories)'?", RuntimeWarning, stacklevel=2)
 
+        if null_mask.any():
+            # Reinsert -1 placeholders for previously removed missing values
+            full_codes = - np.ones(null_mask.shape, dtype=codes.dtype)
+            full_codes[~null_mask] = codes
+            codes = full_codes
+
         self._dtype = dtype
         self._codes = coerce_indexer_dtype(codes, dtype.categories)
 
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
@@ -970,12 +970,13 @@ def test_get_level_values_na(self):
 
         arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]]
         index = pd.MultiIndex.from_arrays(arrays)
-        values = index.get_level_values(0)
-        expected = np.array([np.nan, np.nan, np.nan])
-        tm.assert_numpy_array_equal(values.values.astype(float), expected)
-        values = index.get_level_values(1)
-        expected = np.array(['a', np.nan, 1], dtype=object)
-        tm.assert_numpy_array_equal(values.values, expected)
+        result = index.get_level_values(0)
+        expected = pd.Index([np.nan, np.nan, np.nan])
+        tm.assert_index_equal(result, expected)
+
+        result = index.get_level_values(1)
+        expected = pd.Index(['a', np.nan, 1])
+        tm.assert_index_equal(result, expected)
 
         arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])]
         index = pd.MultiIndex.from_arrays(arrays)
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
@@ -648,7 +648,7 @@ def test_concat_categorical_coercion_nan(self):
         s1 = pd.Series([np.nan, np.nan], dtype='category')
         s2 = pd.Series([np.nan, np.nan])
 
-        exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=object)
+        exp = pd.Series([np.nan, np.nan, np.nan, np.nan])
         tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
         tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
         tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py
@@ -90,7 +90,8 @@ def test_union_categoricals_nan(self):
         tm.assert_categorical_equal(res, exp)
 
         # all NaN
-        res = union_categoricals([pd.Categorical([np.nan, np.nan]),
+        res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan],
+                                                          dtype=object)),
                                   pd.Categorical(['X'])])
         exp = Categorical([np.nan, np.nan, 'X'])
         tm.assert_categorical_equal(res, exp)
@@ -250,7 +251,7 @@ def test_union_categoricals_sort(self):
         c1 = Categorical([np.nan])
         c2 = Categorical([np.nan])
         result = union_categoricals([c1, c2], sort_categories=True)
-        expected = Categorical([np.nan, np.nan], categories=[])
+        expected = Categorical([np.nan, np.nan])
         tm.assert_categorical_equal(result, expected)
 
         c1 = Categorical([])
@@ -299,7 +300,7 @@ def test_union_categoricals_sort_false(self):
         c1 = Categorical([np.nan])
         c2 = Categorical([np.nan])
         result = union_categoricals([c1, c2], sort_categories=False)
-        expected = Categorical([np.nan, np.nan], categories=[])
+        expected = Categorical([np.nan, np.nan])
         tm.assert_categorical_equal(result, expected)
 
         c1 = Categorical([])

Original file line number	Diff line number	Diff line change
`@@ -41,6 +41,7 @@ Other API Changes`
`41`	`41`	`^^^^^^^^^^^^^^^^^`
`42`	`42`
`43`	`43`	- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`)
	`44`	+- All-NaN levels in ``MultiIndex`` are now assigned float rather than object dtype, coherently with flat indexes (:issue:`17929`).
`44`	`45`	- :class:`Timestamp` will no longer silently ignore unused or invalid `tz` or `tzinfo` arguments (:issue:`17690`)
`45`	`46`	- :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the `tseries.offsets` module (:issue:`17830`)
`46`	`47`	`-`