BUG: categorical fixups, related GH3678

jreback · jreback · commit 1fc80e6c2c16 · 2014-07-16T16:48:58.000-04:00
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -220,13 +220,12 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False,
             inferred = com._possibly_infer_to_datetimelike(values)
             if not isinstance(inferred, np.ndarray):
                 from pandas.core.series import _sanitize_array
-                safe_dtype = None
-                if isinstance(values, list) and np.nan in values:
-                    # On list with NaNs, int values will be converted to float. Use "object" dtype
-                    # to prvent this. In the end objects will be casted to int/... in the level
-                    # assignment step.
-                    safe_dtype = "object"
-                values = _sanitize_array(values, None, dtype=safe_dtype)
+
+                # When values contain NaNs, int values would be converted to float. Use
+                # "object" dtype to prevent this. In the end the objects will be cast back
+                # to int/... in the level assignment step.
+                dtype = object if com.isnull(values).any() else None
+                values = _sanitize_array(values, None, dtype=dtype)
 
         if levels is None:
             # object is needed to preserve ints in case we have np.nan in values
@@ -932,24 +931,20 @@ def describe(self):
                            ).groupby('codes').count()
 
         freqs = counts / float(counts.sum())
-
         from pandas.tools.merge import concat
         result = concat([counts,freqs],axis=1)
         result.columns = ['counts','freqs']
 
-        # Up to now we have codes -> fill in the levels
-        # object in case we need to handle NaNs
-        levels = np.asarray(self.levels, dtype=object)
-        # use arange to also include not used levels
-        index = np.arange(0, len(levels))
-        # handle nan
-        if -1 in result.index:
-            # take[...,-1] returns the last element. So put np.nan there...
-            levels = np.append(levels, np.nan)
-            # also sort the -1 to the last position in the index
-            index = np.append(index, -1)
-        result = result.reindex(index)
-        result.index = levels.take(result.index)
-        result.index.name = 'levels'
+        check = counts.index == -1
+        if check.any():
+            l = len(self.levels) if com.isnull(self.levels).any() else len(self.levels)+1
+            index = np.arange(0,l,dtype=object)
+            index[~check] = self.levels.take(counts.index[~check])
+            index[check] = np.nan
+            result.index = index
+        else:
+            result.index = self.levels.take(counts.index)
+            result = result.reindex(self.levels)
 
+        result.index.name = 'levels'
         return result
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -240,20 +240,20 @@ def test_describe(self):
         # describe should work with NaN
         cat = pd.Categorical([np.nan,1, 2, 2])
         desc = cat.describe()
-        expected = DataFrame.from_dict(dict(counts=[1, 2, 1],
-                                            freqs=[1/4., 2/4., 1/4.],
-                                            levels=[1,2,np.nan]
+        expected = DataFrame.from_dict(dict(counts=[1, 1, 2],
+                                            freqs=[1/4., 1/4., 2/4.],
+                                            levels=[np.nan,1,2]
                                             )
                                             ).set_index('levels')
         tm.assert_frame_equal(desc, expected)
 
-        # having NaN as level and as "not available" should also print two NaNs in describe!
+        # FIXME: this seems very wrong: having NaN both as a level and as "not available" should print two NaNs in describe!
         cat = pd.Categorical([np.nan,1, 2, 2])
         cat.levels = [1,2,np.nan]
         desc = cat.describe()
-        expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1],
-                                            freqs=[1/4., 2/4., np.nan, 1/4.],
-                                            levels=[1,2,np.nan,np.nan]
+        expected = DataFrame.from_dict(dict(counts=[1, 1, 2],
+                                            freqs=[1/4., 1/4., 2/4.],
+                                            levels=[np.nan,1,2]
                                             )
                                             ).set_index('levels')
         tm.assert_frame_equal(desc, expected)