Skip to content

Commit 1334684

Browse files
committed
Categorical: fix describe with np.nan
1 parent 704c505 commit 1334684

File tree

2 files changed

+49
-3
lines changed

2 files changed

+49
-3
lines changed

pandas/core/categorical.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -931,12 +931,25 @@ def describe(self):
931931
'values' : self._codes }
932932
).groupby('codes').count()
933933

934-
counts.index = self.levels.take(counts.index)
935-
counts = counts.reindex(self.levels)
936934
freqs = counts / float(counts.sum())
937935

938936
from pandas.tools.merge import concat
939937
result = concat([counts,freqs],axis=1)
940-
result.index.name = 'levels'
941938
result.columns = ['counts','freqs']
939+
940+
# Up to now we have codes -> fill in the levels
941+
# use object dtype in case we need to handle NaNs
942+
levels = np.asarray(self.levels, dtype=object)
943+
# use arange to also include unused levels
944+
index = np.arange(0, len(levels))
945+
# handle nan
946+
if -1 in result.index:
947+
# take() with index -1 returns the last element, so append np.nan there...
948+
levels = np.append(levels, np.nan)
949+
# also place the -1 code at the last position in the index
950+
index = np.append(index, -1)
951+
result = result.reindex(index)
952+
result.index = levels.take(result.index)
953+
result.index.name = 'levels'
954+
942955
return result

pandas/tests/test_categorical.py

+33
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,16 @@ def test_describe(self):
217217
).set_index('levels')
218218
tm.assert_frame_equal(desc, expected)
219219

220+
# check unused levels
221+
cat = self.factor.copy()
222+
cat.levels = ["a","b","c","d"]
223+
desc = cat.describe()
224+
expected = DataFrame.from_dict(dict(counts=[3, 2, 3, np.nan],
225+
freqs=[3/8., 2/8., 3/8., np.nan],
226+
levels=['a', 'b', 'c', 'd'])
227+
).set_index('levels')
228+
tm.assert_frame_equal(desc, expected)
229+
220230
# check an integer one
221231
desc = Categorical([1,2,3,1,2,3,3,2,1,1,1]).describe()
222232
expected = DataFrame.from_dict(dict(counts=[5, 3, 3],
@@ -226,6 +236,29 @@ def test_describe(self):
226236
).set_index('levels')
227237
tm.assert_frame_equal(desc, expected)
228238

239+
# https://github.com/pydata/pandas/issues/3678
240+
# describe should work with NaN
241+
cat = pd.Categorical([np.nan,1, 2, 2])
242+
desc = cat.describe()
243+
expected = DataFrame.from_dict(dict(counts=[1, 2, 1],
244+
freqs=[1/4., 2/4., 1/4.],
245+
levels=[1,2,np.nan]
246+
)
247+
).set_index('levels')
248+
tm.assert_frame_equal(desc, expected)
249+
250+
# having NaN both as a level and as "not available" should also show two NaNs in describe!
251+
cat = pd.Categorical([np.nan,1, 2, 2])
252+
cat.levels = [1,2,np.nan]
253+
desc = cat.describe()
254+
expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1],
255+
freqs=[1/4., 2/4., np.nan, 1/4.],
256+
levels=[1,2,np.nan,np.nan]
257+
)
258+
).set_index('levels')
259+
tm.assert_frame_equal(desc, expected)
260+
261+
229262
def test_print(self):
230263
expected = [" a", " b", " b", " a", " a", " c", " c", " c",
231264
"Levels (3, object): [a < b < c]"]

0 commit comments

Comments
 (0)