Skip to content

Commit 1fc80e6

Browse files
committed
BUG: categorical fixups, related GH3678
1 parent 1334684 commit 1fc80e6

File tree

2 files changed

+24
-29
lines changed

2 files changed

+24
-29
lines changed

pandas/core/categorical.py

+17-22
Original file line numberDiff line numberDiff line change
@@ -220,13 +220,12 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False,
220220
inferred = com._possibly_infer_to_datetimelike(values)
221221
if not isinstance(inferred, np.ndarray):
222222
from pandas.core.series import _sanitize_array
223-
safe_dtype = None
224-
if isinstance(values, list) and np.nan in values:
225-
# On list with NaNs, int values will be converted to float. Use "object" dtype
226-
# to prevent this. In the end objects will be cast to int/... in the level
227-
# assignment step.
228-
safe_dtype = "object"
229-
values = _sanitize_array(values, None, dtype=safe_dtype)
223+
224+
# On list with NaNs, int values will be converted to float. Use "object" dtype
225+
# to prevent this. In the end objects will be cast to int/... in the level
226+
# assignment step.
227+
dtype = object if com.isnull(values).any() else None
228+
values = _sanitize_array(values, None, dtype=dtype)
230229

231230
if levels is None:
232231
# object is needed to preserve ints in case we have np.nan in values
@@ -932,24 +931,20 @@ def describe(self):
932931
).groupby('codes').count()
933932

934933
freqs = counts / float(counts.sum())
935-
936934
from pandas.tools.merge import concat
937935
result = concat([counts,freqs],axis=1)
938936
result.columns = ['counts','freqs']
939937

940-
# Up to now we have codes -> fill in the levels
941-
# object in case we need to handle NaNs
942-
levels = np.asarray(self.levels, dtype=object)
943-
# use arange to also include not used levels
944-
index = np.arange(0, len(levels))
945-
# handle nan
946-
if -1 in result.index:
947-
# take[...,-1] returns the last element. So put np.nan there...
948-
levels = np.append(levels, np.nan)
949-
# also sort the -1 to the last position in the index
950-
index = np.append(index, -1)
951-
result = result.reindex(index)
952-
result.index = levels.take(result.index)
953-
result.index.name = 'levels'
938+
check = counts.index == -1
939+
if check.any():
940+
l = len(self.levels) if com.isnull(self.levels).any() else len(self.levels)+1
941+
index = np.arange(0,l,dtype=object)
942+
index[~check] = self.levels.take(counts.index[~check])
943+
index[check] = np.nan
944+
result.index = index
945+
else:
946+
result.index = self.levels.take(counts.index)
947+
result = result.reindex(self.levels)
954948

949+
result.index.name = 'levels'
955950
return result

pandas/tests/test_categorical.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -240,20 +240,20 @@ def test_describe(self):
240240
# describe should work with NaN
241241
cat = pd.Categorical([np.nan,1, 2, 2])
242242
desc = cat.describe()
243-
expected = DataFrame.from_dict(dict(counts=[1, 2, 1],
244-
freqs=[1/4., 2/4., 1/4.],
245-
levels=[1,2,np.nan]
243+
expected = DataFrame.from_dict(dict(counts=[1, 1, 2],
244+
freqs=[1/4., 1/4., 2/4.],
245+
levels=[np.nan,1,2]
246246
)
247247
).set_index('levels')
248248
tm.assert_frame_equal(desc, expected)
249249

250-
# having NaN as level and as "not available" should also print two NaNs in describe!
250+
# FIXME: this seems very wrong: having NaN as level and as "not available" should also print two NaNs in describe!
251251
cat = pd.Categorical([np.nan,1, 2, 2])
252252
cat.levels = [1,2,np.nan]
253253
desc = cat.describe()
254-
expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1],
255-
freqs=[1/4., 2/4., np.nan, 1/4.],
256-
levels=[1,2,np.nan,np.nan]
254+
expected = DataFrame.from_dict(dict(counts=[1, 1, 2],
255+
freqs=[1/4., 1/4., 2/4.],
256+
levels=[np.nan,1,2]
257257
)
258258
).set_index('levels')
259259
tm.assert_frame_equal(desc, expected)

0 commit comments

Comments
 (0)