Skip to content

Commit 2c6d005

Browse files
enisnazif authored and jreback committed
Fix the output of df.describe on an empty categorical / object column (#26474)
1 parent 9ebbe1b commit 2c6d005

File tree

4 files changed

+46
-1
lines changed

4 files changed

+46
-1
lines changed

doc/source/whatsnew/v0.25.0.rst

+28
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,34 @@ are returned. (:issue:`21521`)
253253
254254
df.groupby("a").ffill()
255255
256+
``DataFrame`` describe on an empty categorical / object column will return top and freq
257+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
258+
259+
When calling :meth:`DataFrame.describe` with an empty categorical / object
260+
column, the 'top' and 'freq' rows were previously omitted, which was inconsistent with
261+
the output for non-empty columns. Now the 'top' and 'freq' rows will always be included,
262+
with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397`)
263+
264+
.. ipython:: python
265+
266+
df = pd.DataFrame({"empty_col": pd.Categorical([])})
267+
df
268+
269+
*Previous Behavior*:
270+
271+
.. code-block:: python
272+
273+
In [3]: df.describe()
274+
Out[3]:
275+
empty_col
276+
count 0
277+
unique 0
278+
279+
*New Behavior*:
280+
281+
.. ipython:: python
282+
283+
df.describe()
256284
257285
``__str__`` methods now call ``__repr__`` rather than vice versa
258286
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

pandas/core/arrays/categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1483,7 +1483,7 @@ def value_counts(self, dropna=True):
14831483

14841484
if dropna or clean:
14851485
obs = code if clean else code[mask]
1486-
count = bincount(obs, minlength=ncat or None)
1486+
count = bincount(obs, minlength=ncat or 0)
14871487
else:
14881488
count = bincount(np.where(mask, code, ncat))
14891489
ix = np.append(ix, -1)

pandas/core/generic.py

+6
Original file line numberDiff line numberDiff line change
@@ -9920,6 +9920,12 @@ def describe_categorical_1d(data):
99209920
names += ['top', 'freq']
99219921
result += [top, freq]
99229922

9923+
# If the DataFrame is empty, set 'top' and 'freq' to None
9924+
# to maintain output shape consistency
9925+
else:
9926+
names += ['top', 'freq']
9927+
result += [None, None]
9928+
99239929
return pd.Series(result, index=names, name=data.name)
99249930

99259931
def describe_1d(data):

pandas/tests/frame/test_analytics.py

+11
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,16 @@ def test_describe_categorical(self):
588588
result = df3.describe()
589589
tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
590590

591+
def test_describe_empty_categorical_column(self):
592+
# GH 26397
593+
# Ensure the index of an empty categorical DataFrame column
594+
# also contains (count, unique, top, freq)
595+
df = pd.DataFrame({"empty_col": Categorical([])})
596+
result = df.describe()
597+
expected = DataFrame({'empty_col': [0, 0, None, None]},
598+
index=['count', 'unique', 'top', 'freq'])
599+
tm.assert_frame_equal(result, expected)
600+
591601
def test_describe_categorical_columns(self):
592602
# GH 11558
593603
columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
@@ -608,6 +618,7 @@ def test_describe_categorical_columns(self):
608618
index=['count', 'mean', 'std', 'min', '25%',
609619
'50%', '75%', 'max'],
610620
columns=exp_columns)
621+
611622
tm.assert_frame_equal(result, expected)
612623
tm.assert_categorical_equal(result.columns.values,
613624
expected.columns.values)

0 commit comments

Comments
 (0)