diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b4aa6447c0a1b..bad06329c4bfa 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -838,6 +838,7 @@ Other - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`MultiIndex.fillna` error message was referring to ``isna`` instead of ``fillna`` (:issue:`60974`) +- Bug in :meth:`Series.describe` where the median percentile was always included when the ``percentiles`` argument was passed (:issue:`60550`). - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0c3f535df9ce2..a7a287de0241e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10818,9 +10818,8 @@ def describe( ---------- percentiles : list-like of numbers, optional The percentiles to include in the output. All should - fall between 0 and 1. The default is - ``[.25, .5, .75]``, which returns the 25th, 50th, and - 75th percentiles. + fall between 0 and 1. The default, ``None``, will automatically + return the 25th, 50th, and 75th percentiles. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the result. Ignored for ``Series``. 
Here are the options: diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index 17d4d38c97f33..944e28a9b0238 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -229,10 +229,15 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: formatted_percentiles = format_percentiles(percentiles) + if len(percentiles) == 0: + quantiles = [] + else: + quantiles = series.quantile(percentiles).tolist() + stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] d = ( [series.count(), series.mean(), series.std(), series.min()] - + series.quantile(percentiles).tolist() + + quantiles + [series.max()] ) # GH#48340 - always return float on non-complex numeric data @@ -354,10 +359,6 @@ def _refine_percentiles( # get them all to be in [0, 1] validate_percentile(percentiles) - # median should always be included - if 0.5 not in percentiles: - percentiles.append(0.5) - percentiles = np.asarray(percentiles) # sort and check for duplicates diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b7fbc4e5e22b7..fb799361fea67 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1565,6 +1565,9 @@ def format_percentiles( >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] """ + if len(percentiles) == 0: + return [] + percentiles = np.asarray(percentiles) # It checks for np.nan as well diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index e9206e86b7b08..50656ca85e90a 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -413,3 +413,44 @@ def test_describe_exclude_pa_dtype(self): dtype=pd.ArrowDtype(pa.float64()), ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("percentiles", [None, [], [0.2]]) + def test_refine_percentiles(self, percentiles): + """ 
+ Test that the percentiles are returned correctly depending on the `percentiles` + argument. + - The default behavior is to return the 25th, 50th, and 75th percentiles + - If `percentiles` is an empty list, no percentiles are returned + - If `percentiles` is a non-empty list, only those percentiles are returned + """ + # GH#60550 + df = DataFrame({"a": np.arange(0, 10, 1)}) + + result = df.describe(percentiles=percentiles) + + if percentiles is None: + percentiles = [0.25, 0.5, 0.75] + + expected = DataFrame( + [ + len(df.a), + df.a.mean(), + df.a.std(), + df.a.min(), + *[df.a.quantile(p) for p in percentiles], + df.a.max(), + ], + index=pd.Index( + [ + "count", + "mean", + "std", + "min", + *[f"{p:.0%}" for p in percentiles], + "max", + ] + ), + columns=["a"], + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index 6c4b913574d9e..72bdfebc5eeb7 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -202,15 +202,15 @@ def test_describe_duplicate_columns(): gb = df.groupby(df[1]) result = gb.describe(percentiles=[]) - columns = ["count", "mean", "std", "min", "50%", "max"] + columns = ["count", "mean", "std", "min", "max"] frames = [ - DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) + DataFrame([[1.0, val, np.nan, val, val]], index=[1], columns=columns) for val in (0.0, 2.0, 3.0) ] expected = pd.concat(frames, axis=1) expected.columns = MultiIndex( levels=[[0, 2], columns], - codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], + codes=[5 * [0] + 5 * [1] + 5 * [0], 3 * list(range(5))], ) expected.index.names = [1] tm.assert_frame_equal(result, expected)