Skip to content

Commit 9a6bd6e

Browse files
committed
BUG: Fix describe(): percentiles (#13104), col index (#13288)
BUG #13104: - Percentile identifiers are now rounded to the least precision that keeps them unique. - Supplying duplicates in percentiles will raise ValueError. BUG #13288 - Fixed a column index of the output data frame. Previously, if a data frame had a column index of object type and the index contained numeric values, the output column index could be corrupt. It led to ValueError if the output was displayed. - describe() will raise ValueError with an informative message on DataFrame without columns.
1 parent 9e7bfdd commit 9a6bd6e

File tree

5 files changed

+231
-22
lines changed

5 files changed

+231
-22
lines changed

doc/source/whatsnew/v0.18.2.txt

+69
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,75 @@ resulting dtype will be upcast (unchanged from previous).
228228
pd.merge(df1, df2, how='outer', on='key')
229229
pd.merge(df1, df2, how='outer', on='key').dtypes
230230

231+
.. _whatsnew_0182.api.describe:
232+
233+
``.describe()`` changes
234+
^^^^^^^^^^^^^^^^^^^^^^^
235+
236+
Percentile identifiers in the index of a ``.describe()`` output will now be rounded to the least precision that keeps them distinct (:issue:`13104`)
237+
238+
.. ipython:: python
239+
240+
s = pd.Series([0, 1, 2, 3, 4])
241+
df = pd.DataFrame([0, 1, 2, 3, 4])
242+
243+
Previous Behavior:
244+
245+
They were rounded to at most one decimal place, so distinct percentiles could collapse to the same label, which could raise ``ValueError`` for a data frame.
246+
247+
.. code-block:: ipython
248+
249+
In [3]: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])
250+
Out[3]:
251+
count 5.000000
252+
mean 2.000000
253+
std 1.581139
254+
min 0.000000
255+
0.0% 0.000400
256+
0.1% 0.002000
257+
0.1% 0.004000
258+
50% 2.000000
259+
99.9% 3.996000
260+
100.0% 3.998000
261+
100.0% 3.999600
262+
max 4.000000
263+
dtype: float64
264+
265+
In [4]: df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])
266+
Out[4]:
267+
...
268+
ValueError: cannot reindex from a duplicate axis
269+
270+
New Behavior:
271+
272+
.. ipython:: python
273+
274+
s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])
275+
df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])
276+
277+
In addition to this, both ``Series.describe()`` and ``DataFrame.describe()`` will now raise ``ValueError`` if the passed ``percentiles`` contain duplicates.
278+
279+
Another bug is fixed that could raise ``TypeError`` when a column index of a data frame contained entries of different types (:issue:`13288`)
280+
281+
.. ipython:: python
282+
283+
df = pd.DataFrame({'A': list("BCDE"), 0: [1, 2, 3, 4]})
284+
285+
Previous Behavior:
286+
287+
.. code-block:: ipython
288+
289+
In [8]: df.describe()
290+
Out[8]:
291+
...
292+
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'
293+
294+
New Behavior:
295+
296+
.. ipython:: python
297+
298+
df.describe()
299+
231300
.. _whatsnew_0182.api.other:
232301

233302
Other API changes

pandas/core/generic.py

+22-21
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import pandas.core.missing as missing
2121
import pandas.core.datetools as datetools
2222
from pandas.formats.printing import pprint_thing
23+
from pandas.formats.format import format_percentiles
2324
from pandas import compat
2425
from pandas.compat.numpy import function as nv
2526
from pandas.compat import (map, zip, lrange, string_types,
@@ -4868,32 +4869,33 @@ def abs(self):
48684869
@Appender(_shared_docs['describe'] % _shared_doc_kwargs)
48694870
def describe(self, percentiles=None, include=None, exclude=None):
48704871
if self.ndim >= 3:
4871-
msg = "describe is not implemented on on Panel or PanelND objects."
4872+
msg = "describe is not implemented on Panel or PanelND objects."
48724873
raise NotImplementedError(msg)
4874+
elif self.ndim == 2 and self.columns.size == 0:
4875+
raise ValueError("Cannot describe a DataFrame without columns")
48734876

48744877
if percentiles is not None:
48754878
# get them all to be in [0, 1]
48764879
self._check_percentile(percentiles)
4880+
4881+
# median should always be included
4882+
if 0.5 not in percentiles:
4883+
percentiles.append(0.5)
48774884
percentiles = np.asarray(percentiles)
48784885
else:
48794886
percentiles = np.array([0.25, 0.5, 0.75])
48804887

4881-
# median should always be included
4882-
if (percentiles != 0.5).all(): # median isn't included
4883-
lh = percentiles[percentiles < .5]
4884-
uh = percentiles[percentiles > .5]
4885-
percentiles = np.hstack([lh, 0.5, uh])
4888+
# sort and check for duplicates
4889+
unique_pcts = np.unique(percentiles)
4890+
if len(unique_pcts) < len(percentiles):
4891+
raise ValueError("percentiles cannot contain duplicates")
4892+
percentiles = unique_pcts
48864893

4887-
def pretty_name(x):
4888-
x *= 100
4889-
if x == int(x):
4890-
return '%.0f%%' % x
4891-
else:
4892-
return '%.1f%%' % x
4894+
formatted_percentiles = format_percentiles(percentiles)
48934895

4894-
def describe_numeric_1d(series, percentiles):
4896+
def describe_numeric_1d(series):
48954897
stat_index = (['count', 'mean', 'std', 'min'] +
4896-
[pretty_name(x) for x in percentiles] + ['max'])
4898+
formatted_percentiles + ['max'])
48974899
d = ([series.count(), series.mean(), series.std(), series.min()] +
48984900
[series.quantile(x) for x in percentiles] + [series.max()])
48994901
return pd.Series(d, index=stat_index, name=series.name)
@@ -4918,18 +4920,18 @@ def describe_categorical_1d(data):
49184920

49194921
return pd.Series(result, index=names, name=data.name)
49204922

4921-
def describe_1d(data, percentiles):
4923+
def describe_1d(data):
49224924
if com.is_bool_dtype(data):
49234925
return describe_categorical_1d(data)
49244926
elif com.is_numeric_dtype(data):
4925-
return describe_numeric_1d(data, percentiles)
4927+
return describe_numeric_1d(data)
49264928
elif com.is_timedelta64_dtype(data):
4927-
return describe_numeric_1d(data, percentiles)
4929+
return describe_numeric_1d(data)
49284930
else:
49294931
return describe_categorical_1d(data)
49304932

49314933
if self.ndim == 1:
4932-
return describe_1d(self, percentiles)
4934+
return describe_1d(self)
49334935
elif (include is None) and (exclude is None):
49344936
if len(self._get_numeric_data()._info_axis) > 0:
49354937
# when some numerics are found, keep only numerics
@@ -4944,7 +4946,7 @@ def describe_1d(data, percentiles):
49444946
else:
49454947
data = self.select_dtypes(include=include, exclude=exclude)
49464948

4947-
ldesc = [describe_1d(s, percentiles) for _, s in data.iteritems()]
4949+
ldesc = [describe_1d(s) for _, s in data.iteritems()]
49484950
# set a convenient order for rows
49494951
names = []
49504952
ldesc_indexes = sorted([x.index for x in ldesc], key=len)
@@ -4954,8 +4956,7 @@ def describe_1d(data, percentiles):
49544956
names.append(name)
49554957

49564958
d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
4957-
d.columns = self.columns._shallow_copy(values=d.columns.values)
4958-
d.columns.names = data.columns.names
4959+
d.columns = data.columns.copy()
49594960
return d
49604961

49614962
def _check_percentile(self, q):

pandas/formats/format.py

+62-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import sys
77

88
from pandas.core.base import PandasObject
9-
from pandas.core.common import isnull, notnull
9+
from pandas.core.common import isnull, notnull, is_numeric_dtype
1010
from pandas.core.index import Index, MultiIndex, _ensure_index
1111
from pandas import compat
1212
from pandas.compat import (StringIO, lzip, range, map, zip, reduce, u,
@@ -2260,6 +2260,67 @@ def _format_strings(self):
22602260
return fmt_values
22612261

22622262

2263+
def format_percentiles(percentiles):
    """
    Outputs rounded and formatted percentiles.

    Parameters
    ----------
    percentiles : list-like, containing floats from interval [0,1]

    Returns
    -------
    formatted : list of strings

    Raises
    ------
    ValueError
        If ``percentiles`` is not numeric, or any entry (NaN included)
        lies outside the interval [0,1].

    Notes
    -----
    Rounding precision is chosen so that: (1) if any two elements of
    ``percentiles`` differ, they remain different after rounding
    (2) no entry is *rounded* to 0% or 100%.
    Any non-integer is always rounded to at least 1 decimal place.

    A percentage counts as an integer when it matches a whole number up
    to floating-point error, so ``0.29`` (``100 * 0.29`` evaluates to
    ``28.999999999999996``) is reported as ``'29%'``, not ``'29.0%'``.

    Examples
    --------
    Keeps all entries different after rounding:

    >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
    ['1.999%', '2.001%', '50%', '66.667%', '99.99%']

    No element is rounded to 0% or 100% (unless already equal to it).
    Duplicates are allowed:

    >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
    ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']

    Whole percentages are recognised despite binary rounding error:

    >>> format_percentiles([0.281, 0.29, 0.57, 0.58])
    ['28.1%', '29%', '57%', '58%']
    """

    percentiles = np.asarray(percentiles)

    # NaN fails both comparisons, so it is rejected here as well
    if (not is_numeric_dtype(percentiles) or
            not np.all(percentiles >= 0) or
            not np.all(percentiles <= 1)):
        raise ValueError("percentiles should all be in the interval [0,1]")

    percentiles = 100 * percentiles

    # Truncating with astype(int) and comparing exactly misclassifies
    # values such as 100 * 0.29; compare against the nearest whole number
    # with an absolute tolerance far above the multiplication error but
    # far below any meaningful percentile spacing.
    nearest = np.round(percentiles)
    int_idx = np.isclose(percentiles, nearest, rtol=0, atol=1e-12)

    if np.all(int_idx):
        out = nearest.astype(int).astype(str)
        return [i + '%' for i in out]

    # Snap whole percentages so rounding noise cannot inflate the precision
    percentiles = np.where(int_idx, nearest, percentiles)

    unique_pcts = np.unique(percentiles)
    # Distances to 0 and 100 are included so that nothing gets rounded
    # onto those endpoints; they are skipped when an endpoint is present.
    to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None
    to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None
    gaps = np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end)

    # Least precision that keeps percentiles unique after rounding
    prec = -np.floor(np.log10(np.min(gaps))).astype(int)
    prec = max(1, prec)

    out = np.empty_like(percentiles, dtype=object)
    out[int_idx] = percentiles[int_idx].astype(int).astype(str)
    out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)
    return [i + '%' for i in out]
2322+
2323+
22632324
def _is_dates_only(values):
22642325
# return a boolean if we are only dates (and don't have a timezone)
22652326
values = DatetimeIndex(values)

pandas/tests/formats/test_format.py

+15
Original file line numberDiff line numberDiff line change
@@ -4264,6 +4264,21 @@ def test_nat_representations(self):
42644264
self.assertEqual(f(pd.NaT), 'NaT')
42654265

42664266

4267+
def test_format_percentiles():
    """Check the labels built by ``fmt.format_percentiles`` and its validation."""
    # (input percentiles, expected labels)
    cases = [
        ([0.01999, 0.02001, 0.5, 0.666666, 0.9999],
         ['1.999%', '2.001%', '50%', '66.667%', '99.99%']),
        ([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999],
         ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']),
    ]
    for percentiles, expected in cases:
        result = fmt.format_percentiles(percentiles)
        tm.assert_equal(result, expected)

    # NaN, out-of-range and non-numeric entries must all raise ValueError
    invalid_inputs = [
        [0.1, np.nan, 0.5],
        [-0.001, 0.1, 0.5],
        [2, 0.1, 0.5],
        [0.1, 0.5, 'a'],
    ]
    for bad in invalid_inputs:
        tm.assertRaises(ValueError, fmt.format_percentiles, bad)
4280+
4281+
42674282
if __name__ == '__main__':
42684283
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
42694284
exit=False)

pandas/tests/test_generic.py

+63
Original file line numberDiff line numberDiff line change
@@ -996,6 +996,59 @@ def test_describe_percentiles_insert_median(self):
996996
self.assertTrue('0%' in d1.index)
997997
self.assertTrue('100%' in d2.index)
998998

999+
def test_describe_percentiles_unique(self):
    """GH13104: duplicated entries in ``percentiles`` raise ``ValueError``."""
    frame = tm.makeDataFrame()
    # 0.2 is repeated in both lists; the first also names the median
    duplicated = (
        [0.1, 0.2, 0.4, 0.5, 0.2, 0.6],
        [0.1, 0.2, 0.4, 0.2, 0.6],
    )
    for pcts in duplicated:
        with self.assertRaises(ValueError):
            frame.describe(percentiles=pcts)
1006+
1007+
def test_describe_percentiles_formatting(self):
    """GH13104: check the percentile labels in the ``describe()`` index."""
    frame = tm.makeDataFrame()

    def check(pct_labels, **describe_kwargs):
        # the percentile labels sit between the fixed leading stats and 'max'
        result = frame.describe(**describe_kwargs).index
        expected = Index(['count', 'mean', 'std', 'min'] + pct_labels +
                         ['max'],
                         dtype='object')
        tm.assert_index_equal(result, expected)

    # default
    check(['25%', '50%', '75%'])

    check(['0.01%', '0.05%', '0.1%', '50%', '99.9%', '99.95%', '99.99%'],
          percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])

    check(['0.499%', '0.5%', '25%', '50%', '75%'],
          percentiles=[0.00499, 0.005, 0.25, 0.50, 0.75])

    check(['0.5%', '1.0%', '25%', '50%', '75%'],
          percentiles=[0.00499, 0.01001, 0.25, 0.50, 0.75])
1038+
1039+
def test_describe_column_index_type(self):
    """GH13288: ``describe()`` output columns are an object Index holding 0."""
    expected = Index([0], dtype=object)

    # a single numeric label held in an object-dtype column index
    single = pd.DataFrame([1, 2, 3, 4])
    single.columns = pd.Index([0], dtype=object)
    tm.assert_index_equal(single.describe().columns, expected)

    # string and integer column labels together; only column 0 is expected
    mixed = pd.DataFrame({'A': list("BCDE"), 0: [1, 2, 3, 4]})
    tm.assert_index_equal(mixed.describe().columns, expected)
1051+
9991052
def test_describe_no_numeric(self):
10001053
df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
10011054
'B': ['a', 'b', 'c', 'd'] * 6})
@@ -1010,6 +1063,16 @@ def test_describe_no_numeric(self):
10101063
desc = df.describe()
10111064
self.assertEqual(desc.time['first'], min(ts.index))
10121065

1066+
def test_describe_empty(self):
    """Check ``describe()`` on a frame with no columns and one with no rows."""
    # no columns at all: rejected with an informative message
    no_columns = DataFrame()
    tm.assertRaisesRegexp(ValueError, 'DataFrame without columns',
                          no_columns.describe)

    # columns but no rows: expect zero-filled 'count' and 'unique' rows
    no_rows = DataFrame(columns=['A', 'B'])
    expected = DataFrame(0, columns=['A', 'B'], index=['count', 'unique'])
    tm.assert_frame_equal(no_rows.describe(), expected)
1075+
10131076
def test_describe_empty_int_columns(self):
10141077
df = DataFrame([[0, 1], [1, 2]])
10151078
desc = df[df[0] < 0].describe() # works

0 commit comments

Comments
 (0)