diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 2b67aca1dcf74..88dcda444a09d 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -228,6 +228,75 @@ resulting dtype will be upcast (unchanged from previous). pd.merge(df1, df2, how='outer', on='key') pd.merge(df1, df2, how='outer', on='key').dtypes +.. _whatsnew_0182.api.describe: + +``.describe()`` changes +^^^^^^^^^^^^^^^^^^^^^^^ + +Percentile identifiers in the index of a ``.describe()`` output will now be rounded to the least precision that keeps them distinct (:issue:`13104`) + +.. ipython:: python + + s = pd.Series([0, 1, 2, 3, 4]) + df = pd.DataFrame([0, 1, 2, 3, 4]) + +Previous Behavior: + +They were rounded to at most one decimal place, which could raise ``ValueError`` for a data frame. + +.. code-block:: ipython + + In [3]: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + Out[3]: + count 5.000000 + mean 2.000000 + std 1.581139 + min 0.000000 + 0.0% 0.000400 + 0.1% 0.002000 + 0.1% 0.004000 + 50% 2.000000 + 99.9% 3.996000 + 100.0% 3.998000 + 100.0% 3.999600 + max 4.000000 + dtype: float64 + + In [4]: df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + Out[4]: + ... + ValueError: cannot reindex from a duplicate axis + +New Behavior: + +.. ipython:: python + + s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + +In addition to this, both ``Series.describe()`` and ``DataFrame.describe()`` will now raise ``ValueError`` if the passed ``percentiles`` contain duplicates. + +Another bug is fixed that could raise ``TypeError`` when a column index of a data frame contained entries of different types (:issue:`13288`) + +.. ipython:: python + + df = pd.DataFrame({'A': list("BCDE"), 0: [1, 2, 3, 4]}) + +Previous Behavior: + +.. code-block:: ipython + + In [8]: df.describe() + Out[8]: + ... 
+ ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long' + +New Behavior: + +.. ipython:: python + + df.describe() + .. _whatsnew_0182.api.other: Other API changes diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 99599d2b04a45..9ecaaebc2b523 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -20,6 +20,7 @@ import pandas.core.missing as missing import pandas.core.datetools as datetools from pandas.formats.printing import pprint_thing +from pandas.formats.format import format_percentiles from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lrange, string_types, @@ -4868,32 +4869,33 @@ def abs(self): @Appender(_shared_docs['describe'] % _shared_doc_kwargs) def describe(self, percentiles=None, include=None, exclude=None): if self.ndim >= 3: - msg = "describe is not implemented on on Panel or PanelND objects." + msg = "describe is not implemented on Panel or PanelND objects." raise NotImplementedError(msg) + elif self.ndim == 2 and self.columns.size == 0: + raise ValueError("Cannot describe a DataFrame without columns") if percentiles is not None: # get them all to be in [0, 1] self._check_percentile(percentiles) + + # median should always be included + if 0.5 not in percentiles: + percentiles.append(0.5) percentiles = np.asarray(percentiles) else: percentiles = np.array([0.25, 0.5, 0.75]) - # median should always be included - if (percentiles != 0.5).all(): # median isn't included - lh = percentiles[percentiles < .5] - uh = percentiles[percentiles > .5] - percentiles = np.hstack([lh, 0.5, uh]) + # sort and check for duplicates + unique_pcts = np.unique(percentiles) + if len(unique_pcts) < len(percentiles): + raise ValueError("percentiles cannot contain duplicates") + percentiles = unique_pcts - def pretty_name(x): - x *= 100 - if x == int(x): - return '%.0f%%' % x - else: - return '%.1f%%' % x + formatted_percentiles = format_percentiles(percentiles) - def 
describe_numeric_1d(series, percentiles): + def describe_numeric_1d(series): stat_index = (['count', 'mean', 'std', 'min'] + - [pretty_name(x) for x in percentiles] + ['max']) + formatted_percentiles + ['max']) d = ([series.count(), series.mean(), series.std(), series.min()] + [series.quantile(x) for x in percentiles] + [series.max()]) return pd.Series(d, index=stat_index, name=series.name) @@ -4918,18 +4920,18 @@ def describe_categorical_1d(data): return pd.Series(result, index=names, name=data.name) - def describe_1d(data, percentiles): + def describe_1d(data): if com.is_bool_dtype(data): return describe_categorical_1d(data) elif com.is_numeric_dtype(data): - return describe_numeric_1d(data, percentiles) + return describe_numeric_1d(data) elif com.is_timedelta64_dtype(data): - return describe_numeric_1d(data, percentiles) + return describe_numeric_1d(data) else: return describe_categorical_1d(data) if self.ndim == 1: - return describe_1d(self, percentiles) + return describe_1d(self) elif (include is None) and (exclude is None): if len(self._get_numeric_data()._info_axis) > 0: # when some numerics are found, keep only numerics @@ -4944,7 +4946,7 @@ def describe_1d(data, percentiles): else: data = self.select_dtypes(include=include, exclude=exclude) - ldesc = [describe_1d(s, percentiles) for _, s in data.iteritems()] + ldesc = [describe_1d(s) for _, s in data.iteritems()] # set a convenient order for rows names = [] ldesc_indexes = sorted([x.index for x in ldesc], key=len) @@ -4954,8 +4956,7 @@ def describe_1d(data, percentiles): names.append(name) d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1) - d.columns = self.columns._shallow_copy(values=d.columns.values) - d.columns.names = data.columns.names + d.columns = data.columns.copy() return d def _check_percentile(self, q): diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 70b506a1415c1..ecdfbc3cc4c71 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -6,7 +6,7 @@ 
import sys from pandas.core.base import PandasObject -from pandas.core.common import isnull, notnull +from pandas.core.common import isnull, notnull, is_numeric_dtype from pandas.core.index import Index, MultiIndex, _ensure_index from pandas import compat from pandas.compat import (StringIO, lzip, range, map, zip, reduce, u, @@ -2260,6 +2260,67 @@ def _format_strings(self): return fmt_values +def format_percentiles(percentiles): + """ + Outputs rounded and formatted percentiles. + + Parameters + ---------- + percentiles : list-like, containing floats from interval [0,1] + + Returns + ------- + formatted : list of strings + + Notes + ----- + Rounding precision is chosen so that: (1) if any two elements of + ``percentiles`` differ, they remain different after rounding + (2) no entry is *rounded* to 0% or 100%. + Any non-integer is always rounded to at least 1 decimal place. + + Examples + -------- + Keeps all entries different after rounding: + + >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) + ['1.999%', '2.001%', '50%', '66.667%', '99.99%'] + + No element is rounded to 0% or 100% (unless already equal to it). 
+ Duplicates are allowed: + + >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) + ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] + """ + + percentiles = np.asarray(percentiles) + + # It checks for np.NaN as well + if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \ + or not np.all(percentiles <= 1): + raise ValueError("percentiles should all be in the interval [0,1]") + + percentiles = 100 * percentiles + int_idx = (percentiles.astype(int) == percentiles) + + if np.all(int_idx): + out = percentiles.astype(int).astype(str) + return [i + '%' for i in out] + + unique_pcts = np.unique(percentiles) + to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None + to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None + # Least precision that keeps percentiles unique after rounding + prec = -np.floor(np.log10(np.min( + np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end) + ))).astype(int) + prec = max(1, prec) + out = np.empty_like(percentiles, dtype=object) + out[int_idx] = percentiles[int_idx].astype(int).astype(str) + out[~int_idx] = percentiles[~int_idx].round(prec).astype(str) + return [i + '%' for i in out] + + def _is_dates_only(values): # return a boolean if we are only dates (and don't have a timezone) values = DatetimeIndex(values) diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 7a806280916f1..e67fe2cddde77 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -4264,6 +4264,21 @@ def test_nat_representations(self): self.assertEqual(f(pd.NaT), 'NaT') +def test_format_percentiles(): + result = fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) + expected = ['1.999%', '2.001%', '50%', '66.667%', '99.99%'] + tm.assert_equal(result, expected) + + result = fmt.format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) + expected = ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] + tm.assert_equal(result, expected) + + 
tm.assertRaises(ValueError, fmt.format_percentiles, [0.1, np.nan, 0.5]) + tm.assertRaises(ValueError, fmt.format_percentiles, [-0.001, 0.1, 0.5]) + tm.assertRaises(ValueError, fmt.format_percentiles, [2, 0.1, 0.5]) + tm.assertRaises(ValueError, fmt.format_percentiles, [0.1, 0.5, 'a']) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 83e1a17fc8b0c..2f4c2b414cc30 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -996,6 +996,59 @@ def test_describe_percentiles_insert_median(self): self.assertTrue('0%' in d1.index) self.assertTrue('100%' in d2.index) + def test_describe_percentiles_unique(self): + # GH13104 + df = tm.makeDataFrame() + with self.assertRaises(ValueError): + df.describe(percentiles=[0.1, 0.2, 0.4, 0.5, 0.2, 0.6]) + with self.assertRaises(ValueError): + df.describe(percentiles=[0.1, 0.2, 0.4, 0.2, 0.6]) + + def test_describe_percentiles_formatting(self): + # GH13104 + df = tm.makeDataFrame() + + # default + result = df.describe().index + expected = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', + 'max'], + dtype='object') + tm.assert_index_equal(result, expected) + + result = df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, + 0.9995, 0.9999]).index + expected = Index(['count', 'mean', 'std', 'min', '0.01%', '0.05%', + '0.1%', '50%', '99.9%', '99.95%', '99.99%', 'max'], + dtype='object') + tm.assert_index_equal(result, expected) + + result = df.describe(percentiles=[0.00499, 0.005, 0.25, 0.50, + 0.75]).index + expected = Index(['count', 'mean', 'std', 'min', '0.499%', '0.5%', + '25%', '50%', '75%', 'max'], + dtype='object') + tm.assert_index_equal(result, expected) + + result = df.describe(percentiles=[0.00499, 0.01001, 0.25, 0.50, + 0.75]).index + expected = Index(['count', 'mean', 'std', 'min', '0.5%', '1.0%', + '25%', '50%', '75%', 'max'], + dtype='object') + 
tm.assert_index_equal(result, expected) + + def test_describe_column_index_type(self): + # GH13288 + df = pd.DataFrame([1, 2, 3, 4]) + df.columns = pd.Index([0], dtype=object) + result = df.describe().columns + expected = Index([0], dtype=object) + tm.assert_index_equal(result, expected) + + df = pd.DataFrame({'A': list("BCDE"), 0: [1, 2, 3, 4]}) + result = df.describe().columns + expected = Index([0], dtype=object) + tm.assert_index_equal(result, expected) + def test_describe_no_numeric(self): df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8, 'B': ['a', 'b', 'c', 'd'] * 6}) @@ -1010,6 +1063,16 @@ def test_describe_no_numeric(self): desc = df.describe() self.assertEqual(desc.time['first'], min(ts.index)) + def test_describe_empty(self): + df = DataFrame() + tm.assertRaisesRegexp(ValueError, 'DataFrame without columns', + df.describe) + + df = DataFrame(columns=['A', 'B']) + result = df.describe() + expected = DataFrame(0, columns=['A', 'B'], index=['count', 'unique']) + tm.assert_frame_equal(result, expected) + def test_describe_empty_int_columns(self): df = DataFrame([[0, 1], [1, 2]]) desc = df[df[0] < 0].describe() # works