BUG: Fix describe(): percentiles (#13104), col index (#13288)

pijucha · pijucha · commit 9a6bd6ee6c9d · 2016-05-30T21:15:21.000-04:00
BUG #13104: - Percentile identifiers are now rounded to the least precision that keeps them unique. - Supplying duplicates in percentiles will raise ValueError. BUG #13288: - Fixed the column index of the output data frame. Previously, if a data frame had a column index of object type and the index contained numeric values, the output column index could be corrupted, which led to ValueError when the output was displayed. - describe() will raise ValueError with an informative message on a DataFrame without columns.
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -228,6 +228,75 @@ resulting dtype will be upcast (unchanged from previous).
    pd.merge(df1, df2, how='outer', on='key')
    pd.merge(df1, df2, how='outer', on='key').dtypes
 
+.. _whatsnew_0182.api.describe:
+
+``.describe()`` changes
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Percentile identifiers in the index of a ``.describe()`` output will now be rounded to the least precision that keeps them distinct (:issue:`13104`)
+
+.. ipython:: python
+
+   s = pd.Series([0, 1, 2, 3, 4])
+   df = pd.DataFrame([0, 1, 2, 3, 4])
+
+Previous Behavior:
+
+The percentile identifiers were rounded to at most one decimal place, which could produce duplicated labels and raise ``ValueError`` for a data frame.
+
+.. code-block:: ipython
+
+   In [3]: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])
+   Out[3]:
+   count     5.000000
+   mean      2.000000
+   std       1.581139
+   min       0.000000
+   0.0%      0.000400
+   0.1%      0.002000
+   0.1%      0.004000
+   50%       2.000000
+   99.9%     3.996000
+   100.0%    3.998000
+   100.0%    3.999600
+   max       4.000000
+   dtype: float64
+
+   In [4]: df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])
+   Out[4]:
+   ...
+   ValueError: cannot reindex from a duplicate axis
+
+New Behavior:
+
+.. ipython:: python
+
+   s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])
+   df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999])
+
+In addition to this, both ``Series.describe()`` and ``DataFrame.describe()`` will now raise ``ValueError`` if the passed ``percentiles`` contain duplicates.
+
+Another bug is fixed that could raise ``ValueError`` when a column index of a data frame contained entries of different types (:issue:`13288`)
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': list("BCDE"), 0: [1, 2, 3, 4]})
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [8]: df.describe()
+   Out[8]:
+   ...
+   ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'
+
+New Behavior:
+
+.. ipython:: python
+
+   df.describe()
+
 .. _whatsnew_0182.api.other:
 
 Other API changes
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -20,6 +20,7 @@
 import pandas.core.missing as missing
 import pandas.core.datetools as datetools
 from pandas.formats.printing import pprint_thing
+from pandas.formats.format import format_percentiles
 from pandas import compat
 from pandas.compat.numpy import function as nv
 from pandas.compat import (map, zip, lrange, string_types,
@@ -4868,32 +4869,33 @@ def abs(self):
     @Appender(_shared_docs['describe'] % _shared_doc_kwargs)
     def describe(self, percentiles=None, include=None, exclude=None):
         if self.ndim >= 3:
-            msg = "describe is not implemented on on Panel or PanelND objects."
+            msg = "describe is not implemented on Panel or PanelND objects."
             raise NotImplementedError(msg)
+        elif self.ndim == 2 and self.columns.size == 0:
+            raise ValueError("Cannot describe a DataFrame without columns")
 
         if percentiles is not None:
             # get them all to be in [0, 1]
             self._check_percentile(percentiles)
+
+            # median should always be included
+            if 0.5 not in percentiles:
+                percentiles.append(0.5)
             percentiles = np.asarray(percentiles)
         else:
             percentiles = np.array([0.25, 0.5, 0.75])
 
-        # median should always be included
-        if (percentiles != 0.5).all():  # median isn't included
-            lh = percentiles[percentiles < .5]
-            uh = percentiles[percentiles > .5]
-            percentiles = np.hstack([lh, 0.5, uh])
+        # sort and check for duplicates
+        unique_pcts = np.unique(percentiles)
+        if len(unique_pcts) < len(percentiles):
+            raise ValueError("percentiles cannot contain duplicates")
+        percentiles = unique_pcts
 
-        def pretty_name(x):
-            x *= 100
-            if x == int(x):
-                return '%.0f%%' % x
-            else:
-                return '%.1f%%' % x
+        formatted_percentiles = format_percentiles(percentiles)
 
-        def describe_numeric_1d(series, percentiles):
+        def describe_numeric_1d(series):
             stat_index = (['count', 'mean', 'std', 'min'] +
-                          [pretty_name(x) for x in percentiles] + ['max'])
+                          formatted_percentiles + ['max'])
             d = ([series.count(), series.mean(), series.std(), series.min()] +
                  [series.quantile(x) for x in percentiles] + [series.max()])
             return pd.Series(d, index=stat_index, name=series.name)
@@ -4918,18 +4920,18 @@ def describe_categorical_1d(data):
 
             return pd.Series(result, index=names, name=data.name)
 
-        def describe_1d(data, percentiles):
+        def describe_1d(data):
             if com.is_bool_dtype(data):
                 return describe_categorical_1d(data)
             elif com.is_numeric_dtype(data):
-                return describe_numeric_1d(data, percentiles)
+                return describe_numeric_1d(data)
             elif com.is_timedelta64_dtype(data):
-                return describe_numeric_1d(data, percentiles)
+                return describe_numeric_1d(data)
             else:
                 return describe_categorical_1d(data)
 
         if self.ndim == 1:
-            return describe_1d(self, percentiles)
+            return describe_1d(self)
         elif (include is None) and (exclude is None):
             if len(self._get_numeric_data()._info_axis) > 0:
                 # when some numerics are found, keep only numerics
@@ -4944,7 +4946,7 @@ def describe_1d(data, percentiles):
         else:
             data = self.select_dtypes(include=include, exclude=exclude)
 
-        ldesc = [describe_1d(s, percentiles) for _, s in data.iteritems()]
+        ldesc = [describe_1d(s) for _, s in data.iteritems()]
         # set a convenient order for rows
         names = []
         ldesc_indexes = sorted([x.index for x in ldesc], key=len)
@@ -4954,8 +4956,7 @@ def describe_1d(data, percentiles):
                     names.append(name)
 
         d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
-        d.columns = self.columns._shallow_copy(values=d.columns.values)
-        d.columns.names = data.columns.names
+        d.columns = data.columns.copy()
         return d
 
     def _check_percentile(self, q):
diff --git a/pandas/formats/format.py b/pandas/formats/format.py
@@ -6,7 +6,7 @@
 import sys
 
 from pandas.core.base import PandasObject
-from pandas.core.common import isnull, notnull
+from pandas.core.common import isnull, notnull, is_numeric_dtype
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas import compat
 from pandas.compat import (StringIO, lzip, range, map, zip, reduce, u,
@@ -2260,6 +2260,67 @@ def _format_strings(self):
         return fmt_values
 
 
+def format_percentiles(percentiles):
+    """
+    Outputs rounded and formatted percentiles.
+
+    Parameters
+    ----------
+    percentiles : list-like, containing floats from interval [0,1]
+
+    Returns
+    -------
+    formatted : list of strings
+
+    Notes
+    -----
+    Rounding precision is chosen so that: (1) if any two elements of
+    ``percentiles`` differ, they remain different after rounding
+    (2) no entry is *rounded* to 0% or 100%.
+    Any non-integer is always rounded to at least 1 decimal place.
+
+    Examples
+    --------
+    Keeps all entries different after rounding:
+
+    >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
+    ['1.999%', '2.001%', '50%', '66.667%', '99.99%']
+
+    No element is rounded to 0% or 100% (unless already equal to it).
+    Duplicates are allowed:
+
+    >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
+    ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
+    """
+
+    percentiles = np.asarray(percentiles)
+
+    # It checks for np.NaN as well
+    if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \
+            or not np.all(percentiles <= 1):
+        raise ValueError("percentiles should all be in the interval [0,1]")
+
+    percentiles = 100 * percentiles
+    int_idx = (percentiles.astype(int) == percentiles)
+
+    if np.all(int_idx):
+        out = percentiles.astype(int).astype(str)
+        return [i + '%' for i in out]
+
+    unique_pcts = np.unique(percentiles)
+    to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None
+    to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None
+    # Least precision that keeps percentiles unique after rounding
+    prec = -np.floor(np.log10(np.min(
+        np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end)
+    ))).astype(int)
+    prec = max(1, prec)
+    out = np.empty_like(percentiles, dtype=object)
+    out[int_idx] = percentiles[int_idx].astype(int).astype(str)
+    out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)
+    return [i + '%' for i in out]
+
+
 def _is_dates_only(values):
     # return a boolean if we are only dates (and don't have a timezone)
     values = DatetimeIndex(values)
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
@@ -4264,6 +4264,21 @@ def test_nat_representations(self):
             self.assertEqual(f(pd.NaT), 'NaT')
 
 
+def test_format_percentiles():
+    result = fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
+    expected = ['1.999%', '2.001%', '50%', '66.667%', '99.99%']
+    tm.assert_equal(result, expected)
+
+    result = fmt.format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
+    expected = ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
+    tm.assert_equal(result, expected)
+
+    tm.assertRaises(ValueError, fmt.format_percentiles, [0.1, np.nan, 0.5])
+    tm.assertRaises(ValueError, fmt.format_percentiles, [-0.001, 0.1, 0.5])
+    tm.assertRaises(ValueError, fmt.format_percentiles, [2, 0.1, 0.5])
+    tm.assertRaises(ValueError, fmt.format_percentiles, [0.1, 0.5, 'a'])
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -996,6 +996,59 @@ def test_describe_percentiles_insert_median(self):
         self.assertTrue('0%' in d1.index)
         self.assertTrue('100%' in d2.index)
 
+    def test_describe_percentiles_unique(self):
+        # GH13104
+        df = tm.makeDataFrame()
+        with self.assertRaises(ValueError):
+            df.describe(percentiles=[0.1, 0.2, 0.4, 0.5, 0.2, 0.6])
+        with self.assertRaises(ValueError):
+            df.describe(percentiles=[0.1, 0.2, 0.4, 0.2, 0.6])
+
+    def test_describe_percentiles_formatting(self):
+        # GH13104
+        df = tm.makeDataFrame()
+
+        # default
+        result = df.describe().index
+        expected = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%',
+                          'max'],
+                         dtype='object')
+        tm.assert_index_equal(result, expected)
+
+        result = df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999,
+                                          0.9995, 0.9999]).index
+        expected = Index(['count', 'mean', 'std', 'min', '0.01%', '0.05%',
+                          '0.1%', '50%', '99.9%', '99.95%', '99.99%', 'max'],
+                         dtype='object')
+        tm.assert_index_equal(result, expected)
+
+        result = df.describe(percentiles=[0.00499, 0.005, 0.25, 0.50,
+                                          0.75]).index
+        expected = Index(['count', 'mean', 'std', 'min', '0.499%', '0.5%',
+                          '25%', '50%', '75%', 'max'],
+                         dtype='object')
+        tm.assert_index_equal(result, expected)
+
+        result = df.describe(percentiles=[0.00499, 0.01001, 0.25, 0.50,
+                                          0.75]).index
+        expected = Index(['count', 'mean', 'std', 'min', '0.5%', '1.0%',
+                          '25%', '50%', '75%', 'max'],
+                         dtype='object')
+        tm.assert_index_equal(result, expected)
+
+    def test_describe_column_index_type(self):
+        # GH13288
+        df = pd.DataFrame([1, 2, 3, 4])
+        df.columns = pd.Index([0], dtype=object)
+        result = df.describe().columns
+        expected = Index([0], dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        df = pd.DataFrame({'A': list("BCDE"), 0: [1, 2, 3, 4]})
+        result = df.describe().columns
+        expected = Index([0], dtype=object)
+        tm.assert_index_equal(result, expected)
+
     def test_describe_no_numeric(self):
         df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
                         'B': ['a', 'b', 'c', 'd'] * 6})
@@ -1010,6 +1063,16 @@ def test_describe_no_numeric(self):
         desc = df.describe()
         self.assertEqual(desc.time['first'], min(ts.index))
 
+    def test_describe_empty(self):
+        df = DataFrame()
+        tm.assertRaisesRegexp(ValueError, 'DataFrame without columns',
+                              df.describe)
+
+        df = DataFrame(columns=['A', 'B'])
+        result = df.describe()
+        expected = DataFrame(0, columns=['A', 'B'], index=['count', 'unique'])
+        tm.assert_frame_equal(result, expected)
+
     def test_describe_empty_int_columns(self):
         df = DataFrame([[0, 1], [1, 2]])
         desc = df[df[0] < 0].describe()  # works