-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: Fix describe(): percentiles (#13104), col index (#13288) #13298
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
import pandas.core.missing as missing | ||
import pandas.core.datetools as datetools | ||
from pandas.formats.printing import pprint_thing | ||
from pandas.formats.format import format_percentiles | ||
from pandas import compat | ||
from pandas.compat.numpy import function as nv | ||
from pandas.compat import (map, zip, lrange, string_types, | ||
|
@@ -4868,32 +4869,33 @@ def abs(self): | |
@Appender(_shared_docs['describe'] % _shared_doc_kwargs) | ||
def describe(self, percentiles=None, include=None, exclude=None): | ||
if self.ndim >= 3: | ||
msg = "describe is not implemented on on Panel or PanelND objects." | ||
msg = "describe is not implemented on Panel or PanelND objects." | ||
raise NotImplementedError(msg) | ||
elif self.ndim == 2 and self.columns.size == 0: | ||
raise ValueError("Cannot describe a DataFrame without columns") | ||
|
||
if percentiles is not None: | ||
# get them all to be in [0, 1] | ||
self._check_percentile(percentiles) | ||
|
||
# median should always be included | ||
if 0.5 not in percentiles: | ||
percentiles.append(0.5) | ||
percentiles = np.asarray(percentiles) | ||
else: | ||
percentiles = np.array([0.25, 0.5, 0.75]) | ||
|
||
# median should always be included | ||
if (percentiles != 0.5).all(): # median isn't included | ||
lh = percentiles[percentiles < .5] | ||
uh = percentiles[percentiles > .5] | ||
percentiles = np.hstack([lh, 0.5, uh]) | ||
# sort and check for duplicates | ||
unique_pcts = np.unique(percentiles) | ||
if len(unique_pcts) < len(percentiles): | ||
raise ValueError("percentiles cannot contain duplicates") | ||
percentiles = unique_pcts | ||
[Review comment] I thought this might be slightly better than the suggested:
    if len(set(percentiles)) < len(percentiles):
        raise...
    percentile = np.unique(percentiles)  # can't use pd.unique here - we need percentiles sorted
but I can change it if it's less idiomatic.
[Reply] OK, that's fine; yeah, that's a feature of `np.unique` (name inferred - the inline code was dropped when the page was extracted). |
||
|
||
def pretty_name(x): | ||
x *= 100 | ||
if x == int(x): | ||
return '%.0f%%' % x | ||
else: | ||
return '%.1f%%' % x | ||
formatted_percentiles = format_percentiles(percentiles) | ||
|
||
def describe_numeric_1d(series, percentiles): | ||
def describe_numeric_1d(series): | ||
stat_index = (['count', 'mean', 'std', 'min'] + | ||
[pretty_name(x) for x in percentiles] + ['max']) | ||
formatted_percentiles + ['max']) | ||
d = ([series.count(), series.mean(), series.std(), series.min()] + | ||
[series.quantile(x) for x in percentiles] + [series.max()]) | ||
return pd.Series(d, index=stat_index, name=series.name) | ||
|
@@ -4918,18 +4920,18 @@ def describe_categorical_1d(data): | |
|
||
return pd.Series(result, index=names, name=data.name) | ||
|
||
def describe_1d(data, percentiles): | ||
def describe_1d(data): | ||
if com.is_bool_dtype(data): | ||
return describe_categorical_1d(data) | ||
elif com.is_numeric_dtype(data): | ||
return describe_numeric_1d(data, percentiles) | ||
return describe_numeric_1d(data) | ||
elif com.is_timedelta64_dtype(data): | ||
return describe_numeric_1d(data, percentiles) | ||
return describe_numeric_1d(data) | ||
else: | ||
return describe_categorical_1d(data) | ||
|
||
if self.ndim == 1: | ||
return describe_1d(self, percentiles) | ||
return describe_1d(self) | ||
elif (include is None) and (exclude is None): | ||
if len(self._get_numeric_data()._info_axis) > 0: | ||
# when some numerics are found, keep only numerics | ||
|
@@ -4944,7 +4946,7 @@ def describe_1d(data, percentiles): | |
else: | ||
data = self.select_dtypes(include=include, exclude=exclude) | ||
|
||
ldesc = [describe_1d(s, percentiles) for _, s in data.iteritems()] | ||
ldesc = [describe_1d(s) for _, s in data.iteritems()] | ||
# set a convenient order for rows | ||
names = [] | ||
ldesc_indexes = sorted([x.index for x in ldesc], key=len) | ||
|
@@ -4954,8 +4956,7 @@ def describe_1d(data, percentiles): | |
names.append(name) | ||
|
||
d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1) | ||
d.columns = self.columns._shallow_copy(values=d.columns.values) | ||
d.columns.names = data.columns.names | ||
d.columns = data.columns.copy() | ||
return d | ||
|
||
def _check_percentile(self, q): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,7 @@ | |
import sys | ||
|
||
from pandas.core.base import PandasObject | ||
from pandas.core.common import isnull, notnull | ||
from pandas.core.common import isnull, notnull, is_numeric_dtype | ||
from pandas.core.index import Index, MultiIndex, _ensure_index | ||
from pandas import compat | ||
from pandas.compat import (StringIO, lzip, range, map, zip, reduce, u, | ||
|
@@ -2260,6 +2260,67 @@ def _format_strings(self): | |
return fmt_values | ||
|
||
|
||
def format_percentiles(percentiles):
    """
    Outputs rounded and formatted percentiles.

    Parameters
    ----------
    percentiles : list-like, containing floats from interval [0,1]
        The input does not need to be sorted or unique; entries are
        formatted in the order in which they are given.

    Returns
    -------
    formatted : list of strings

    Raises
    ------
    ValueError
        If ``percentiles`` is not numeric, or if any entry (NaN included)
        lies outside the interval [0,1].

    Notes
    -----
    Rounding precision is chosen so that: (1) if any two elements of
    ``percentiles`` differ, they remain different after rounding
    (2) no entry is *rounded* to 0% or 100%.
    Any non-integer is always rounded to at least 1 decimal place.

    An entry is treated as a whole percentage when ``100 * p`` lies within
    1e-10 of an integer. This absorbs binary floating-point noise (for
    example ``100 * 0.07 == 7.000000000000001``) so that ``0.07`` is
    rendered as ``'7%'`` rather than ``'7.0%'``.

    Examples
    --------
    Keeps all entries different after rounding:

    >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
    ['1.999%', '2.001%', '50%', '66.667%', '99.99%']

    No element is rounded to 0% or 100% (unless already equal to it).
    Duplicates are allowed:

    >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
    ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']

    Whole percentages survive floating-point noise:

    >>> format_percentiles([0.07, 0.29, 0.125])
    ['7%', '29%', '12.5%']
    """

    percentiles = np.asarray(percentiles)

    # Comparisons with NaN are False, so NaN is rejected here as well
    if (not is_numeric_dtype(percentiles) or
            not np.all(percentiles >= 0) or
            not np.all(percentiles <= 1)):
        raise ValueError("percentiles should all be in the interval [0,1]")

    percentiles = 100 * percentiles

    # Compare against the *nearest* integer with a tight absolute tolerance.
    # Truncating with astype(int) would turn 28.999999999999996 into 28 and
    # an exact == would reject 7.000000000000001. The tolerance is kept far
    # below any meaningful percentile spacing so that values such as
    # 99.9995 are never mistaken for 100.
    nearest = np.rint(percentiles).astype(int)
    int_idx = np.isclose(nearest, percentiles, rtol=0, atol=1e-10)

    if np.all(int_idx):
        return [i + '%' for i in nearest.astype(str)]

    # Snap the near-integers so the noise does not leak into the
    # precision computation below.
    percentiles = np.where(int_idx, nearest, percentiles)

    unique_pcts = np.unique(percentiles)
    # Also measure the distance to 0 and 100 (unless one of them is already
    # present) so that no entry gets rounded onto either bound.
    to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None
    to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None
    # Least precision that keeps percentiles unique after rounding
    smallest_gap = np.min(
        np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end))
    prec = max(1, int(-np.floor(np.log10(smallest_gap))))

    out = np.empty_like(percentiles, dtype=object)
    out[int_idx] = nearest[int_idx].astype(str)
    out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)
    return [i + '%' for i in out]
|
||
|
||
def _is_dates_only(values): | ||
# return a boolean if we are only dates (and don't have a timezone) | ||
values = DatetimeIndex(values) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4264,6 +4264,21 @@ def test_nat_representations(self): | |
self.assertEqual(f(pd.NaT), 'NaT') | ||
|
||
|
||
def test_format_percentiles():
    """Check the output of ``fmt.format_percentiles`` and its validation."""
    # (input percentiles, expected labels); the second case contains a
    # duplicate and an exact 0, both of which must be accepted.
    formatting_cases = [
        ([0.01999, 0.02001, 0.5, 0.666666, 0.9999],
         ['1.999%', '2.001%', '50%', '66.667%', '99.99%']),
        ([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999],
         ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']),
    ]
    for pcts, expected in formatting_cases:
        tm.assert_equal(fmt.format_percentiles(pcts), expected)

    # Error conditions: NaN, below 0, above 1 and a non-numeric entry.
    invalid_inputs = [
        [0.1, np.nan, 0.5],
        [-0.001, 0.1, 0.5],
        [2, 0.1, 0.5],
        [0.1, 0.5, 'a'],
    ]
    for bad in invalid_inputs:
        tm.assertRaises(ValueError, fmt.format_percentiles, bad)
|
||
|
||
if __name__ == '__main__': | ||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], | ||
exit=False) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -996,6 +996,59 @@ def test_describe_percentiles_insert_median(self): | |
self.assertTrue('0%' in d1.index) | ||
self.assertTrue('100%' in d2.index) | ||
|
||
def test_describe_percentiles_unique(self):
    """describe() must reject duplicated percentiles (GH13104)."""
    df = tm.makeDataFrame()
    duplicated = (
        [0.1, 0.2, 0.4, 0.5, 0.2, 0.6],  # 0.2 repeated, 0.5 listed
        [0.1, 0.2, 0.4, 0.2, 0.6],       # 0.2 repeated, 0.5 not listed
    )
    for pcts in duplicated:
        with self.assertRaises(ValueError):
            df.describe(percentiles=pcts)
|
||
def test_describe_percentiles_formatting(self): | ||
# GH13104 | ||
[Review comment] I think we have a test with an empty frame, can you confirm?
[Reply] No, there's only one for an empty series, which is different.
[Reply] Show me the example you are doing.
[Reply]
pd.DataFrame().describe()
Traceback (most recent call last):
File "/usr/share/python3.5/site-packages/IPython/core/interactiveshell.py", line 3066, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-8-e81e6dc1b4a2>", line 1, in <module>
pd.DataFrame().describe()
File "/home/users/piotr/workspace/pandas-pijucha/pandas/core/generic.py", line 4975, in describe
d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
File "/home/users/piotr/workspace/pandas-pijucha/pandas/tools/merge.py", line 845, in concat
copy=copy)
File "/home/users/piotr/workspace/pandas-pijucha/pandas/tools/merge.py", line 878, in __init__
raise ValueError('No objects to concatenate')
ValueError: No objects to concatenate
pd.Series().describe()
Out[9]:
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
dtype: float64 This one looks nicer: pd.Series(dtype=object).describe()
Out[10]:
count 0
unique 0
dtype: int64
An empty series has a float dtype by default - as it's always considered to be 1d, I think. An empty data frame has no default number of columns and no dtypes at all, so I'm not sure what kind of output would be appropriate for it [the end of this sentence was lost in extraction].
[Reply] For now, I don't have a better idea than to raise an exception if a data frame has no columns:
def describe(self, percentiles=None, include=None, exclude=None):
if self.ndim >= 3:
msg = "describe is not implemented on Panel or PanelND objects."
raise NotImplementedError(msg)
+ elif self.ndim == 2 and self.columns.size == 0:
+ raise ValueError("Cannot describe a DataFrame without columns")
An empty data frame with columns can still be treated as an empty series. And, in fact, there is a test for an empty data frame with columns. |
||
df = tm.makeDataFrame() | ||
|
||
# default | ||
result = df.describe().index | ||
expected = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', | ||
'max'], | ||
dtype='object') | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, | ||
0.9995, 0.9999]).index | ||
expected = Index(['count', 'mean', 'std', 'min', '0.01%', '0.05%', | ||
'0.1%', '50%', '99.9%', '99.95%', '99.99%', 'max'], | ||
dtype='object') | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = df.describe(percentiles=[0.00499, 0.005, 0.25, 0.50, | ||
0.75]).index | ||
expected = Index(['count', 'mean', 'std', 'min', '0.499%', '0.5%', | ||
'25%', '50%', '75%', 'max'], | ||
dtype='object') | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = df.describe(percentiles=[0.00499, 0.01001, 0.25, 0.50, | ||
0.75]).index | ||
expected = Index(['count', 'mean', 'std', 'min', '0.5%', '1.0%', | ||
'25%', '50%', '75%', 'max'], | ||
dtype='object') | ||
tm.assert_index_equal(result, expected) | ||
|
||
def test_describe_column_index_type(self):
    """describe() keeps the dtype of the column Index (GH13288)."""
    want = Index([0], dtype=object)

    # Single numeric column whose label is held in an object-dtype Index.
    numeric_only = pd.DataFrame([1, 2, 3, 4])
    numeric_only.columns = pd.Index([0], dtype=object)
    tm.assert_index_equal(numeric_only.describe().columns, want)

    # Mixed frame: the result is expected to contain only column 0.
    mixed = pd.DataFrame({'A': list("BCDE"), 0: [1, 2, 3, 4]})
    tm.assert_index_equal(mixed.describe().columns, want)
|
||
def test_describe_no_numeric(self): | ||
df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8, | ||
'B': ['a', 'b', 'c', 'd'] * 6}) | ||
|
@@ -1010,6 +1063,16 @@ def test_describe_no_numeric(self): | |
desc = df.describe() | ||
self.assertEqual(desc.time['first'], min(ts.index)) | ||
|
||
def test_describe_empty(self):
    """describe() on frames without columns and without rows."""
    # No columns at all: describe() raises with a dedicated message.
    no_columns = DataFrame()
    tm.assertRaisesRegexp(ValueError, 'DataFrame without columns',
                          no_columns.describe)

    # Columns but no rows: 'count' and 'unique' rows filled with 0.
    labels = ['A', 'B']
    described = DataFrame(columns=labels).describe()
    tm.assert_frame_equal(
        described,
        DataFrame(0, columns=labels, index=['count', 'unique']))
|
||
def test_describe_empty_int_columns(self): | ||
df = DataFrame([[0, 1], [1, 2]]) | ||
desc = df[df[0] < 0].describe() # works | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I chose this solution for a data frame without columns. Is it ok?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hmm, does this break anything if you raise on a completely empty frame?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, it's a non-breaking change.
`pd.concat()` inside `describe()` raises anyway (I put it in one of my earlier comments). The purpose of this one is just to give a more meaningful message (and skip some code).