Skip to content

CLN: consolidate Series.quantile and DataFrame.quantile #12469

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 10 additions & 97 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@
import pandas.algos as _algos

from pandas.core.config import get_option
from pandas import _np_version_under1p9

from textwrap import dedent

# ---------------------------------------------------------------------
# Docstring templates
Expand Down Expand Up @@ -4919,108 +4920,20 @@ def f(s):

return data.apply(f, axis=axis)

def quantile(self, q=0.5, axis=0, numeric_only=True,
interpolation='linear'):
"""
Return values at the given quantile over requested axis, a la
numpy.percentile.

Parameters
----------
q : float or array-like, default 0.5 (50% quantile)
0 <= q <= 1, the quantile(s) to compute
axis : {0, 1, 'index', 'columns'} (default 0)
0 or 'index' for row-wise, 1 or 'columns' for column-wise
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
.. versionadded:: 0.18.0
This optional parameter specifies the interpolation method to use,
when the desired quantile lies between two data points `i` and `j`:

* linear: `i + (j - i) * fraction`, where `fraction` is the
fractional part of the index surrounded by `i` and `j`.
* lower: `i`.
* higher: `j`.
* nearest: `i` or `j` whichever is nearest.
* midpoint: (`i` + `j`) / 2.

Returns
-------
@Substitution(dedent("""
quantiles : Series or DataFrame
If ``q`` is an array, a DataFrame will be returned where the
index is ``q``, the columns are the columns of self, and the
values are the quantiles.
If ``q`` is a float, a Series will be returned where the
index is the columns of self and the values are the quantiles.

Examples
--------

>>> df = DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
columns=['a', 'b'])
>>> df.quantile(.1)
a 1.3
b 3.7
dtype: float64
>>> df.quantile([.1, .5])
a b
0.1 1.3 3.7
0.5 2.5 55.0
"""
self._check_percentile(q)
per = np.asarray(q) * 100

if not com.is_list_like(per):
per = [per]
q = [q]
squeeze = True
else:
squeeze = False

if _np_version_under1p9:
if interpolation != 'linear':
raise ValueError("Interpolation methods other than linear "
"are not supported in numpy < 1.9")

def f(arr, per, interpolation):
if arr._is_datelike_mixed_type:
values = _values_from_object(arr).view('i8')
else:
values = arr.astype(float)
values = values[notnull(values)]
if len(values) == 0:
return NA
else:
if _np_version_under1p9:
return _quantile(values, per)
else:
return _quantile(values, per, interpolation=interpolation)

data = self._get_numeric_data() if numeric_only else self

axis = self._get_axis_number(axis)

if axis == 1:
data = data.T

# need to know which cols are timestamp going in so that we can
# map timestamp over them after getting the quantile.
is_dt_col = data.dtypes.map(com.is_datetime64_dtype)
is_dt_col = is_dt_col[is_dt_col].index

quantiles = [[f(vals, x, interpolation) for x in per]
for (_, vals) in data.iteritems()]

result = self._constructor(quantiles, index=data._info_axis,
columns=q).T
if len(is_dt_col) > 0:
result[is_dt_col] = result[is_dt_col].applymap(lib.Timestamp)
if squeeze:
if result.shape == (1, 1):
result = result.T.iloc[:, 0] # don't want scalar
else:
result = result.T.squeeze()
result.name = None # For groupby, so it can set an index name
return result
"""))
@Appender(_shared_docs['quantile'])
def quantile(self, q=0.5, axis=0, numeric_only=True,
interpolation='linear'):
return super(DataFrame,
self).quantile(q=q, axis=axis, numeric_only=numeric_only,
interpolation=interpolation)

def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
"""
Expand Down
159 changes: 122 additions & 37 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
SettingWithCopyError, SettingWithCopyWarning,
AbstractMethodError)
import pandas.core.nanops as nanops
from numpy import percentile as _quantile
from pandas.util.decorators import Appender, Substitution, deprecate_kwarg
from pandas.core import config
from pandas import _np_version_under1p9

# goal is to be able to define the docs close to function, while still being
# able to share
Expand Down Expand Up @@ -842,43 +844,7 @@ def __contains__(self, key):

@property
def empty(self):
"""True if NDFrame is entirely empty [no items], meaning any of the
axes are of length 0.

Notes
-----
If NDFrame contains only NaNs, it is still not considered empty. See
the example below.

Examples
--------
An example of an actual empty DataFrame. Notice the index is empty:

>>> df_empty = pd.DataFrame({'A' : []})
>>> df_empty
Empty DataFrame
Columns: [A]
Index: []
>>> df_empty.empty
True

If we only have NaNs in our DataFrame, it is not considered empty! We
will need to drop the NaNs to make the DataFrame empty:

>>> df = pd.DataFrame({'A' : [np.nan]})
>>> df
A
0 NaN
>>> df.empty
False
>>> df.dropna().empty
True

See also
--------
pandas.Series.dropna
pandas.DataFrame.dropna
"""
"""True if NDFrame is entirely empty [no items]"""
return not all(len(self._get_axis(a)) > 0 for a in self._AXIS_ORDERS)

def __nonzero__(self):
Expand Down Expand Up @@ -4110,6 +4076,125 @@ def ranker(data):

return ranker(data)

# Shared docstring template for ``quantile``. The lone ``%s`` under "Returns"
# is filled in through %-formatting by whoever attaches this text to a method,
# so any literal percent sign added to the template must be written as ``%%``
# (which is why the default is described as "50 percentile" rather than with
# a percent sign).
# NOTE(review): the ``numeric_only`` entry documents a default of None and a
# fall-back to numeric data; confirm both against every method that reuses
# this template.
_shared_docs['quantile'] = ("""
Return values at the given quantile over requested axis, a la
numpy.percentile.

Parameters
----------
q : float or array-like, default 0.5 (50 percentile)
0 <= q <= 1, the quantile(s) to compute
axis : {0, 1, 'index', 'columns'} (default 0)
0 or 'index' for row-wise, 1 or 'columns' for column-wise
numeric_only : boolean, default None
Include only float, int, boolean data. If None, will attempt to use
everything, then use only numeric data
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
.. versionadded:: 0.18.0
This optional parameter specifies the interpolation method to use,
when the desired quantile lies between two data points `i` and `j`:

* linear: `i + (j - i) * fraction`, where `fraction` is the
fractional part of the index surrounded by `i` and `j`.
* lower: `i`.
* higher: `j`.
* nearest: `i` or `j` whichever is nearest.
* midpoint: (`i` + `j`) / 2.

Returns
-------
%s

Examples
--------

>>> s = Series([1, 2, 3, 4])
>>> s.quantile(.5)
2.5
>>> s.quantile([.25, .5, .75])
0.25 1.75
0.50 2.50
0.75 3.25
dtype: float64
>>> df = DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
columns=['a', 'b'])
>>> df.quantile(.1)
a 1.3
b 3.7
dtype: float64
>>> df.quantile([.1, .5])
a b
0.1 1.3 3.7
0.5 2.5 55.0
""")

@Appender(_shared_docs['quantile'] % '')
def quantile(self, q=0.5, axis=0, numeric_only=None,
             interpolation='linear'):
    # No docstring is written here on purpose: the user-facing text is
    # attached by the ``Appender`` decorator from the shared template.

    if self.ndim >= 3:
        msg = "quantile is not implemented on Panel or PanelND objects."
        raise NotImplementedError(msg)
    elif self.ndim == 1:
        # Route the 1-D case through the 2-D code path below by wrapping
        # the object in a frame, then unwrap the single column again.
        result = self.to_frame().quantile(q=q, axis=axis,
                                          numeric_only=numeric_only,
                                          interpolation=interpolation)
        if not com.is_list_like(q):
            return result.iloc[0]
        else:
            return result[result.columns[0]]

    self._check_percentile(q)
    # ``_quantile`` is numpy's percentile, which works on a 0-100 scale,
    # while ``q`` is given on a 0-1 scale.
    per = np.asarray(q) * 100

    # A scalar ``q`` is wrapped in a list so the loops below are uniform;
    # ``squeeze`` records that the result must be collapsed afterwards.
    if not com.is_list_like(per):
        per = [per]
        q = [q]
        squeeze = True
    else:
        squeeze = False

    if _np_version_under1p9 and interpolation != 'linear':
        raise ValueError("Interpolation methods other than linear "
                         "are not supported in numpy < 1.9")

    def f(arr, per, interpolation):
        # Compute one quantile of one column, ignoring nulls.
        #
        # NOTE(review): the boxer is chosen by ``needs_i8_conversion`` while
        # the i8 view is chosen by ``_is_datelike_mixed_type``. A reviewer
        # suggested combining the two checks into one: either the data
        # needs i8 conversion (and therefore a boxer), or it does not (and
        # is upcast to float).
        boxer = (com.i8_boxer(arr) if com.needs_i8_conversion(arr)
                 else lambda x: x)
        if arr._is_datelike_mixed_type:
            values = _values_from_object(arr).view('i8')
        else:
            values = arr.astype(float)
        values = values[notnull(values)]
        if len(values) == 0:
            return boxer(np.nan)
        if _np_version_under1p9:
            # The ``interpolation`` keyword is not passed on old numpy.
            return boxer(_quantile(values, per))
        return boxer(_quantile(values, per, interpolation=interpolation))

    data = self._get_numeric_data() if numeric_only else self

    axis = self._get_axis_number(axis)

    # Work column-wise throughout; for axis 1 transpose first.
    if axis == 1:
        data = data.T

    # One inner list per column, one entry per requested quantile.
    quantiles = [[f(vals, x, interpolation) for x in per]
                 for (_, vals) in data.iteritems()]

    result = self._constructor(quantiles, index=data._info_axis,
                               columns=q).T
    if squeeze:
        if result.shape == (1, 1):
            result = result.T.iloc[:, 0]  # don't want scalar
        else:
            result = result.T.squeeze()
        result.name = None  # For groupby, so it can set an index name
    return result

_shared_docs['align'] = ("""
Align two object on their axes with the
specified join method for each axis Index
Expand Down
Loading