Skip to content

Commit 4a64c39

Browse files
committed
+pd.Series.cut, +pd.Series.qcut
1 parent 20771d1 commit 4a64c39

File tree

2 files changed

+184
-114
lines changed

2 files changed

+184
-114
lines changed

pandas/core/series.py

+174-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,12 @@
8282
If True, performs operation inplace and returns None.""",
8383
unique='np.ndarray', duplicated='Series',
8484
optional_by='',
85-
versionadded_to_excel='\n.. versionadded:: 0.20.0\n')
85+
versionadded_to_excel='\n.. versionadded:: 0.20.0\n',
86+
versionadded_cut='\n.. versionadded:: 0.20.0\n',
87+
other_cut='cut',
88+
versionadded_qcut='\n.. versionadded:: 0.20.0\n',
89+
other_qcut='qcut')
90+
_shared_docs = dict()
8691

8792

8893
def _coerce_method(converter):
@@ -1525,6 +1530,174 @@ def searchsorted(self, value, side='left', sorter=None):
15251530
return self._values.searchsorted(Series(value)._values,
15261531
side=side, sorter=sorter)
15271532

1533+
# -------------------------------------------------------------------
1534+
# Partitions
1535+
1536+
_shared_docs['cut'] = """
1537+
Convert categorical variable into dummy/indicator variables
1538+
1539+
%(versionadded_cut)s
1540+
1541+
Parameters
1542+
----------
1543+
data : array-like, Series, or DataFrame
1544+
prefix : string, list of strings, or dict of strings, default None
1545+
String to append DataFrame column names
1546+
Pass a list with length equal to the number of columns
1547+
when calling get_dummies on a DataFrame. Alternatively, `prefix`
1548+
can be a dictionary mapping column names to prefixes.
1549+
prefix_sep : string, default '_'
1550+
If appending prefix, separator/delimiter to use. Or pass a
1551+
list or dictionary as with `prefix`.
1552+
dummy_na : bool, default False
1553+
Add a column to indicate NaNs, if False NaNs are ignored.
1554+
columns : list-like, default None
1555+
Column names in the DataFrame to be encoded.
1556+
If `columns` is None then all the columns with
1557+
`object` or `category` dtype will be converted.
1558+
sparse : bool, default False
1559+
Whether the dummy columns should be sparse or not. Returns
1560+
SparseDataFrame if `data` is a Series or if all columns are included.
1561+
Otherwise returns a DataFrame with some SparseBlocks.
1562+
1563+
.. versionadded:: 0.16.1
1564+
drop_first : bool, default False
1565+
Whether to get k-1 dummies out of k categorical levels by removing the
1566+
first level.
1567+
1568+
.. versionadded:: 0.18.0
1569+
Returns
1570+
-------
1571+
dummies : DataFrame or SparseDataFrame
1572+
1573+
Examples
1574+
--------
1575+
>>> import pandas as pd
1576+
>>> s = pd.Series(list('abca'))
1577+
1578+
>>> pd.get_dummies(s)
1579+
a b c
1580+
0 1 0 0
1581+
1 0 1 0
1582+
2 0 0 1
1583+
3 1 0 0
1584+
1585+
>>> s1 = ['a', 'b', np.nan]
1586+
1587+
>>> pd.get_dummies(s1)
1588+
a b
1589+
0 1 0
1590+
1 0 1
1591+
2 0 0
1592+
1593+
>>> pd.get_dummies(s1, dummy_na=True)
1594+
a b NaN
1595+
0 1 0 0
1596+
1 0 1 0
1597+
2 0 0 1
1598+
1599+
>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
1600+
'C': [1, 2, 3]})
1601+
1602+
>>> pd.get_dummies(df, prefix=['col1', 'col2'])
1603+
C col1_a col1_b col2_a col2_b col2_c
1604+
0 1 1 0 0 1 0
1605+
1 2 0 1 1 0 0
1606+
2 3 1 0 0 0 1
1607+
1608+
>>> pd.get_dummies(pd.Series(list('abcaa')))
1609+
a b c
1610+
0 1 0 0
1611+
1 0 1 0
1612+
2 0 0 1
1613+
3 1 0 0
1614+
4 1 0 0
1615+
1616+
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
1617+
b c
1618+
0 0 0
1619+
1 1 0
1620+
2 0 1
1621+
3 0 0
1622+
4 0 0
1623+
1624+
See also
1625+
--------
1626+
%(other_cut)s
1627+
Series.str.get_dummies
1628+
"""
1629+
1630+
@Appender(_shared_docs['cut'] % _shared_doc_kwargs)
1631+
def cut(self, bins, right=True, labels=None, retbins=False, precision=3,
1632+
include_lowest=False):
1633+
from pandas.tools.tile import cut
1634+
return cut(self, bins, right=right, labels=labels, retbins=retbins,
1635+
precision=precision, include_lowest=include_lowest)
1636+
1637+
_shared_docs['qcut'] = """
1638+
Quantile-based discretization function. Discretize variable into
1639+
equal-sized buckets based on rank or based on sample quantiles. For example
1640+
1000 values for 10 quantiles would produce a Categorical object indicating
1641+
quantile membership for each data point.
1642+
1643+
%(versionadded_qcut)s
1644+
1645+
Parameters
1646+
----------
1647+
x : ndarray or Series
1648+
q : integer or array of quantiles
1649+
Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
1650+
array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
1651+
labels : array or boolean, default None
1652+
Used as labels for the resulting bins. Must be of the same length as
1653+
the resulting bins. If False, return only integer indicators of the
1654+
bins.
1655+
retbins : bool, optional
1656+
Whether to return the bins or not. Can be useful if bins is given
1657+
as a scalar.
1658+
precision : int
1659+
The precision at which to store and display the bins labels
1660+
duplicates : {default 'raise', 'drop'}, optional
1661+
If bin edges are not unique, raise ValueError or drop non-uniques.
1662+
1663+
.. versionadded:: 0.20.0
1664+
1665+
Returns
1666+
-------
1667+
out : Categorical or Series or array of integers if labels is False
1668+
The return type (Categorical or Series) depends on the input: a Series
1669+
of type category if input is a Series else Categorical. Bins are
1670+
represented as categories when categorical data is returned.
1671+
bins : ndarray of floats
1672+
Returned only if `retbins` is True.
1673+
1674+
Notes
1675+
-----
1676+
Out of bounds values will be NA in the resulting Categorical object
1677+
1678+
See also
1679+
--------
1680+
%(other_qcut)s
1681+
1682+
Examples
1683+
--------
1684+
>>> pd.qcut(range(5), 4)
1685+
[[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
1686+
Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
1687+
>>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
1688+
[good, good, medium, bad, bad]
1689+
Categories (3, object): [good < medium < bad]
1690+
>>> pd.qcut(range(5), 4, labels=False)
1691+
array([0, 0, 1, 2, 3], dtype=int64)
1692+
"""
1693+
1694+
@Appender(_shared_docs['qcut'] % _shared_doc_kwargs)
1695+
def qcut(self, q, labels=None, retbins=False, precision=3,
1696+
duplicates='raise'):
1697+
from pandas.tools.tile import qcut
1698+
return qcut(self, q, labels=labels, retbins=retbins,
1699+
precision=precision, duplicates=duplicates)
1700+
15281701
# -------------------------------------------------------------------
15291702
# Combination
15301703

pandas/tools/tile.py

+10-113
Original file line numberDiff line numberDiff line change
@@ -15,74 +15,20 @@
1515
from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype
1616
from pandas.lib import infer_dtype
1717

18+
from pandas.core.series import _shared_docs
19+
from pandas.util.decorators import Appender
20+
_shared_doc_kwargs = dict(
21+
versionadded_cut='',
22+
other_cut='Series.cut',
23+
versionadded_qcut='',
24+
other_qcut='Series.qcut')
25+
1826
import numpy as np
1927

2028

29+
@Appender(_shared_docs['cut'] % _shared_doc_kwargs)
2130
def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
2231
include_lowest=False):
23-
"""
24-
Return indices of half-open bins to which each value of `x` belongs.
25-
26-
Parameters
27-
----------
28-
x : array-like
29-
Input array to be binned. It has to be 1-dimensional.
30-
bins : int or sequence of scalars
31-
If `bins` is an int, it defines the number of equal-width bins in the
32-
range of `x`. However, in this case, the range of `x` is extended
33-
by .1% on each side to include the min or max values of `x`. If
34-
`bins` is a sequence it defines the bin edges allowing for
35-
non-uniform bin width. No extension of the range of `x` is done in
36-
this case.
37-
right : bool, optional
38-
Indicates whether the bins include the rightmost edge or not. If
39-
right == True (the default), then the bins [1,2,3,4] indicate
40-
(1,2], (2,3], (3,4].
41-
labels : array or boolean, default None
42-
Used as labels for the resulting bins. Must be of the same length as
43-
the resulting bins. If False, return only integer indicators of the
44-
bins.
45-
retbins : bool, optional
46-
Whether to return the bins or not. Can be useful if bins is given
47-
as a scalar.
48-
precision : int
49-
The precision at which to store and display the bins labels
50-
include_lowest : bool
51-
Whether the first interval should be left-inclusive or not.
52-
53-
Returns
54-
-------
55-
out : Categorical or Series or array of integers if labels is False
56-
The return type (Categorical or Series) depends on the input: a Series
57-
of type category if input is a Series else Categorical. Bins are
58-
represented as categories when categorical data is returned.
59-
bins : ndarray of floats
60-
Returned only if `retbins` is True.
61-
62-
Notes
63-
-----
64-
The `cut` function can be useful for going from a continuous variable to
65-
a categorical variable. For example, `cut` could convert ages to groups
66-
of age ranges.
67-
68-
Any NA values will be NA in the result. Out of bounds values will be NA in
69-
the resulting Categorical object
70-
71-
72-
Examples
73-
--------
74-
>>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
75-
([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533],
76-
(6.533, 9.7], (0.191, 3.367]]
77-
Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]],
78-
array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ]))
79-
>>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3,
80-
labels=["good","medium","bad"])
81-
[good, good, good, medium, bad, good]
82-
Categories (3, object): [good < medium < bad]
83-
>>> pd.cut(np.ones(5), 4, labels=False)
84-
array([1, 1, 1, 1, 1], dtype=int64)
85-
"""
8632
# NOTE: this binning code is changed a bit from histogram for var(x) == 0
8733

8834
# for handling the cut for datetime and timedelta objects
@@ -129,57 +75,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
12975
series_index, name)
13076

13177

78+
@Appender(_shared_docs['qcut'] % _shared_doc_kwargs)
13279
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
133-
"""
134-
Quantile-based discretization function. Discretize variable into
135-
equal-sized buckets based on rank or based on sample quantiles. For example
136-
1000 values for 10 quantiles would produce a Categorical object indicating
137-
quantile membership for each data point.
138-
139-
Parameters
140-
----------
141-
x : ndarray or Series
142-
q : integer or array of quantiles
143-
Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
144-
array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
145-
labels : array or boolean, default None
146-
Used as labels for the resulting bins. Must be of the same length as
147-
the resulting bins. If False, return only integer indicators of the
148-
bins.
149-
retbins : bool, optional
150-
Whether to return the bins or not. Can be useful if bins is given
151-
as a scalar.
152-
precision : int
153-
The precision at which to store and display the bins labels
154-
duplicates : {default 'raise', 'drop'}, optional
155-
If bin edges are not unique, raise ValueError or drop non-uniques.
156-
157-
.. versionadded:: 0.20.0
158-
159-
Returns
160-
-------
161-
out : Categorical or Series or array of integers if labels is False
162-
The return type (Categorical or Series) depends on the input: a Series
163-
of type category if input is a Series else Categorical. Bins are
164-
represented as categories when categorical data is returned.
165-
bins : ndarray of floats
166-
Returned only if `retbins` is True.
167-
168-
Notes
169-
-----
170-
Out of bounds values will be NA in the resulting Categorical object
171-
172-
Examples
173-
--------
174-
>>> pd.qcut(range(5), 4)
175-
[[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
176-
Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
177-
>>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
178-
[good, good, medium, bad, bad]
179-
Categories (3, object): [good < medium < bad]
180-
>>> pd.qcut(range(5), 4, labels=False)
181-
array([0, 0, 1, 2, 3], dtype=int64)
182-
"""
18380
x_is_series, series_index, name, x = _preprocess_for_cut(x)
18481

18582
x, dtype = _coerce_to_type(x)

0 commit comments

Comments
 (0)