|
15 | 15 | from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype
|
16 | 16 | from pandas.lib import infer_dtype
|
17 | 17 |
|
| 18 | +from pandas.core.series import _shared_docs |
| 19 | +from pandas.util.decorators import Appender |
| 20 | +_shared_doc_kwargs = dict( |
| 21 | + versionadded_cut='', |
| 22 | + other_cut='Series.cut', |
| 23 | + versionadded_qcut='', |
| 24 | + other_qcut='Series.qcut') |
| 25 | + |
18 | 26 | import numpy as np
|
19 | 27 |
|
20 | 28 |
|
| 29 | +@Appender(_shared_docs['cut'] % _shared_doc_kwargs) |
21 | 30 | def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
|
22 | 31 | include_lowest=False):
|
23 |
| - """ |
24 |
| - Return indices of half-open bins to which each value of `x` belongs. |
25 |
| -
|
26 |
| - Parameters |
27 |
| - ---------- |
28 |
| - x : array-like |
29 |
| - Input array to be binned. It has to be 1-dimensional. |
30 |
| - bins : int or sequence of scalars |
31 |
| - If `bins` is an int, it defines the number of equal-width bins in the |
32 |
| - range of `x`. However, in this case, the range of `x` is extended |
33 |
| - by .1% on each side to include the min or max values of `x`. If |
34 |
| - `bins` is a sequence it defines the bin edges allowing for |
35 |
| - non-uniform bin width. No extension of the range of `x` is done in |
36 |
| - this case. |
37 |
| - right : bool, optional |
38 |
| - Indicates whether the bins include the rightmost edge or not. If |
39 |
| - right == True (the default), then the bins [1,2,3,4] indicate |
40 |
| - (1,2], (2,3], (3,4]. |
41 |
| - labels : array or boolean, default None |
42 |
| - Used as labels for the resulting bins. Must be of the same length as |
43 |
| - the resulting bins. If False, return only integer indicators of the |
44 |
| - bins. |
45 |
| - retbins : bool, optional |
46 |
| - Whether to return the bins or not. Can be useful if bins is given |
47 |
| - as a scalar. |
48 |
| - precision : int |
49 |
| - The precision at which to store and display the bin labels. |
50 |
| - include_lowest : bool |
51 |
| - Whether the first interval should be left-inclusive or not. |
52 |
| -
|
53 |
| - Returns |
54 |
| - ------- |
55 |
| - out : Categorical or Series or array of integers if labels is False |
56 |
| - The return type (Categorical or Series) depends on the input: a Series |
57 |
| - of type category if input is a Series else Categorical. Bins are |
58 |
| - represented as categories when categorical data is returned. |
59 |
| - bins : ndarray of floats |
60 |
| - Returned only if `retbins` is True. |
61 |
| -
|
62 |
| - Notes |
63 |
| - ----- |
64 |
| - The `cut` function can be useful for going from a continuous variable to |
65 |
| - a categorical variable. For example, `cut` could convert ages to groups |
66 |
| - of age ranges. |
67 |
| -
|
68 |
| - Any NA values will be NA in the result. Out-of-bounds values will be NA in |
69 |
| - the resulting Categorical object. |
70 |
| -
|
71 |
| -
|
72 |
| - Examples |
73 |
| - -------- |
74 |
| - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) |
75 |
| - ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], |
76 |
| - (6.533, 9.7], (0.191, 3.367]] |
77 |
| - Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]], |
78 |
| - array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) |
79 |
| - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, |
80 |
| - labels=["good","medium","bad"]) |
81 |
| - [good, good, good, medium, bad, good] |
82 |
| - Categories (3, object): [good < medium < bad] |
83 |
| - >>> pd.cut(np.ones(5), 4, labels=False) |
84 |
| - array([1, 1, 1, 1, 1], dtype=int64) |
85 |
| - """ |
86 | 32 | # NOTE: this binning code is changed a bit from histogram for var(x) == 0
|
87 | 33 |
|
88 | 34 | # for handling the cut for datetime and timedelta objects
|
@@ -129,57 +75,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
|
129 | 75 | series_index, name)
|
130 | 76 |
|
131 | 77 |
|
| 78 | +@Appender(_shared_docs['qcut'] % _shared_doc_kwargs) |
132 | 79 | def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
|
133 |
| - """ |
134 |
| - Quantile-based discretization function. Discretize variable into |
135 |
| - equal-sized buckets based on rank or based on sample quantiles. For example |
136 |
| - 1000 values for 10 quantiles would produce a Categorical object indicating |
137 |
| - quantile membership for each data point. |
138 |
| -
|
139 |
| - Parameters |
140 |
| - ---------- |
141 |
| - x : ndarray or Series |
142 |
| - q : integer or array of quantiles |
143 |
| - Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately |
144 |
| - array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles |
145 |
| - labels : array or boolean, default None |
146 |
| - Used as labels for the resulting bins. Must be of the same length as |
147 |
| - the resulting bins. If False, return only integer indicators of the |
148 |
| - bins. |
149 |
| - retbins : bool, optional |
150 |
| - Whether to return the bins or not. Can be useful if `q` is given |
150 |
| - as an integer. |
152 |
| - precision : int |
153 |
| - The precision at which to store and display the bin labels. |
154 |
| - duplicates : {'raise', 'drop'}, default 'raise' |
155 |
| - If bin edges are not unique, raise ValueError or drop non-uniques. |
156 |
| -
|
157 |
| - .. versionadded:: 0.20.0 |
158 |
| -
|
159 |
| - Returns |
160 |
| - ------- |
161 |
| - out : Categorical or Series or array of integers if labels is False |
162 |
| - The return type (Categorical or Series) depends on the input: a Series |
163 |
| - of type category if input is a Series else Categorical. Bins are |
164 |
| - represented as categories when categorical data is returned. |
165 |
| - bins : ndarray of floats |
166 |
| - Returned only if `retbins` is True. |
167 |
| -
|
168 |
| - Notes |
169 |
| - ----- |
170 |
| - Out-of-bounds values will be NA in the resulting Categorical object. |
171 |
| -
|
172 |
| - Examples |
173 |
| - -------- |
174 |
| - >>> pd.qcut(range(5), 4) |
175 |
| - [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]] |
176 |
| - Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]] |
177 |
| - >>> pd.qcut(range(5), 3, labels=["good","medium","bad"]) |
178 |
| - [good, good, medium, bad, bad] |
179 |
| - Categories (3, object): [good < medium < bad] |
180 |
| - >>> pd.qcut(range(5), 4, labels=False) |
181 |
| - array([0, 0, 1, 2, 3], dtype=int64) |
182 |
| - """ |
183 | 80 | x_is_series, series_index, name, x = _preprocess_for_cut(x)
|
184 | 81 |
|
185 | 82 | x, dtype = _coerce_to_type(x)
|
|
0 commit comments