+pd.Series.cut, +pd.Series.qcut

ResidentMario · ResidentMario · commit 4a64c390b04b · 2017-02-26T13:28:45.000-05:00
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -82,7 +82,12 @@
         If True, performs operation inplace and returns None.""",
     unique='np.ndarray', duplicated='Series',
     optional_by='',
-    versionadded_to_excel='\n.. versionadded:: 0.20.0\n')
+    versionadded_to_excel='\n.. versionadded:: 0.20.0\n',
+    versionadded_cut='\n.. versionadded:: 0.20.0\n',
+    other_cut='cut',
+    versionadded_qcut='\n.. versionadded:: 0.20.0\n',
+    other_qcut='qcut')
+_shared_docs = dict()
 
 
 def _coerce_method(converter):
@@ -1525,6 +1530,151 @@ def searchsorted(self, value, side='left', sorter=None):
         return self._values.searchsorted(Series(value)._values,
                                          side=side, sorter=sorter)
 
+    # -------------------------------------------------------------------
+    # Partitions
+
+    # NOTE: this template is %-formatted before use, so a literal percent
+    # sign in the text must be written as "%%".
+    _shared_docs['cut'] = """
+    Return indices of half-open bins to which each value of `x` belongs.
+
+    %(versionadded_cut)s
+
+    Parameters
+    ----------
+    x : array-like
+        Input array to be binned. It has to be 1-dimensional.
+    bins : int or sequence of scalars
+        If `bins` is an int, it defines the number of equal-width bins in the
+        range of `x`. However, in this case, the range of `x` is extended
+        by .1%% on each side to include the min or max values of `x`. If
+        `bins` is a sequence it defines the bin edges allowing for
+        non-uniform bin width. No extension of the range of `x` is done in
+        this case.
+    right : bool, optional
+        Indicates whether the bins include the rightmost edge or not. If
+        right == True (the default), then the bins [1,2,3,4] indicate
+        (1,2], (2,3], (3,4].
+    labels : array or boolean, default None
+        Used as labels for the resulting bins. Must be of the same length as
+        the resulting bins. If False, return only integer indicators of the
+        bins.
+    retbins : bool, optional
+        Whether to return the bins or not. Can be useful if bins is given
+        as a scalar.
+    precision : int
+        The precision at which to store and display the bins labels
+    include_lowest : bool
+        Whether the first interval should be left-inclusive or not.
+
+    Returns
+    -------
+    out : Categorical or Series or array of integers if labels is False
+        The return type (Categorical or Series) depends on the input: a Series
+        of type category if input is a Series else Categorical. Bins are
+        represented as categories when categorical data is returned.
+    bins : ndarray of floats
+        Returned only if `retbins` is True.
+
+    Notes
+    -----
+    The `cut` function can be useful for going from a continuous variable to
+    a categorical variable. For example, `cut` could convert ages to groups
+    of age ranges.
+
+    Any NA values will be NA in the result.  Out of bounds values will be NA in
+    the resulting Categorical object
+
+    See also
+    --------
+    %(other_cut)s
+
+    Examples
+    --------
+    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
+    ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533],
+      (6.533, 9.7], (0.191, 3.367]]
+    Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]],
+    array([ 0.1905    ,  3.36666667,  6.53333333,  9.7       ]))
+    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3,
+               labels=["good","medium","bad"])
+    [good, good, good, medium, bad, good]
+    Categories (3, object): [good < medium < bad]
+    >>> pd.cut(np.ones(5), 4, labels=False)
+    array([1, 1, 1, 1, 1], dtype=int64)
+    """
+
+    @Appender(_shared_docs['cut'] % _shared_doc_kwargs)
+    def cut(self, bins, right=True, labels=None, retbins=False, precision=3,
+            include_lowest=False):
+        from pandas.tools.tile import cut
+        return cut(self, bins, right=right, labels=labels, retbins=retbins,
+                   precision=precision, include_lowest=include_lowest)
+
+    _shared_docs['qcut'] = """
+        Quantile-based discretization function. Discretize variable into
+        equal-sized buckets based on rank or based on sample quantiles. For example
+        1000 values for 10 quantiles would produce a Categorical object indicating
+        quantile membership for each data point.
+
+        %(versionadded_qcut)s
+
+        Parameters
+        ----------
+        x : ndarray or Series
+        q : integer or array of quantiles
+            Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
+            array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
+        labels : array or boolean, default None
+            Used as labels for the resulting bins. Must be of the same length as
+            the resulting bins. If False, return only integer indicators of the
+            bins.
+        retbins : bool, optional
+            Whether to return the bins or not. Can be useful if bins is given
+            as a scalar.
+        precision : int
+            The precision at which to store and display the bins labels
+        duplicates : {default 'raise', 'drop'}, optional
+            If bin edges are not unique, raise ValueError or drop non-uniques.
+
+            .. versionadded:: 0.20.0
+
+        Returns
+        -------
+        out : Categorical or Series or array of integers if labels is False
+            The return type (Categorical or Series) depends on the input: a Series
+            of type category if input is a Series else Categorical. Bins are
+            represented as categories when categorical data is returned.
+        bins : ndarray of floats
+            Returned only if `retbins` is True.
+
+        Notes
+        -----
+        Out of bounds values will be NA in the resulting Categorical object
+
+        See also
+        --------
+        %(other_qcut)s
+
+        Examples
+        --------
+        >>> pd.qcut(range(5), 4)
+        [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
+        Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
+        >>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
+        [good, good, medium, bad, bad]
+        Categories (3, object): [good < medium < bad]
+        >>> pd.qcut(range(5), 4, labels=False)
+        array([0, 0, 1, 2, 3], dtype=int64)
+        """
+
+    @Appender(_shared_docs['qcut'] % _shared_doc_kwargs)
+    def qcut(self, q, labels=None, retbins=False, precision=3,
+             duplicates='raise'):
+        from pandas.tools import tile  # deferred: tile imports this module
+        return tile.qcut(self, q, labels=labels, retbins=retbins,
+                         precision=precision, duplicates=duplicates)
+
     # -------------------------------------------------------------------
     # Combination
 
diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py
@@ -15,74 +15,20 @@
 from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype
 from pandas.lib import infer_dtype
 
+from pandas.core.series import _shared_docs
+from pandas.util.decorators import Appender
 import numpy as np
 
+# Values substituted into the shared 'cut'/'qcut' docstring templates for
+# the module-level functions; the version-added notes are left empty here.
+_shared_doc_kwargs = dict(
+    versionadded_cut='', other_cut='Series.cut',
+    versionadded_qcut='', other_qcut='Series.qcut')
+
 
+@Appender(_shared_docs['cut'] % _shared_doc_kwargs)
 def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         include_lowest=False):
-    """
-    Return indices of half-open bins to which each value of `x` belongs.
-
-    Parameters
-    ----------
-    x : array-like
-        Input array to be binned. It has to be 1-dimensional.
-    bins : int or sequence of scalars
-        If `bins` is an int, it defines the number of equal-width bins in the
-        range of `x`. However, in this case, the range of `x` is extended
-        by .1% on each side to include the min or max values of `x`. If
-        `bins` is a sequence it defines the bin edges allowing for
-        non-uniform bin width. No extension of the range of `x` is done in
-        this case.
-    right : bool, optional
-        Indicates whether the bins include the rightmost edge or not. If
-        right == True (the default), then the bins [1,2,3,4] indicate
-        (1,2], (2,3], (3,4].
-    labels : array or boolean, default None
-        Used as labels for the resulting bins. Must be of the same length as
-        the resulting bins. If False, return only integer indicators of the
-        bins.
-    retbins : bool, optional
-        Whether to return the bins or not. Can be useful if bins is given
-        as a scalar.
-    precision : int
-        The precision at which to store and display the bins labels
-    include_lowest : bool
-        Whether the first interval should be left-inclusive or not.
-
-    Returns
-    -------
-    out : Categorical or Series or array of integers if labels is False
-        The return type (Categorical or Series) depends on the input: a Series
-        of type category if input is a Series else Categorical. Bins are
-        represented as categories when categorical data is returned.
-    bins : ndarray of floats
-        Returned only if `retbins` is True.
-
-    Notes
-    -----
-    The `cut` function can be useful for going from a continuous variable to
-    a categorical variable. For example, `cut` could convert ages to groups
-    of age ranges.
-
-    Any NA values will be NA in the result.  Out of bounds values will be NA in
-    the resulting Categorical object
-
-
-    Examples
-    --------
-    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
-    ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533],
-      (6.533, 9.7], (0.191, 3.367]]
-    Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]],
-    array([ 0.1905    ,  3.36666667,  6.53333333,  9.7       ]))
-    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3,
-               labels=["good","medium","bad"])
-    [good, good, good, medium, bad, good]
-    Categories (3, object): [good < medium < bad]
-    >>> pd.cut(np.ones(5), 4, labels=False)
-    array([1, 1, 1, 1, 1], dtype=int64)
-    """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
 
     # for handling the cut for datetime and timedelta objects
@@ -129,57 +75,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
                                 series_index, name)
 
 
+@Appender(_shared_docs['qcut'] % _shared_doc_kwargs)
 def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
-    """
-    Quantile-based discretization function. Discretize variable into
-    equal-sized buckets based on rank or based on sample quantiles. For example
-    1000 values for 10 quantiles would produce a Categorical object indicating
-    quantile membership for each data point.
-
-    Parameters
-    ----------
-    x : ndarray or Series
-    q : integer or array of quantiles
-        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
-        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
-    labels : array or boolean, default None
-        Used as labels for the resulting bins. Must be of the same length as
-        the resulting bins. If False, return only integer indicators of the
-        bins.
-    retbins : bool, optional
-        Whether to return the bins or not. Can be useful if bins is given
-        as a scalar.
-    precision : int
-        The precision at which to store and display the bins labels
-    duplicates : {default 'raise', 'drop'}, optional
-        If bin edges are not unique, raise ValueError or drop non-uniques.
-
-        .. versionadded:: 0.20.0
-
-    Returns
-    -------
-    out : Categorical or Series or array of integers if labels is False
-        The return type (Categorical or Series) depends on the input: a Series
-        of type category if input is a Series else Categorical. Bins are
-        represented as categories when categorical data is returned.
-    bins : ndarray of floats
-        Returned only if `retbins` is True.
-
-    Notes
-    -----
-    Out of bounds values will be NA in the resulting Categorical object
-
-    Examples
-    --------
-    >>> pd.qcut(range(5), 4)
-    [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
-    Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
-    >>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
-    [good, good, medium, bad, bad]
-    Categories (3, object): [good < medium < bad]
-    >>> pd.qcut(range(5), 4, labels=False)
-    array([0, 0, 1, 2, 3], dtype=int64)
-    """
     x_is_series, series_index, name, x = _preprocess_for_cut(x)
 
     x, dtype = _coerce_to_type(x)