Merge pull request #8413 from JanSchulz/CategoricalFixups3

jreback · jreback · commit 0f8367957e8e · 2014-09-29T17:36:42.000-04:00
Categorical doc fixups
diff --git a/doc/source/10min.rst b/doc/source/10min.rst
@@ -640,27 +640,44 @@ Categoricals
 ------------
 
 Since version 0.15, pandas can include categorical data in a ``DataFrame``. For full docs, see the
-:ref:`Categorical introduction <categorical>` and the :ref:`API documentation <api.categorical>` .
+:ref:`categorical introduction <categorical>` and the :ref:`API documentation <api.categorical>`.
 
 .. ipython:: python
 
     df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
 
-    # convert the raw grades to a categorical
-    df["grade"] = pd.Categorical(df["raw_grade"])
+Convert the raw grades to a categorical data type.
 
-    # Alternative: df["grade"] = df["raw_grade"].astype("category")
+.. ipython:: python
+
+    df["grade"] = df["raw_grade"].astype("category")
     df["grade"]
 
-    # Rename the categories inplace
+Rename the categories to more meaningful names (assigning to ``Series.cat.categories`` is inplace!).
+
+.. ipython:: python
+
     df["grade"].cat.categories = ["very good", "good", "very bad"]
 
-    # Reorder the categories and simultaneously add the missing categories
+Reorder the categories and simultaneously add the missing categories (methods under
+``Series.cat`` return a new ``Series`` by default).
+
+.. ipython:: python
+
     df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
     df["grade"]
+
+Sorting follows the order of the categories, not lexical order.
+
+.. ipython:: python
+
     df.sort("grade")
-    df.groupby("grade").size()
 
+Grouping by a categorical column also shows empty categories.
+
+.. ipython:: python
+
+    df.groupby("grade").size()
 
 
 Plotting
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -611,6 +611,8 @@ available ("missing value") or `np.nan` is a valid category.
     pd.isnull(s)
     s.fillna("a")
 
+.. _categorical.rfactor:
+
 Differences to R's `factor`
 ---------------------------
 
diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst
@@ -6,7 +6,7 @@
 
    import pandas as pd
    import numpy as np
-   options.display.max_rows=15
+   pd.options.display.max_rows=15
 
 Comparison with R / R libraries
 *******************************
@@ -51,7 +51,7 @@ Selecting multiple columns by name in ``pandas`` is straightforward
 
 .. ipython:: python
 
-   df = DataFrame(np.random.randn(10, 3), columns=list('abc'))
+   df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc'))
    df[['a', 'c']]
    df.loc[:, ['a', 'c']]
 
@@ -63,7 +63,7 @@ with a combination of the ``iloc`` indexer attribute and ``numpy.r_``.
    named = list('abcdefg')
    n = 30
    columns = named + np.arange(len(named), n).tolist()
-   df = DataFrame(np.random.randn(n, n), columns=columns)
+   df = pd.DataFrame(np.random.randn(n, n), columns=columns)
 
    df.iloc[:, np.r_[:10, 24:30]]
 
@@ -88,8 +88,7 @@ function.
 
 .. ipython:: python
 
-   from pandas import DataFrame
-   df = DataFrame({
+   df = pd.DataFrame({
      'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9],
      'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99],
      'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
@@ -166,7 +165,7 @@ In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this:
    import random
    import string
 
-   baseball = DataFrame({
+   baseball = pd.DataFrame({
       'team': ["team %d" % (x+1) for x in range(5)]*5,
       'player': random.sample(list(string.ascii_lowercase),25),
       'batting avg': np.random.uniform(.200, .400, 25)
@@ -197,7 +196,7 @@ index/slice as well as standard boolean indexing:
 
 .. ipython:: python
 
-   df = DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)})
+   df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)})
    df.query('a <= b')
    df[df.a <= df.b]
    df.loc[df.a <= df.b]
@@ -225,7 +224,7 @@ In ``pandas`` the equivalent expression, using the
 
 .. ipython:: python
 
-   df = DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)})
+   df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)})
    df.eval('a + b')
    df.a + df.b  # same as the previous expression
 
@@ -283,7 +282,7 @@ In ``pandas`` the equivalent expression, using the
 
 .. ipython:: python
 
-   df = DataFrame({
+   df = pd.DataFrame({
        'x': np.random.uniform(1., 168., 120),
        'y': np.random.uniform(7., 334., 120),
        'z': np.random.uniform(1.7, 20.7, 120),
@@ -317,7 +316,7 @@ In Python, since ``a`` is a list, you can simply use list comprehension.
 .. ipython:: python
 
    a = np.array(list(range(1,24))+[np.NAN]).reshape(2,3,4)
-   DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)])
+   pd.DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)])
 
 |meltlist|_
 ~~~~~~~~~~~~
@@ -336,7 +335,7 @@ In Python, this list would be a list of tuples, so
 .. ipython:: python
 
    a = list(enumerate(list(range(1,5))+[np.NAN]))
-   DataFrame(a)
+   pd.DataFrame(a)
 
 For more details and examples see :ref:`the Intro to Data Structures
 documentation <basics.dataframe.from_items>`.
@@ -361,7 +360,7 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent:
 
 .. ipython:: python
 
-   cheese = DataFrame({'first' : ['John', 'Mary'],
+   cheese = pd.DataFrame({'first' : ['John', 'Mary'],
                        'last' : ['Doe', 'Bo'],
                        'height' : [5.5, 6.0],
                        'weight' : [130, 150]})
@@ -394,7 +393,7 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`:
 
 .. ipython:: python
 
-   df = DataFrame({
+   df = pd.DataFrame({
         'x': np.random.uniform(1., 168., 12),
         'y': np.random.uniform(7., 334., 12),
         'z': np.random.uniform(1.7, 20.7, 12),
@@ -426,7 +425,7 @@ using :meth:`~pandas.pivot_table`:
 
 .. ipython:: python
 
-   df = DataFrame({
+   df = pd.DataFrame({
        'Animal': ['Animal1', 'Animal2', 'Animal3', 'Animal2', 'Animal1',
                   'Animal2', 'Animal3'],
        'FeedType': ['A', 'B', 'A', 'A', 'B', 'B', 'A'],
@@ -444,6 +443,30 @@ The second approach is to use the :meth:`~pandas.DataFrame.groupby` method:
 For more details and examples see :ref:`the reshaping documentation
 <reshaping.pivot>` or :ref:`the groupby documentation<groupby.split>`.
 
+|factor|_
+~~~~~~~~~
+
+.. versionadded:: 0.15
+
+pandas has a data type for categorical data.
+
+.. code-block:: r
+
+   cut(c(1,2,3,4,5,6), 3)
+   factor(c(1,2,3,2,2,3))
+
+In pandas this is accomplished with ``pd.cut`` and ``astype("category")``:
+
+.. ipython:: python
+
+   pd.cut(pd.Series([1,2,3,4,5,6]), 3)
+   pd.Series([1,2,3,2,2,3]).astype("category")
+
+For more details and examples see the :ref:`categorical introduction <categorical>` and the
+:ref:`API documentation <api.categorical>`. There is also documentation regarding the
+:ref:`differences to R's factor <categorical.rfactor>`.
+
+
 .. |c| replace:: ``c``
 .. _c: http://stat.ethz.ch/R-manual/R-patched/library/base/html/c.html
 
@@ -477,3 +500,5 @@ For more details and examples see :ref:`the reshaping documentation
 .. |cast| replace:: ``cast``
 .. _cast: http://www.inside-r.org/packages/cran/reshape2/docs/cast
 
+.. |factor| replace:: ``factor``
+.. _factor: https://stat.ethz.ch/R-manual/R-devel/library/base/html/factor.html
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -540,21 +540,18 @@ Categoricals in Series/DataFrame
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
-methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
+methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
 :issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`,
 :issue:`8075`, :issue:`8076`, :issue:`8143`).
 
-For full docs, see the :ref:`Categorical introduction <categorical>` and the
+For full docs, see the :ref:`categorical introduction <categorical>` and the
 :ref:`API documentation <api.categorical>`.
 
 .. ipython:: python
 
     df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
 
-    # convert the raw grades to a categorical
-    df["grade"] = pd.Categorical(df["raw_grade"])
-
-    # Alternative: df["grade"] = df["raw_grade"].astype("category")
+    df["grade"] = df["raw_grade"].astype("category")
     df["grade"]
 
     # Rename the categories
diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py
@@ -34,7 +34,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         right == True (the default), then the bins [1,2,3,4] indicate
         (1,2], (2,3], (3,4].
     labels : array or boolean, default None
-        Labels to use for bins, or False to return integer bin labels.
+        Used as labels for the resulting bins. Must be of the same length as the resulting
+        bins. If False, return only integer indicators of the bins.
     retbins : bool, optional
         Whether to return the bins or not. Can be useful if bins is given
         as a scalar.
@@ -47,7 +48,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     -------
     out : Categorical or Series or array of integers if labels is False
         The return type (Categorical or Series) depends on the input: a Series of type category if
-        input is a Series else Categorical.
+        input is a Series else Categorical. Bins are represented as categories when categorical
+        data is returned.
     bins : ndarray of floats
         Returned only if `retbins` is True.
 
@@ -63,12 +65,15 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
 
     Examples
     --------
-    >>> cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
-    (array([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533],
-           (6.533, 9.7], (0.191, 3.367]], dtype=object),
-     array([ 0.1905    ,  3.36666667,  6.53333333,  9.7       ]))
-    >>> cut(np.ones(5), 4, labels=False)
-    array([2, 2, 2, 2, 2])
+    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
+    ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], (6.533, 9.7], (0.191, 3.367]]
+    Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]],
+    array([ 0.1905    ,  3.36666667,  6.53333333,  9.7       ]))
+    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, labels=["good","medium","bad"])
+    [good, good, good, medium, bad, good]
+    Categories (3, object): [good < medium < bad]
+    >>> pd.cut(np.ones(5), 4, labels=False)
+    array([1, 1, 1, 1, 1], dtype=int64)
     """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
     if not np.iterable(bins):
@@ -126,7 +131,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
         Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
         array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
     labels : array or boolean, default None
-        Labels to use for bin edges, or False to return integer bin labels
+        Used as labels for the resulting bins. Must be of the same length as the resulting
+        bins. If False, return only integer indicators of the bins.
     retbins : bool, optional
         Whether to return the bins or not. Can be useful if bins is given
         as a scalar.
@@ -135,15 +141,27 @@ def qcut(x, q, labels=None, retbins=False, precision=3):
 
     Returns
     -------
-    cat : Categorical or Series
-        Returns a Series of type category if input is a Series else Categorical.
+    out : Categorical or Series or array of integers if labels is False
+        The return type (Categorical or Series) depends on the input: a Series of type category if
+        input is a Series else Categorical. Bins are represented as categories when categorical
+        data is returned.
+    bins : ndarray of floats
+        Returned only if `retbins` is True.
 
     Notes
     -----
     Out of bounds values will be NA in the resulting Categorical object.
 
     Examples
     --------
+    >>> pd.qcut(range(5), 4)
+    [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
+    Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
+    >>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
+    [good, good, medium, bad, bad]
+    Categories (3, object): [good < medium < bad]
+    >>> pd.qcut(range(5), 4, labels=False)
+    array([0, 0, 1, 2, 3], dtype=int64)
     """
     if com.is_integer(q):
         quantiles = np.linspace(0, 1, q + 1)