diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 6320be3920730..10921c2a32ed5 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -640,27 +640,44 @@ Categoricals ------------ Since version 0.15, pandas can include categorical data in a ``DataFrame``. For full docs, see the -:ref:`Categorical introduction ` and the :ref:`API documentation ` . +:ref:`categorical introduction ` and the :ref:`API documentation `. .. ipython:: python df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) - # convert the raw grades to a categorical - df["grade"] = pd.Categorical(df["raw_grade"]) +Convert the raw grades to a categorical data type. - # Alternative: df["grade"] = df["raw_grade"].astype("category") +.. ipython:: python + + df["grade"] = df["raw_grade"].astype("category") df["grade"] - # Rename the categories inplace +Rename the categories to more meaningful names (assigning to ``Series.cat.categories`` is inplace!). + +.. ipython:: python + df["grade"].cat.categories = ["very good", "good", "very bad"] - # Reorder the categories and simultaneously add the missing categories +Reorder the categories and simultaneously add the missing categories (methods under +``Series.cat`` return a new ``Series`` by default). + +.. ipython:: python + df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) df["grade"] + +Sorting follows the order of the categories, not lexical order. + +.. ipython:: python + df.sort("grade") - df.groupby("grade").size() +Grouping by a categorical column also shows empty categories. + +.. ipython:: python + + df.groupby("grade").size() Plotting diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index a4e97a6e8d17c..669a39d437a34 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -611,6 +611,8 @@ available ("missing value") or `np.nan` is a valid category. pd.isnull(s) s.fillna("a") +.. 
_categorical.rfactor: + Differences to R's `factor` --------------------------- diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index 84bba77e0dfa3..89c46d21c56a2 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -6,7 +6,7 @@ import pandas as pd import numpy as np - options.display.max_rows=15 + pd.options.display.max_rows=15 Comparison with R / R libraries ******************************* @@ -51,7 +51,7 @@ Selecting multiple columns by name in ``pandas`` is straightforward .. ipython:: python - df = DataFrame(np.random.randn(10, 3), columns=list('abc')) + df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc')) df[['a', 'c']] df.loc[:, ['a', 'c']] @@ -63,7 +63,7 @@ with a combination of the ``iloc`` indexer attribute and ``numpy.r_``. named = list('abcdefg') n = 30 columns = named + np.arange(len(named), n).tolist() - df = DataFrame(np.random.randn(n, n), columns=columns) + df = pd.DataFrame(np.random.randn(n, n), columns=columns) df.iloc[:, np.r_[:10, 24:30]] @@ -88,8 +88,7 @@ function. .. ipython:: python - from pandas import DataFrame - df = DataFrame({ + df = pd.DataFrame({ 'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9], 'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99], 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], @@ -166,7 +165,7 @@ In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: import random import string - baseball = DataFrame({ + baseball = pd.DataFrame({ 'team': ["team %d" % (x+1) for x in range(5)]*5, 'player': random.sample(list(string.ascii_lowercase),25), 'batting avg': np.random.uniform(.200, .400, 25) @@ -197,7 +196,7 @@ index/slice as well as standard boolean indexing: .. 
ipython:: python - df = DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) + df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.query('a <= b') df[df.a <= df.b] df.loc[df.a <= df.b] @@ -225,7 +224,7 @@ In ``pandas`` the equivalent expression, using the .. ipython:: python - df = DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) + df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.eval('a + b') df.a + df.b # same as the previous expression @@ -283,7 +282,7 @@ In ``pandas`` the equivalent expression, using the .. ipython:: python - df = DataFrame({ + df = pd.DataFrame({ 'x': np.random.uniform(1., 168., 120), 'y': np.random.uniform(7., 334., 120), 'z': np.random.uniform(1.7, 20.7, 120), @@ -317,7 +316,7 @@ In Python, since ``a`` is a list, you can simply use list comprehension. .. ipython:: python a = np.array(list(range(1,24))+[np.NAN]).reshape(2,3,4) - DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)]) + pd.DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)]) |meltlist|_ ~~~~~~~~~~~~ @@ -336,7 +335,7 @@ In Python, this list would be a list of tuples, so .. ipython:: python a = list(enumerate(list(range(1,5))+[np.NAN])) - DataFrame(a) + pd.DataFrame(a) For more details and examples see :ref:`the Into to Data Structures documentation `. @@ -361,7 +360,7 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: .. ipython:: python - cheese = DataFrame({'first' : ['John', 'Mary'], + cheese = pd.DataFrame({'first' : ['John', 'Mary'], 'last' : ['Doe', 'Bo'], 'height' : [5.5, 6.0], 'weight' : [130, 150]}) @@ -394,7 +393,7 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`: .. ipython:: python - df = DataFrame({ + df = pd.DataFrame({ 'x': np.random.uniform(1., 168., 12), 'y': np.random.uniform(7., 334., 12), 'z': np.random.uniform(1.7, 20.7, 12), @@ -426,7 +425,7 @@ using :meth:`~pandas.pivot_table`: .. 
ipython:: python - df = DataFrame({ + df = pd.DataFrame({ 'Animal': ['Animal1', 'Animal2', 'Animal3', 'Animal2', 'Animal1', 'Animal2', 'Animal3'], 'FeedType': ['A', 'B', 'A', 'A', 'B', 'B', 'A'], @@ -444,6 +443,30 @@ The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: For more details and examples see :ref:`the reshaping documentation ` or :ref:`the groupby documentation`. +|factor|_ +~~~~~~~~~ + +.. versionadded:: 0.15 + +pandas has a data type for categorical data. + +.. code-block:: r + + cut(c(1,2,3,4,5,6), 3) + factor(c(1,2,3,2,2,3)) + +In pandas this is accomplished with ``pd.cut`` and ``astype("category")``: + +.. ipython:: python + + pd.cut(pd.Series([1,2,3,4,5,6]), 3) + pd.Series([1,2,3,2,2,3]).astype("category") + +For more details and examples see the :ref:`categorical introduction ` and the +:ref:`API documentation `. There is also documentation regarding the +:ref:`differences to R's factor <categorical.rfactor>`. + + .. |c| replace:: ``c`` .. _c: http://stat.ethz.ch/R-manual/R-patched/library/base/html/c.html @@ -477,3 +500,5 @@ For more details and examples see :ref:`the reshaping documentation .. |cast| replace:: ``cast`` .. cast: http://www.inside-r.org/packages/cran/reshape2/docs/cast +.. |factor| replace:: ``factor`` +.. _factor: https://stat.ethz.ch/R-manual/R-devel/library/base/html/factor.html \ No newline at end of file diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 8c0e193ec6348..55e2c440754b1 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -540,21 +540,18 @@ Categoricals in Series/DataFrame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new -methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, +methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. 
(:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`, :issue:`8076`, :issue:`8143`). -For full docs, see the :ref:`Categorical introduction ` and the +For full docs, see the :ref:`categorical introduction ` and the :ref:`API documentation `. .. ipython:: python df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) - # convert the raw grades to a categorical - df["grade"] = pd.Categorical(df["raw_grade"]) - - # Alternative: df["grade"] = df["raw_grade"].astype("category") + df["grade"] = df["raw_grade"].astype("category") df["grade"] # Rename the categories diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 06fee377be749..5eddd2f8dec33 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -34,7 +34,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, right == True (the default), then the bins [1,2,3,4] indicate (1,2], (2,3], (3,4]. labels : array or boolean, default None - Labels to use for bins, or False to return integer bin labels. + Used as labels for the resulting bins. Must be of the same length as the resulting + bins. If False, return only integer indicators of the bins. retbins : bool, optional Whether to return the bins or not. Can be useful if bins is given as a scalar. @@ -47,7 +48,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, ------- out : Categorical or Series or array of integers if labels is False The return type (Categorical or Series) depends on the input: a Series of type category if - input is a Series else Categorical. + input is a Series else Categorical. Bins are represented as categories when categorical + data is returned. bins : ndarray of floats Returned only if `retbins` is True. 
@@ -63,12 +65,15 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Examples -------- - >>> cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) - (array([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], - (6.533, 9.7], (0.191, 3.367]], dtype=object), - array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) - >>> cut(np.ones(5), 4, labels=False) - array([2, 2, 2, 2, 2]) + >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) + ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], (6.533, 9.7], (0.191, 3.367]] + Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]], + array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) + >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, labels=["good","medium","bad"]) + [good, good, good, medium, bad, good] + Categories (3, object): [good < medium < bad] + >>> pd.cut(np.ones(5), 4, labels=False) + array([1, 1, 1, 1, 1], dtype=int64) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 if not np.iterable(bins): @@ -126,7 +131,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3): Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles labels : array or boolean, default None - Labels to use for bin edges, or False to return integer bin labels + Used as labels for the resulting bins. Must be of the same length as the resulting + bins. If False, return only integer indicators of the bins. retbins : bool, optional Whether to return the bins or not. Can be useful if bins is given as a scalar. @@ -135,8 +141,12 @@ def qcut(x, q, labels=None, retbins=False, precision=3): Returns ------- - cat : Categorical or Series - Returns a Series of type category if input is a Series else Categorical. 
+ out : Categorical or Series or array of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series of type category if + input is a Series else Categorical. Bins are represented as categories when categorical + data is returned. + bins : ndarray of floats + Returned only if `retbins` is True. Notes ----- @@ -144,6 +154,14 @@ def qcut(x, q, labels=None, retbins=False, precision=3): Examples -------- + >>> pd.qcut(range(5), 4) + [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]] + Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]] + >>> pd.qcut(range(5), 3, labels=["good","medium","bad"]) + [good, good, medium, bad, bad] + Categories (3, object): [good < medium < bad] + >>> pd.qcut(range(5), 4, labels=False) + array([0, 0, 1, 2, 3], dtype=int64) """ if com.is_integer(q): quantiles = np.linspace(0, 1, q + 1)