From 783f5f0de6ae2ea6a8d8ac1d05b72ad1de4fc948 Mon Sep 17 00:00:00 2001 From: Vincent La Date: Mon, 14 May 2018 13:48:04 -0400 Subject: [PATCH 01/11] Deleting a duplicate example in pd.DataFrame.pivot_table documentation --- pandas/core/frame.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0437c479c9d81..9912a7c45c3a7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5246,15 +5246,7 @@ def pivot(self, index=None, columns=None, values=None): 7 bar two small 6 8 bar two large 7 - >>> table = pivot_table(df, values='D', index=['A', 'B'], - ... columns=['C'], aggfunc=np.sum) - >>> table - C large small - A B - bar one 4.0 5.0 - two 7.0 6.0 - foo one 4.0 1.0 - two NaN 6.0 + This first example aggregates values by taking the sum. >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) From 9e79c2f610d2d2abf47dc58c035972450a6ca3d7 Mon Sep 17 00:00:00 2001 From: Vincent La Date: Mon, 14 May 2018 14:08:41 -0400 Subject: [PATCH 02/11] Fixing a broken example, the broken example referred to a column E that did not exist. Also added more examples --- pandas/core/frame.py | 71 +++++++++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9912a7c45c3a7..823a8df7558ea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5233,18 +5233,19 @@ def pivot(self, index=None, columns=None, values=None): ... "C": ["small", "large", "large", "small", ... "small", "large", "small", "small", ... "large"], - ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]}) + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... 
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) >>> df - A B C D - 0 foo one small 1 - 1 foo one large 2 - 2 foo one large 2 - 3 foo two small 3 - 4 foo two small 3 - 5 bar one large 4 - 6 bar one small 5 - 7 bar two small 6 - 8 bar two large 7 + A B C D E + 0 foo one small 1 2 + 1 foo one large 2 4 + 2 foo one large 2 5 + 3 foo two small 3 5 + 4 foo two small 3 6 + 5 bar one large 4 6 + 6 bar one small 5 8 + 7 bar two small 6 9 + 8 bar two large 7 9 This first example aggregates values by taking the sum. @@ -5253,22 +5254,52 @@ def pivot(self, index=None, columns=None, values=None): >>> table C large small A B - bar one 4.0 5.0 - two 7.0 6.0 - foo one 4.0 1.0 - two NaN 6.0 + bar one 4 5 + two 7 6 + foo one 4 1 + two NaN 6 + + We can also fill missing values using the `fill_value` parameter. + + >>> table = pivot_table(df, values='D', index=['A', 'B'], + ... columns=['C'], aggfunc=np.sum, fill_value=0) + >>> table + C large small + A B + bar one 4 5 + two 7 6 + foo one 4 1 + two 0 6 + + The next example aggregates by taking the mean using values for multiple + columns. + + >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': np.mean, + ... 'E': np.mean}) + >>> table + D E + mean mean + A C + bar large 5.500000 7.500000 + small 5.500000 8.500000 + foo large 2.000000 4.500000 + small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given value + column. >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], ... aggfunc={'D': np.mean, ... 
'E': [min, max, np.mean]}) >>> table D E - mean max median min + mean max mean min A C - bar large 5.500000 16 14.5 13 - small 5.500000 15 14.5 14 - foo large 2.000000 10 9.5 9 - small 2.333333 12 11.0 8 + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 Returns ------- From f60874a4561dae91d726bb1caf21c7e9b58229fe Mon Sep 17 00:00:00 2001 From: Vincent La Date: Mon, 14 May 2018 14:49:18 -0400 Subject: [PATCH 03/11] Adding a clarification note on an error with pivot due to non-unique index/column pairs --- doc/source/reshaping.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 250a1808e496e..ee3ea5007ec40 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -81,7 +81,7 @@ column: .. ipython:: python df['value2'] = df['value'] * 2 - pivoted = df.pivot('date', 'variable') + pivoted = df.pivot(index='date', columns='variable') pivoted You can then select subsets from the pivoted ``DataFrame``: @@ -93,6 +93,12 @@ You can then select subsets from the pivoted ``DataFrame``: Note that this returns a view on the underlying data in the case where the data are homogeneously-typed. +.. note:: + ``pandas.pivot`` will error with a ``ValueError: Index contains duplicate + entries, cannot reshape`` if the index/column pair is not unique. In this + case, consider using ``pandas.pivot_table`` which is a generalization + of pivot that can handle duplicate values for one index/column pair. + .. 
_reshaping.stacking: Reshaping by stacking and unstacking From ab4584d3990af3052ee1b69044be815b721a4294 Mon Sep 17 00:00:00 2001 From: Vincent La Date: Mon, 14 May 2018 14:51:32 -0400 Subject: [PATCH 04/11] In my opinion, it makes more sense to have the overall image at the top of the section --- doc/source/reshaping.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index ee3ea5007ec40..72e63c13645e4 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -17,6 +17,8 @@ Reshaping and Pivot Tables Reshaping by pivoting DataFrame objects --------------------------------------- +.. image:: _static/reshaping_pivot.png + .. ipython:: :suppress: @@ -60,8 +62,6 @@ To select out everything for variable ``A`` we could do: df[df['variable'] == 'A'] -.. image:: _static/reshaping_pivot.png - But suppose we wish to do time series operations with the variables. A better representation would be where the ``columns`` are the unique variables and an ``index`` of dates identifies individual observations. To reshape the data into From 263b1d82d54f4e9eeaead663d687bdbdd35a0bc4 Mon Sep 17 00:00:00 2001 From: Vincent La Date: Mon, 14 May 2018 14:58:26 -0400 Subject: [PATCH 05/11] Removing unnecessary phrase --- doc/source/reshaping.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 72e63c13645e4..25b3e2f97cf23 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -35,7 +35,7 @@ Reshaping by pivoting DataFrame objects In [3]: df = unpivot(tm.makeTimeDataFrame()) -Data is often stored in CSV files or databases in so-called "stacked" or +Data is often stored in so-called "stacked" or "record" format: .. 
ipython:: python From db68b01337cc2ba2babdb4821219a9c2a1e4a3b2 Mon Sep 17 00:00:00 2001 From: Vincent La Date: Mon, 14 May 2018 15:54:37 -0400 Subject: [PATCH 06/11] Adding frequently asked questions section --- doc/source/reshaping.rst | 121 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 117 insertions(+), 4 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 25b3e2f97cf23..d3b11ae73cceb 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -35,8 +35,7 @@ Reshaping by pivoting DataFrame objects In [3]: df = unpivot(tm.makeTimeDataFrame()) -Data is often stored in so-called "stacked" or -"record" format: +Data is often stored in so-called "stacked" or "record" format: .. ipython:: python @@ -96,7 +95,7 @@ are homogeneously-typed. .. note:: ``pandas.pivot`` will error with a ``ValueError: Index contains duplicate entries, cannot reshape`` if the index/column pair is not unique. In this - case, consider using ``pandas.pivot_table`` which is a generalization + case, consider using :func:`~pandas.pivot_table` which is a generalization of pivot that can handle duplicate values for one index/column pair. .. _reshaping.stacking: @@ -704,10 +703,124 @@ handling of NaN: In [3]: np.unique(x, return_inverse=True)[::-1] Out[3]: (array([3, 3, 0, 4, 1, 2]), array([nan, 3.14, inf, 'A', 'B'], dtype=object)) - .. note:: If you just want to handle one column as a categorical variable (like R's factor), you can use ``df["cat_col"] = pd.Categorical(df["col"])`` or ``df["cat_col"] = df["col"].astype("category")``. For full docs on :class:`~pandas.Categorical`, see the :ref:`Categorical introduction ` and the :ref:`API documentation `. + +Frequently Asked Questions (and Examples) +------------------ + +In this section, we will review frequently asked questions and examples. The +column names and relevant column values are named to correspond with how this +DataFrame will be pivoted in the answers below. + +.. 
ipython:: python + + np.random.seed([3,1415]) + n = 20 + + cols = np.array(['key', 'row', 'item', 'col']) + arr1 = (np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str) + + df = pd.DataFrame(np.core.defchararray.add(cols, arr1), columns=cols).join( + pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix('val') + ) + + df + +Question 1 +~~~~~~~~~~ + +How do I pivot ``df`` such that the ``col`` values are columns, +``row`` values are the index, and mean of ``val0`` are the values? In +particular, the resulting DataFrame should look like: + +.. code-block:: ipython + + col col0 col1 col2 col3 col4 + row + row0 0.77 0.605 NaN 0.860 0.65 + row2 0.13 NaN 0.395 0.500 0.25 + row3 NaN 0.310 NaN 0.545 NaN + row4 NaN 0.100 0.395 0.760 0.24 + +**Answer** +This solution uses :func:`~pandas.pivot_table`. Also note that +``aggfunc='mean'`` is the default. It is included here to be explicit. + +.. ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc='mean') + +Note that we can also replace the missing values by using the ``fill_value`` +parameter. + +.. ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc='mean', fill_value=0) + +Also note that we can pass in other aggregation functions as well. For example, +we can also pass in ``sum``. + +.. ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc='sum', fill_value=0) + +Question 2 +~~~~~~~~~~ + +How can I perform multiple aggregations at the same time? For example, what if +I wanted to perform both a ``sum`` and ``mean`` aggregation? + +**Answer** +We can pass in a list to the ``aggfunc`` argument. + +.. ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc=['mean', 'sum']) + +Question 3 +~~~~~~~~~~ + +How can I aggregate over multiple value columns? + +**Answer** +We can pass in a list to the ``values`` parameter. + +.. 
ipython:: python + + df.pivot_table( + values=['val0', 'val1'], index='row', columns='col', aggfunc=['mean']) + +Question 4 +~~~~~~~~~~ + +How can I Group By over multiple columns? + +**Answer** +We can pass in a list to the ``columns`` parameter. + +.. ipython:: python + + df.pivot_table( + values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean']) + +Question 5 +~~~~~~~~~~ + +How can I aggregate the frequency in which the columns and rows occur together +a.k.a. "cross tabulation"? + +**Answer** +We can pass ``size`` to the ``aggfunc`` parameter. + +.. ipython:: python + + df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size') From 8c9ae270c21c0cdec4d2dcd6c4ab50854b7fd4c9 Mon Sep 17 00:00:00 2001 From: Vincent La Date: Mon, 14 May 2018 16:14:44 -0400 Subject: [PATCH 07/11] fixing linter errors --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/frame.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1660c8d9fcdc5..0100b0242747b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1375,6 +1375,7 @@ Reshaping - Bug in :func:`isna`, which cannot handle ambiguous typed lists (:issue:`20675`) - Bug in :func:`concat` which raises an error when concatenating TZ-aware dataframes and all-NaT dataframes (:issue:`12396`) - Bug in :func:`concat` which raises an error when concatenating empty TZ-aware series (:issue:`18447`) +- Updated :func:`~pandas.pivot_table` with more comprehensive examples. 
Also updated Reshaping and Pivot Tables documentation with a Frequenty Asked Questions example (:issue:`19089`) Other ^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 823a8df7558ea..87c777f65ae21 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5271,8 +5271,8 @@ def pivot(self, index=None, columns=None, values=None): foo one 4 1 two 0 6 - The next example aggregates by taking the mean using values for multiple - columns. + The next example aggregates by taking the mean using values for + multiple columns. >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], ... aggfunc={'D': np.mean, @@ -5286,8 +5286,8 @@ def pivot(self, index=None, columns=None, values=None): foo large 2.000000 4.500000 small 2.333333 4.333333 - We can also calculate multiple types of aggregations for any given value - column. + We can also calculate multiple types of aggregations for any given + value column. >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], ... aggfunc={'D': np.mean, From e0d95011bb56883c8248213af8d3dc764bdc7c2f Mon Sep 17 00:00:00 2001 From: Vincent La Date: Tue, 15 May 2018 11:44:40 -0400 Subject: [PATCH 08/11] Removing whatsnew and fixing some typos --- doc/source/reshaping.rst | 4 ++-- doc/source/whatsnew/v0.23.0.txt | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index d3b11ae73cceb..03cdcfbe11e3c 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -93,7 +93,7 @@ Note that this returns a view on the underlying data in the case where the data are homogeneously-typed. .. note:: - ``pandas.pivot`` will error with a ``ValueError: Index contains duplicate + :func:`~pandas.pivot` will error with a ``ValueError: Index contains duplicate entries, cannot reshape`` if the index/column pair is not unique. 
In this case, consider using :func:`~pandas.pivot_table` which is a generalization of pivot that can handle duplicate values for one index/column pair. @@ -735,7 +735,7 @@ Question 1 ~~~~~~~~~~ How do I pivot ``df`` such that the ``col`` values are columns, -``row`` values are the index, and mean of ``val0`` are the values? In +``row`` values are the index, and the mean of ``val0`` are the values? In particular, the resulting DataFrame should look like: .. code-block:: ipython diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index e29153a91dc86..89dab728d2bd4 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1384,7 +1384,6 @@ Reshaping - Bug in :func:`isna`, which cannot handle ambiguous typed lists (:issue:`20675`) - Bug in :func:`concat` which raises an error when concatenating TZ-aware dataframes and all-NaT dataframes (:issue:`12396`) - Bug in :func:`concat` which raises an error when concatenating empty TZ-aware series (:issue:`18447`) -- Updated :func:`~pandas.pivot_table` with more comprehensive examples. Also updated Reshaping and Pivot Tables documentation with a Frequenty Asked Questions example (:issue:`19089`) Other ^^^^^ From 61f9f437ae73322044e7c60346edac68dfc5a091 Mon Sep 17 00:00:00 2001 From: Vincent La Date: Tue, 15 May 2018 12:11:07 -0400 Subject: [PATCH 09/11] Rephrasing reshaping docs instead of q+a just examples --- doc/source/reshaping.rst | 67 ++++++++++++++-------------------------- pandas/core/frame.py | 3 +- 2 files changed, 24 insertions(+), 46 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 03cdcfbe11e3c..a2424562256a0 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -710,8 +710,8 @@ handling of NaN: see the :ref:`Categorical introduction ` and the :ref:`API documentation `. 
-Frequently Asked Questions (and Examples) ------------------- +Examples +-------- In this section, we will review frequently asked questions and examples. The column names and relevant column values are named to correspond with how this @@ -723,18 +723,16 @@ DataFrame will be pivoted in the answers below. n = 20 cols = np.array(['key', 'row', 'item', 'col']) - arr1 = (np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str) - - df = pd.DataFrame(np.core.defchararray.add(cols, arr1), columns=cols).join( - pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix('val') - ) + df = cols + pd.DataFrame((np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str)) + df.columns = cols + df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix('val')) df -Question 1 -~~~~~~~~~~ +Pivoting with Single Aggregations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -How do I pivot ``df`` such that the ``col`` values are columns, +Suppose we wanted to pivot ``df`` such that the ``col`` values are columns, ``row`` values are the index, and the mean of ``val0`` are the values? In particular, the resulting DataFrame should look like: @@ -747,7 +745,6 @@ particular, the resulting DataFrame should look like: row3 NaN 0.310 NaN 0.545 NaN row4 NaN 0.100 0.395 0.760 0.24 -**Answer** This solution uses :func:`~pandas.pivot_table`. Also note that ``aggfunc='mean'`` is the default. It is included here to be explicit. @@ -772,55 +769,37 @@ we can also pass in ``sum``. df.pivot_table( values='val0', index='row', columns='col', aggfunc='sum', fill_value=0) -Question 2 -~~~~~~~~~~ +Another aggregation we can do is calculate the frequency in which the columns +and rows occur together a.k.a. "cross tabulation". To do this, we can pass +``size`` to the ``aggfunc`` parameter. -How can I perform multiple aggregations at the same time? For example, what if -I wanted to perform both a ``sum`` and ``mean`` aggregation? +.. 
ipython:: python + + df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size') -**Answer** -We can pass in a list to the ``aggfunc`` argument. +Pivoting with Multiple Aggregations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We can also perform multiple aggregations. For example, to perform both a +``sum`` and ``mean``, we can pass in a list to the ``aggfunc`` argument. .. ipython:: python df.pivot_table( values='val0', index='row', columns='col', aggfunc=['mean', 'sum']) -Question 3 -~~~~~~~~~~ - -How can I aggregate over multiple value columns? - -**Answer** -We can pass in a list to the ``values`` parameter. +Note that to aggregate over multiple value columns, we can pass in a list to +the ``values`` parameter. .. ipython:: python df.pivot_table( values=['val0', 'val1'], index='row', columns='col', aggfunc=['mean']) -Question 4 -~~~~~~~~~~ - -How can I Group By over multiple columns? - -**Answer** -We can pass in a list to the ``columns`` parameter. +Note that to subdivide over multiple columns, we can pass in a list to the +``columns`` parameter. .. ipython:: python df.pivot_table( values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean']) - -Question 5 -~~~~~~~~~~ - -How can I aggregate the frequency in which the columns and rows occur together -a.k.a. "cross tabulation"? - -**Answer** -We can pass ``size`` to the ``aggfunc`` parameter. - -.. ipython:: python - - df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 87c777f65ae21..07c89fec82378 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5271,8 +5271,7 @@ def pivot(self, index=None, columns=None, values=None): foo one 4 1 two 0 6 - The next example aggregates by taking the mean using values for - multiple columns. + The next example aggregates by taking the mean across multiple columns. >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], ... 
aggfunc={'D': np.mean, From 5283d29b23a69b67a61d2c333d3e3e200c5e6241 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Nov 2018 09:50:17 -0700 Subject: [PATCH 10/11] whitespace --- doc/source/reshaping.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 0564979d5ca2a..a231e47bee870 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -713,7 +713,7 @@ handling of NaN: Examples -------- -In this section, we will review frequently asked questions and examples. The +In this section, we will review frequently asked questions and examples. The column names and relevant column values are named to correspond with how this DataFrame will be pivoted in the answers below. @@ -721,7 +721,7 @@ DataFrame will be pivoted in the answers below. np.random.seed([3,1415]) n = 20 - + cols = np.array(['key', 'row', 'item', 'col']) df = cols + pd.DataFrame((np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str)) df.columns = cols @@ -737,9 +737,9 @@ Suppose we wanted to pivot ``df`` such that the ``col`` values are columns, particular, the resulting DataFrame should look like: .. code-block:: ipython - + col col0 col1 col2 col3 col4 - row + row row0 0.77 0.605 NaN 0.860 0.65 row2 0.13 NaN 0.395 0.500 0.25 row3 NaN 0.310 NaN 0.545 NaN From c146d7c34fdf2532f6a63a4c64cffd2f924a8faa Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 3 Nov 2018 06:47:30 +0000 Subject: [PATCH 11/11] pep8 issue --- doc/source/reshaping.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index a231e47bee870..feb58c1c11dfd 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -719,7 +719,7 @@ DataFrame will be pivoted in the answers below. .. ipython:: python - np.random.seed([3,1415]) + np.random.seed([3, 1415]) n = 20 cols = np.array(['key', 'row', 'item', 'col'])