From 155e85eb91728a538fb8a257e1b38a1cb37fee33 Mon Sep 17 00:00:00 2001
From: pdpark
Date: Wed, 27 Dec 2017 00:14:44 -0800
Subject: [PATCH 1/5] Added note about groupby excluding Decimal columns by
 default

---
 doc/source/groupby.rst | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index 552ddabb7359a..16ec0a8f027a5 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -497,6 +497,28 @@ index are the group names and whose values are the sizes of each group.
 
    ``nth`` can act as a reducer *or* a filter, see :ref:`here <groupby.nth>`
 
+   Decimal columns are "nuisance" columns that ``.agg`` automatically excludes in groupby.
+
+   If you do wish to aggregate them, you must do so explicitly:
+
+.. ipython:: python
+
+   from decimal import Decimal
+   dec = pd.DataFrame(
+       {'name': ['foo', 'bar', 'foo', 'bar'],
+        'title': ['boo', 'far', 'boo', 'far'],
+        'id': [123, 456, 123, 456],
+        'int_column': [1, 2, 3, 4],
+        'dec_column1': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')],
+        'dec_column2': [Decimal('0.20'), Decimal('0.30'), Decimal('0.55'), Decimal('0.60')]
+       },
+       columns=['name', 'title', 'id', 'int_column', 'dec_column1', 'dec_column2']
+   )
+
+   dec.groupby(['name', 'title', 'id'], as_index=False).sum()
+
+   dec.groupby(['name', 'title', 'id'], as_index=False).agg({'dec_column1': 'sum', 'dec_column2': 'sum'})
+
 .. _groupby.aggregate.multifunc:
 
 Applying multiple functions at once

From 5bb33211e97ed1a82e58b10708277ce5a0563ad5 Mon Sep 17 00:00:00 2001
From: pdpark
Date: Wed, 27 Dec 2017 19:31:00 -0800
Subject: [PATCH 2/5] Moved note about exclusion of Decimal columns from agg
 functions to automatic-exclusion-of-nuisance-columns section

---
 doc/source/groupby.rst | 58 ++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index 16ec0a8f027a5..c24f05cc5a75a 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -497,28 +497,6 @@ index are the group names and whose values are the sizes of each group.
 
    ``nth`` can act as a reducer *or* a filter, see :ref:`here <groupby.nth>`
 
-   Decimal columns are "nuisance" columns that ``.agg`` automatically excludes in groupby.
-
-   If you do wish to aggregate them, you must do so explicitly:
-
-.. ipython:: python
-
-   from decimal import Decimal
-   dec = pd.DataFrame(
-       {'name': ['foo', 'bar', 'foo', 'bar'],
-        'title': ['boo', 'far', 'boo', 'far'],
-        'id': [123, 456, 123, 456],
-        'int_column': [1, 2, 3, 4],
-        'dec_column1': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')],
-        'dec_column2': [Decimal('0.20'), Decimal('0.30'), Decimal('0.55'), Decimal('0.60')]
-       },
-       columns=['name', 'title', 'id', 'int_column', 'dec_column1', 'dec_column2']
-   )
-
-   dec.groupby(['name', 'title', 'id'], as_index=False).sum()
-
-   dec.groupby(['name', 'title', 'id'], as_index=False).agg({'dec_column1': 'sum', 'dec_column2': 'sum'})
-
 .. _groupby.aggregate.multifunc:
 
 Applying multiple functions at once
@@ -977,6 +955,42 @@ will be (silently) dropped. Thus, this does not pose any problems:
 
    df.groupby('A').std()
 
+.. note::
+   Decimal columns are also "nuisance" columns. They are automatically excluded from aggregation functions in groupby.
+
+   If you do wish to include Decimal columns in the aggregation, you must do so explicitly:
+
+.. ipython:: python
+
+   from decimal import Decimal
+   dec = pd.DataFrame(
+       {'name': ['foo', 'bar', 'foo', 'bar'],
+        'title': ['boo', 'far', 'boo', 'far'],
+        'id': [123, 456, 123, 456],
+        'int_column': [1, 2, 3, 4],
+        'dec_column1': [Decimal('0.50'), Decimal('0.15'), Decimal('0.25'), Decimal('0.40')],
+        'dec_column2': [Decimal('0.20'), Decimal('0.30'), Decimal('0.55'), Decimal('0.60')]
+       },
+       columns=['name', 'title', 'id', 'int_column', 'dec_column1', 'dec_column2']
+   )
+
+   dec.head()
+
+   dec.dtypes
+
+   # Decimal columns are excluded from the sum by default
+   dec.groupby(['name', 'title', 'id'], as_index=False).sum()
+
+   # Decimal columns can be summed explicitly by themselves...
+   dec.groupby(['name', 'title', 'id'], as_index=False)['dec_column1', 'dec_column2'].sum()
+
+   # ...but are excluded again when combined with standard data types
+   dec.groupby(['name', 'title', 'id'], as_index=False)['int_column', 'dec_column1', 'dec_column2'].sum()
+
+   # Use .agg to aggregate standard and "nuisance" data types at the same time
+   dec.groupby(['name', 'title', 'id'], as_index=False).agg({'int_column': 'sum', 'dec_column1': 'sum', 'dec_column2': 'sum'})
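+
+   # Alternatively, if exact decimal precision is not required, cast the
+   # Decimal columns to float; float columns are aggregated by default.
+   # (An illustrative sketch only; dec_float is an arbitrary name.)
+   dec_float = dec.assign(dec_column1=dec['dec_column1'].astype(float),
+                          dec_column2=dec['dec_column2'].astype(float))
+   dec_float.groupby(['name', 'title', 'id'], as_index=False).sum()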
+
+
 .. _groupby.missing:
 
 NA and NaT group handling

From 6cf1c2c47a8a27faf9116f7396fd26339a6ae726 Mon Sep 17 00:00:00 2001
From: pdpark
Date: Fri, 5 Jan 2018 00:00:16 -0800
Subject: [PATCH 3/5] Adding example of exploding nested lists

---
 doc/source/gotchas.rst | 94 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst
index 5da0f4fd07819..7ed0b1c800183 100644
--- a/doc/source/gotchas.rst
+++ b/doc/source/gotchas.rst
@@ -332,3 +332,97 @@ using something similar to the following:
 See `the NumPy documentation on byte order
 <https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
 details.
+
+
+Alternative to storing lists in pandas DataFrame cells
+------------------------------------------------------
+Storing nested lists or arrays inside a pandas object should be avoided for performance and memory reasons. Instead, they should be "exploded" into a flat DataFrame structure.
+
+Example of exploding nested lists into a DataFrame:
+
+.. ipython:: python
+
+   from collections import OrderedDict
+   df = pd.DataFrame(OrderedDict([('name', ['A.J. Price'] * 3),
+                                  ('opponent', ['76ers', 'blazers', 'bobcats']),
+                                  ('attribute x', ['A', 'B', 'C'])
+                                  ]))
+   df
+
+   nn = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3
+   nn
+
+   # Step 1: Expand the nested lists into their own columns alongside the "parent" columns.
+   df2 = pd.concat([df[['name', 'opponent']], pd.DataFrame(nn)], axis=1)
+   df2
+
+   # Step 2: Move the "parent" columns into the index.
+   # Note that any column not selected in Step 1 ('attribute x' here)
+   # is not part of the new DataFrame.
+   df3 = df2.set_index(['name', 'opponent'])
+   df3
+
+   # Step 3: Stack the new columns as rows; this creates a new index level
+   # we'll want to drop in the next step.
+   # Note that at this point we have a Series, not a DataFrame.
+   ser = df3.stack()
+   ser
+
+   # Step 4: Drop the extraneous index level created by the stack.
+   ser.reset_index(level=2, drop=True, inplace=True)
+   ser
+
+   # Step 5: Create a DataFrame from the Series.
+   df4 = ser.to_frame('nearest_neighbors')
+   df4
+
+   # All steps in a single chained expression
+   df4 = (df2.set_index(['name', 'opponent'])
+             .stack()
+             .reset_index(level=2, drop=True)
+             .to_frame('nearest_neighbors'))
+   df4
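+
+The same steps can be wrapped in a small reusable helper; this is only a sketch of the technique shown above, and ``explode_columns`` is a hypothetical function, not part of pandas:
+
+.. ipython:: python
+
+   def explode_columns(frame, index_cols, value_name):
+       # Move the parent columns into the index, stack the remaining
+       # columns into rows, and drop the stacked column-label level.
+       return (frame.set_index(index_cols)
+                    .stack()
+                    .reset_index(level=len(index_cols), drop=True)
+                    .to_frame(value_name))
+
+   explode_columns(df2, ['name', 'opponent'], 'nearest_neighbors')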
+
+Example of exploding a list embedded in a DataFrame:
+
+.. ipython:: python
+
+   df = pd.DataFrame(OrderedDict([('name', ['A.J. Price'] * 3),
+                                  ('opponent', ['76ers', 'blazers', 'bobcats']),
+                                  ('attribute x', ['A', 'B', 'C']),
+                                  ('nearest_neighbors', [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3)
+                                  ]))
+   df
+
+   # Step 1: Create an index from the "parent" columns to be included in the final DataFrame.
+   df2 = df.set_index(['name', 'opponent'])
+   df2
+
+   # Step 2: Transform the column of lists into a DataFrame, one column per list element.
+   # Note that only the index of df2 is retained -
+   # its other columns are not part of the new DataFrame.
+   df3 = df2.nearest_neighbors.apply(pd.Series)
+   df3
+
+   # Step 3: Stack the new columns as rows; this creates a new index level
+   # we'll want to drop in the next step.
+   # Note that at this point we have a Series, not a DataFrame.
+   ser = df3.stack()
+   ser
+
+   # Step 4: Drop the extraneous index level created by the stack.
+   ser.reset_index(level=2, drop=True, inplace=True)
+   ser
+
+   # Step 5: Create a DataFrame from the Series.
+   df4 = ser.to_frame('nearest_neighbors')
+   df4
+
+   # All steps in a single chained expression
+   df4 = (df.set_index(['name', 'opponent'])
+            .nearest_neighbors.apply(pd.Series)
+            .stack()
+            .reset_index(level=2, drop=True)
+            .to_frame('nearest_neighbors'))
+   df4

From e212e78e1f8ac2fbd97857ccac4da1d89df6c41a Mon Sep 17 00:00:00 2001
From: pdpark
Date: Sat, 6 Jan 2018 22:06:16 -0800
Subject: [PATCH 4/5] docs: Add warning to treat group chunks as immutable to
 "Flexible apply" section of groupby.rst

Resolves: #14180
---
 doc/source/groupby.rst | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index c24f05cc5a75a..1f9befb4bd59c 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -915,13 +915,18 @@ The dimension of the returned result can also change:
 
 So depending on the path taken, and exactly what you are grouping. Thus
 the grouped columns(s) may be included in the output as well as set the indices.
 
 .. warning::
 
-   In the current implementation apply calls func twice on the
-   first group to decide whether it can take a fast or slow code
-   path. This can lead to unexpected behavior if func has
-   side-effects, as they will take effect twice for the first
-   group.
+   * In the current implementation, ``apply`` calls ``func`` twice on the
+     first group to decide whether it can take a fast or slow code
+     path. This can lead to unexpected behavior if ``func`` has
+     side-effects, as they will take effect twice for the first
+     group.
+
+   * ``apply`` should not perform in-place operations on the group chunk.
+     Group chunks should be treated as immutable, and changes to a
+     group chunk may produce unexpected results.
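+
+     A minimal sketch of the safe pattern - copy the chunk before
+     changing it (the frame and function below are illustrative only):
+
+     .. ipython:: python
+
+        def scale_values(group):
+            group = group.copy()  # work on a copy, not the chunk itself
+            group['value'] = group['value'] * 2
+            return group
+
+        pd.DataFrame({'key': ['a', 'a', 'b'],
+                      'value': [1, 2, 3]}).groupby('key').apply(scale_values)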
 
 .. ipython:: python

From e239981a1eb322c8169e27fe74302bf0023b7165 Mon Sep 17 00:00:00 2001
From: pdpark
Date: Sun, 7 Jan 2018 22:19:32 -0800
Subject: [PATCH 5/5] Doc: Add example of merging a Series with a DataFrame

Resolves: #12550
---
 doc/source/merging.rst | 26 ++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/doc/source/merging.rst b/doc/source/merging.rst
index 86d2ec2254057..9aad7e5bf079f 100644
--- a/doc/source/merging.rst
+++ b/doc/source/merging.rst
@@ -712,6 +712,32 @@ either the left or right tables, the values in the joined table will be
                   labels=['left', 'right'], vertical=False);
    plt.close('all');
 
+To join a Series and a DataFrame, the Series has to be transformed into a DataFrame first:
+
+.. ipython:: python
+
+   df = pd.DataFrame({"Let": ["A", "B", "C"], "Num": [1, 2, 3]})
+   df
+
+   # The Series has a MultiIndex whose levels correspond to columns in the DataFrame we want to merge with
+   ser = pd.Series(
+       ['a', 'b', 'c', 'd', 'e', 'f'],
+       index=pd.MultiIndex.from_arrays([["A", "B", "C"] * 2, [1, 2, 3, 4, 5, 6]])
+   )
+   ser
+
+   # Name the levels of the MultiIndex
+   ser.index.names = ['Let', 'Num']
+   ser
+
+   # reset_index moves the index levels into columns and returns a DataFrame
+   df2 = ser.reset_index()
+   type(df2)
+
+   # Now merge the two DataFrames
+   pd.merge(df, df2, on=['Let', 'Num'])
+
+
 Here is another example with duplicate join keys in DataFrames:
 
 .. ipython:: python