From e91444edd1ce8afbc8617ee1cff1d8dde5dfd00e Mon Sep 17 00:00:00 2001 From: pdpark Date: Fri, 12 Jan 2018 15:01:03 -0800 Subject: [PATCH 1/2] DOC: Adds example of alternative to storing lists in a Dataframe Restores: #17027 --- doc/source/gotchas.rst | 89 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index bc490877e190d..0e99cebc30abd 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -332,3 +332,92 @@ using something similar to the following: See `the NumPy documentation on byte order `__ for more details. + + +Alternative to storing lists in DataFrame Cells +------------------------------------------------------ +Storing nested lists/arrays inside a pandas object should be avoided for performance and memory use reasons. Instead, they should be "exploded" into a flat ``DataFrame`` structure. + +Example of exploding nested lists into a DataFrame: + +.. ipython:: python + + df = pd.DataFrame({'name': ['A.J. Price'] * 3, + 'opponent': ['76ers', 'blazers', 'bobcats'], + 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, + columns=['name','opponent','attribute x','nearest_neighbors']) + df + + nearest_neighbors = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3 + nearest_neighbors + + #. Combine the "parent" columns with the nested lists; each list position becomes its own column + df2 = pd.concat([df[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1) + df2 + + #. Move the "parent" columns into the index so that only the list elements remain as columns. + # Note that the "parent" columns are kept as index levels - + # they label each value once the columns are stacked into rows + df3 = df2.set_index(['name', 'opponent']) + df3 + + #. Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. 
+ # Note that at this point we have a Series, not a DataFrame + ser = df3.stack() + ser + + #. Drop the extraneous index level created by the stack + ser.reset_index(level=2, drop=True, inplace=True) + ser + + #. Create a DataFrame from the Series + df4 = ser.to_frame('nearest_neighbors') + df4 + + # All steps in one chained expression + df4 = (df2.set_index(['name', 'opponent']) + .stack() + .reset_index(level=2, drop=True) + .to_frame('nearest_neighbors')) + df4 + +Example of exploding a list embedded in a DataFrame: + +.. ipython:: python + + df = pd.DataFrame({'name': ['A.J. Price'] * 3, + 'opponent': ['76ers', 'blazers', 'bobcats'], + 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, + columns=['name','opponent','attribute x','nearest_neighbors']) + df + + #. Create an index with the "parent" columns to be included in the final Dataframe + df2 = df.set_index(['name', 'opponent']) + df2 + + #. Transform the column with lists into Series, which become columns in a new DataFrame. + # Note that only the index from the original df is retained - + # any other columns in the original df are not part of the new df + df3 = df2.nearest_neighbors.apply(pd.Series) + df3 + + #. Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. + # Note that at this point we have a Series, not a DataFrame + ser = df3.stack() + ser + + #. Drop the extraneous index level created by the stack + ser.reset_index(level=2, drop=True, inplace=True) + ser + + #. Create a DataFrame from the Series + df4 = ser.to_frame('nearest_neighbors') + df4 + + # All steps in one chained expression + df4 = (df.set_index(['name', 'opponent']) + .nearest_neighbors.apply(pd.Series) + .stack() + .reset_index(level=2, drop=True) + .to_frame('nearest_neighbors')) + df4 From 11ff8a7d898130de49d8a366501ac90b1408b727 Mon Sep 17 00:00:00 2001 From: pdpark Date: Fri, 12 Jan 2018 15:43:10 -0800 Subject: [PATCH 2/2] Doc: Fixes issues with code examples. 
--- doc/source/gotchas.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 0e99cebc30abd..b2854670739f4 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -343,9 +343,8 @@ Example of exploding nested lists into a DataFrame: .. ipython:: python df = pd.DataFrame({'name': ['A.J. Price'] * 3, - 'opponent': ['76ers', 'blazers', 'bobcats'], - 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, - columns=['name','opponent','attribute x','nearest_neighbors']) + 'opponent': ['76ers', 'blazers', 'bobcats']}, + columns=['name','opponent']) df nearest_neighbors = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3 @@ -388,7 +387,7 @@ Example of exploding a list embedded in a dataframe: df = pd.DataFrame({'name': ['A.J. Price'] * 3, 'opponent': ['76ers', 'blazers', 'bobcats'], 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, - columns=['name','opponent','attribute x','nearest_neighbors']) + columns=['name','opponent','nearest_neighbors']) df #. Create an index with the "parent" columns to be included in the final Dataframe