From e91444edd1ce8afbc8617ee1cff1d8dde5dfd00e Mon Sep 17 00:00:00 2001 From: pdpark Date: Fri, 12 Jan 2018 15:01:03 -0800 Subject: [PATCH 1/5] DOC: Adds example of alternative to storing lists in a Dataframe Restores: #17027 --- doc/source/gotchas.rst | 89 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index bc490877e190d..0e99cebc30abd 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -332,3 +332,92 @@ using something similar to the following: See `the NumPy documentation on byte order `__ for more details. + + +Alternative to storing lists in DataFrame Cells +------------------------------------------------------ +Storing nested lists/arrays inside a pandas object should be avoided for performance and memory use reasons. Instead they should be "exploded" into a flat ``DataFrame`` structure. + +Example of exploding nested lists into a DataFrame: + +.. ipython:: python + + df = pd.DataFrame({'name': ['A.J. Price'] * 3, + 'opponent': ['76ers', 'blazers', 'bobcats'], + 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, + columns=['name','opponent','attribute x','nearest_neighbors']) + df + + nearest_neighbors = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3 + nearest_neighbors + + #. Create an index with the "parent" columns to be included in the final Dataframe + df2 = pd.concat([df[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1) + df2 + + #. Transform the column with lists into series, which become columns in a new Dataframe. + # Note that only the index from the original df is retained - + # any other columns in the original df are not part of the new df + df3 = df2.set_index(['name', 'opponent']) + df3 + + #. Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. 
+ # Note that at this point we have a Series, not a Dataframe + ser = df3.stack() + ser + + #. Drop the extraneous index level created by the stack + ser.reset_index(level=2, drop=True, inplace=True) + ser + + #. Create a Dataframe from the Series + df4 = ser.to_frame('nearest_neighbors') + df4 + + # All steps in one stack + df4 = (df2.set_index(['name', 'opponent']) + .stack() + .reset_index(level=2, drop=True) + .to_frame('nearest_neighbors')) + df4 + +Example of exploding a list embedded in a dataframe: + +.. ipython:: python + + df = pd.DataFrame({'name': ['A.J. Price'] * 3, + 'opponent': ['76ers', 'blazers', 'bobcats'], + 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, + columns=['name','opponent','attribute x','nearest_neighbors']) + df + + #. Create an index with the "parent" columns to be included in the final Dataframe + df2 = df.set_index(['name', 'opponent']) + df2 + + #. Transform the column with lists into series, which become columns in a new Dataframe. + # Note that only the index from the original df is retained - + # any other columns in the original df are not part of the new df + df3 = df2.nearest_neighbors.apply(pd.Series) + df3 + + #. Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. + # Note that at this point we have a Series, not a Dataframe + ser = df3.stack() + ser + + #. Drop the extraneous index level created by the stack + ser.reset_index(level=2, drop=True, inplace=True) + ser + + #. Create a Dataframe from the Series + df4 = ser.to_frame('nearest_neighbors') + df4 + + # All steps in one stack + df4 = (df.set_index(['name', 'opponent']) + .nearest_neighbors.apply(pd.Series) + .stack() + .reset_index(level=2, drop=True) + .to_frame('nearest_neighbors')) + df4 From 11ff8a7d898130de49d8a366501ac90b1408b727 Mon Sep 17 00:00:00 2001 From: pdpark Date: Fri, 12 Jan 2018 15:43:10 -0800 Subject: [PATCH 2/5] Doc: Fixes issues with code examples. 
--- doc/source/gotchas.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 0e99cebc30abd..b2854670739f4 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -343,9 +343,8 @@ Example of exploding nested lists into a DataFrame: .. ipython:: python df = pd.DataFrame({'name': ['A.J. Price'] * 3, - 'opponent': ['76ers', 'blazers', 'bobcats'], - 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, - columns=['name','opponent','attribute x','nearest_neighbors']) + 'opponent': ['76ers', 'blazers', 'bobcats']}, + columns=['name','opponent']) df nearest_neighbors = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3 @@ -388,7 +387,7 @@ Example of exploding a list embedded in a dataframe: df = pd.DataFrame({'name': ['A.J. Price'] * 3, 'opponent': ['76ers', 'blazers', 'bobcats'], 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, - columns=['name','opponent','attribute x','nearest_neighbors']) + columns=['name','opponent','nearest_neighbors']) df #. Create an index with the "parent" columns to be included in the final Dataframe From 6d379b49b209b7e7284fe97ee734f05bc964b6a8 Mon Sep 17 00:00:00 2001 From: Gautam Date: Mon, 8 Oct 2018 14:15:16 +0530 Subject: [PATCH 3/5] DOC: Add example of alternative to storing lists in a Dataframe fix PR19215 --- doc/source/gotchas.rst | 97 ++++++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 36 deletions(-) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index fa16a357f82d0..625ab8b12a638 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -339,34 +339,43 @@ details. Alternative to storing lists in DataFrame Cells ------------------------------------------------------- +----------------------------------------------- Storing nested lists/arrays inside a pandas object should be avoided for performance and memory use reasons. 
Instead they should be "exploded" into a flat ``DataFrame`` structure. Example of exploding nested lists into a DataFrame: .. ipython:: python - df = pd.DataFrame({'name': ['A.J. Price'] * 3, + dframe = pd.DataFrame({'name': ['A.J. Price'] * 3, 'opponent': ['76ers', 'blazers', 'bobcats']}, columns=['name','opponent']) - df + dframe nearest_neighbors = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3 nearest_neighbors - #. Create an index with the "parent" columns to be included in the final Dataframe - df2 = pd.concat([df[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1) - df2 +Create an index with the "parent" columns to be included in the final Dataframe + +.. ipython:: python + + df = pd.concat([dframe[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1) + df + +Transform the column with lists into series, which become columns in a new Dataframe. + Note that only the index from the original df is retained - + Any other columns in the original df are not part of the new df + +.. ipython:: python + + df = df.set_index(['name', 'opponent']) + df + +Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. + Note that at this point we have a Series, not a Dataframe - #. Transform the column with lists into series, which become columns in a new Dataframe. - # Note that only the index from the original df is retained - - # any other columns in the original df are not part of the new df - df3 = df2.set_index(['name', 'opponent']) - df3 +.. ipython:: python - #. Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. - # Note that at this point we have a Series, not a Dataframe - ser = df3.stack() + ser = df.stack() ser #. Drop the extraneous index level created by the stack @@ -374,39 +383,52 @@ Example of exploding nested lists into a DataFrame: ser #. 
Create a Dataframe from the Series - df4 = ser.to_frame('nearest_neighbors') - df4 + df = ser.to_frame('nearest_neighbors') + df + +All steps in one stack + +.. ipython:: python - # All steps in one stack - df4 = (df2.set_index(['name', 'opponent']) + df = (dframe.concat([df[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1) + .set_index(['name', 'opponent']) .stack() .reset_index(level=2, drop=True) .to_frame('nearest_neighbors')) - df4 + df Example of exploding a list embedded in a dataframe: .. ipython:: python - df = pd.DataFrame({'name': ['A.J. Price'] * 3, + dframe = pd.DataFrame({'name': ['A.J. Price'] * 3, 'opponent': ['76ers', 'blazers', 'bobcats'], 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, columns=['name','opponent','nearest_neighbors']) + dframe + +Create an index with the "parent" columns to be included in the final Dataframe + +.. ipython:: python + + df = dframe.set_index(['name', 'opponent']) + df + +Transform the column with lists into series, which become columns in a new Dataframe. + Note that only the index from the original df is retained - + +.. ipython:: python + + any other columns in the original df are not part of the new df + df = df.nearest_neighbors.apply(pd.Series) df - #. Create an index with the "parent" columns to be included in the final Dataframe - df2 = df.set_index(['name', 'opponent']) - df2 +Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. + Note that at this point we have a Series, not a Dataframe - #. Transform the column with lists into series, which become columns in a new Dataframe. - # Note that only the index from the original df is retained - - # any other columns in the original df are not part of the new df - df3 = df2.nearest_neighbors.apply(pd.Series) - df3 +.. ipython:: python - #. Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. 
- # Note that at this point we have a Series, not a Dataframe - ser = df3.stack() + ser = df.stack() ser #. Drop the extraneous index level created by the stack @@ -414,13 +436,16 @@ Example of exploding a list embedded in a dataframe: ser #. Create a Dataframe from the Series - df4 = ser.to_frame('nearest_neighbors') - df4 + df = ser.to_frame('nearest_neighbors') + df - # All steps in one stack - df4 = (df.set_index(['name', 'opponent']) +All steps in one stack + +.. ipython:: python + + df = (dframe.set_index(['name', 'opponent']) .nearest_neighbors.apply(pd.Series) .stack() .reset_index(level=2, drop=True) .to_frame('nearest_neighbors')) - df4 + df From a5a9ec2a6df8d4a1631c1bdd51791092ddf21019 Mon Sep 17 00:00:00 2001 From: Gautam Date: Mon, 8 Oct 2018 14:31:55 +0530 Subject: [PATCH 4/5] DOC: Adds example of alternative to storing lists in a Dataframe - made more fixes --- doc/source/gotchas.rst | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 625ab8b12a638..3670e949a4592 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -361,17 +361,16 @@ Create an index with the "parent" columns to be included in the final Dataframe df = pd.concat([dframe[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1) df -Transform the column with lists into series, which become columns in a new Dataframe. - Note that only the index from the original df is retained - - Any other columns in the original df are not part of the new df +Transform the column with lists into series, which become columns in a new Dataframe. +Note that only the index from the original df is retained; any other columns in the original df are not part of the new df. .. ipython:: python df = df.set_index(['name', 'opponent']) df -Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. 
- Note that at this point we have a Series, not a Dataframe +Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. +Note that at this point we have a Series, not a DataFrame. .. ipython:: python @@ -414,17 +413,16 @@ Create an index with the "parent" columns to be included in the final Dataframe df = dframe.set_index(['name', 'opponent']) df -Transform the column with lists into series, which become columns in a new Dataframe. - Note that only the index from the original df is retained - +Transform the column with lists into series, which become columns in a new Dataframe. +Note that only the index from the original df is retained; any other columns in the original df are not part of the new df. .. ipython:: python - any other columns in the original df are not part of the new df df = df.nearest_neighbors.apply(pd.Series) df -Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. - Note that at this point we have a Series, not a Dataframe +Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. +Note that at this point we have a Series, not a DataFrame. .. ipython:: python From 4952597f4a94a4e21a5ec72fd8d5fef2dfa81a5f Mon Sep 17 00:00:00 2001 From: Gautam Date: Mon, 8 Oct 2018 15:55:19 +0530 Subject: [PATCH 5/5] Doc: Adds example of exploding lists into columns instead of storing in dataframe cells --- doc/source/gotchas.rst | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 3670e949a4592..1da7e38c7bab7 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -346,10 +346,10 @@ Example of exploding nested lists into a DataFrame: .. ipython:: python - dframe = pd.DataFrame({'name': ['A.J. 
Price'] * 3, 'opponent': ['76ers', 'blazers', 'bobcats']}, - columns=['name','opponent']) - dframe + columns=['name','opponent']) + df nearest_neighbors = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3 nearest_neighbors @@ -358,7 +358,7 @@ Create an index with the "parent" columns to be included in the final Dataframe .. ipython:: python - df = pd.concat([dframe[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1) + df = pd.concat([df[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1) df Transform the column with lists into series, which become columns in a new Dataframe. @@ -385,32 +385,22 @@ Note that at this point we have a Series, not a Dataframe df = ser.to_frame('nearest_neighbors') df -All steps in one stack - -.. ipython:: python - - df = (dframe.concat([df[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1) - .set_index(['name', 'opponent']) - .stack() - .reset_index(level=2, drop=True) - .to_frame('nearest_neighbors')) - df Example of exploding a list embedded in a dataframe: .. ipython:: python - dframe = pd.DataFrame({'name': ['A.J. Price'] * 3, + df = pd.DataFrame({'name': ['A.J. Price'] * 3, 'opponent': ['76ers', 'blazers', 'bobcats'], 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, - columns=['name','opponent','nearest_neighbors']) - dframe + columns=['name','opponent','nearest_neighbors']) + df Create an index with the "parent" columns to be included in the final Dataframe .. ipython:: python - df = dframe.set_index(['name', 'opponent']) + df = df.set_index(['name', 'opponent']) df Transform the column with lists into series, which become columns in a new Dataframe. @@ -437,13 +427,3 @@ Note that at this point we have a Series, not a Dataframe df = ser.to_frame('nearest_neighbors') df -All steps in one stack - -.. 
ipython:: python - - df = (dframe.set_index(['name', 'opponent']) - .nearest_neighbors.apply(pd.Series) - .stack() - .reset_index(level=2, drop=True) - .to_frame('nearest_neighbors')) - df