From e91444edd1ce8afbc8617ee1cff1d8dde5dfd00e Mon Sep 17 00:00:00 2001
From: pdpark <adad@sbcglobal.net>
Date: Fri, 12 Jan 2018 15:01:03 -0800
Subject: [PATCH 1/2] DOC: Adds example of alternative to storing lists in a
 Dataframe

Restores: #17027
---
 doc/source/gotchas.rst | 89 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst
index bc490877e190d..0e99cebc30abd 100644
--- a/doc/source/gotchas.rst
+++ b/doc/source/gotchas.rst
@@ -332,3 +332,92 @@ using something similar to the following:
 See `the NumPy documentation on byte order
 <https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
 details.
+
+
+Alternative to storing lists in DataFrame Cells
+------------------------------------------------------
+Storing nested lists/arrays inside a pandas object should be avoided for performance and memory-use reasons. Instead, they should be "exploded" into a flat ``DataFrame`` structure.
+
+Example of exploding nested lists into a DataFrame:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'name': ['A.J. Price'] * 3, 
+                      'opponent': ['76ers', 'blazers', 'bobcats'], 
+                      'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3},
+                     columns=['name','opponent','attribute x','nearest_neighbors'])
+   df
+
+   nearest_neighbors = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3
+   nearest_neighbors
+
+   #. Concatenate the "parent" columns with a DataFrame built from the nested lists
+   df2 = pd.concat([df[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1)
+   df2
+
+   #. Move the "parent" columns into the index.
+   #    Only the columns created from the nested lists remain as data columns;
+   #    they are stacked into rows in the next step
+   df3 = df2.set_index(['name', 'opponent'])
+   df3
+
+   #. Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
+   #    Note that at this point we have a Series, not a Dataframe
+   ser = df3.stack()
+   ser
+
+   #. Drop the extraneous index level created by the stack
+   ser.reset_index(level=2, drop=True, inplace=True)
+   ser
+
+   #. Create a Dataframe from the Series
+   df4 = ser.to_frame('nearest_neighbors')
+   df4
+
+   # The same steps, starting from df2, chained into a single expression
+   df4 = (df2.set_index(['name', 'opponent'])
+           .stack()
+           .reset_index(level=2, drop=True)
+           .to_frame('nearest_neighbors'))
+   df4
+
+Example of exploding a list embedded in a dataframe:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'name': ['A.J. Price'] * 3, 
+                      'opponent': ['76ers', 'blazers', 'bobcats'], 
+                      'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3},
+                     columns=['name','opponent','attribute x','nearest_neighbors'])
+   df
+
+   #. Create an index with the "parent" columns to be included in the final Dataframe
+   df2 = df.set_index(['name', 'opponent'])
+   df2
+
+   #. Transform the column with lists into series, which become columns in a new Dataframe.
+   #    Note that only the index from the original df is retained - 
+   #    any other columns in the original df are not part of the new df
+   df3 = df2.nearest_neighbors.apply(pd.Series)
+   df3
+
+   #. Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
+   #    Note that at this point we have a Series, not a Dataframe
+   ser = df3.stack()
+   ser
+
+   #. Drop the extraneous index level created by the stack
+   ser.reset_index(level=2, drop=True, inplace=True)
+   ser
+
+   #. Create a Dataframe from the Series
+   df4 = ser.to_frame('nearest_neighbors')
+   df4
+
+   # All steps chained into a single expression
+   df4 = (df.set_index(['name', 'opponent'])
+           .nearest_neighbors.apply(pd.Series)
+           .stack()
+           .reset_index(level=2, drop=True)
+           .to_frame('nearest_neighbors'))
+   df4

From 11ff8a7d898130de49d8a366501ac90b1408b727 Mon Sep 17 00:00:00 2001
From: pdpark <adad@sbcglobal.net>
Date: Fri, 12 Jan 2018 15:43:10 -0800
Subject: [PATCH 2/2] Doc: Fixes issues with code examples.

---
 doc/source/gotchas.rst | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst
index 0e99cebc30abd..b2854670739f4 100644
--- a/doc/source/gotchas.rst
+++ b/doc/source/gotchas.rst
@@ -343,9 +343,8 @@ Example of exploding nested lists into a DataFrame:
 .. ipython:: python
 
    df = pd.DataFrame({'name': ['A.J. Price'] * 3, 
-                      'opponent': ['76ers', 'blazers', 'bobcats'], 
-                      'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3},
-                     columns=['name','opponent','attribute x','nearest_neighbors'])
+                      'opponent': ['76ers', 'blazers', 'bobcats']},
+                     columns=['name','opponent'])
    df
 
    nearest_neighbors = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3
@@ -388,7 +387,7 @@ Example of exploding a list embedded in a dataframe:
    df = pd.DataFrame({'name': ['A.J. Price'] * 3, 
                       'opponent': ['76ers', 'blazers', 'bobcats'], 
                       'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3},
-                     columns=['name','opponent','attribute x','nearest_neighbors'])
+                     columns=['name','opponent','nearest_neighbors'])
    df
 
    #. Create an index with the "parent" columns to be included in the final Dataframe