Adding example of exploding nested lists

pdpark · gfyoung · commit fa9496057325 · 2018-06-26T22:01:57.000-07:00
diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst
@@ -336,3 +336,97 @@ constructors using something similar to the following:
 See `the NumPy documentation on byte order
 <https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
 details.
+
+
+Alternative to storing lists in Pandas DataFrame Cells
+------------------------------------------------------
+Storing nested lists/arrays inside a pandas object should be avoided for performance and memory use reasons. Instead they should be "exploded" into a flat DataFrame structure.
+
+Example of exploding nested lists into a DataFrame:
+
+.. ipython:: python
+
+   from collections import OrderedDict
+   df = (pd.DataFrame(OrderedDict([('name', ['A.J. Price']*3), 
+                                   ('opponent', ['76ers', 'blazers', 'bobcats']), 
+                                   ('attribute x', ['A','B','C'])
+                                  ])
+                     ))
+   df
+
+   nn = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3
+   nn
+
+   # Step 1: Create an index with the "parent" columns to be included in the final Dataframe
+   df2 = pd.concat([df[['name','opponent']], pd.DataFrame(nn)], axis=1)
+   df2
+
+   # Step 2: Transform the column with lists into series, which become columns in a new Dataframe.
+   #    Note that only the index from the original df is retained - 
+   #    any other columns in the original df are not part of the new df
+   df3 = df2.set_index(['name', 'opponent'])
+   df3
+
+   # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
+   #    Note that at this point we have a Series, not a Dataframe
+   ser = df3.stack()
+   ser
+
+   # Step 4: Drop the extraneous index level created by the stack
+   ser.reset_index(level=2, drop=True, inplace=True)
+   ser
+
+   # Step 5: Create a Dataframe from the Series
+   df4 = ser.to_frame('nearest_neighbors')
+   df4
+
+   # All steps in one stack
+   df4 = (df2.set_index(['name', 'opponent'])
+           .stack()
+           .reset_index(level=2, drop=True)
+           .to_frame('nearest_neighbors'))
+   df4
+
+Example of exploding a list embedded in a dataframe:
+
+.. ipython:: python
+
+   df = (pd.DataFrame(OrderedDict([('name', ['A.J. Price']*3), 
+                                   ('opponent', ['76ers', 'blazers', 'bobcats']), 
+                                   ('attribute x', ['A','B','C']),
+                                   ('nearest_neighbors', [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3)
+                                  ])
+                     ))
+   
+   df
+
+   # Step 1: Create an index with the "parent" columns to be included in the final Dataframe
+   df2 = df.set_index(['name', 'opponent'])
+   df2
+
+   # Step 2: Transform the column with lists into series, which become columns in a new Dataframe.
+   #    Note that only the index from the original df is retained - 
+   #    any other columns in the original df are not part of the new df
+   df3 = df2.nearest_neighbors.apply(pd.Series)
+   df3
+
+   # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
+   #    Note that at this point we have a Series, not a Dataframe
+   ser = df3.stack()
+   ser
+
+   # Step 4: Drop the extraneous index level created by the stack
+   ser.reset_index(level=2, drop=True, inplace=True)
+   ser
+
+   # Step 5: Create a Dataframe from the Series
+   df4 = ser.to_frame('nearest_neighbors')
+   df4
+
+   # All steps in one stack
+   df4 = (df.set_index(['name', 'opponent'])
+           .nearest_neighbors.apply(pd.Series)
+           .stack()
+           .reset_index(level=2, drop=True)
+           .to_frame('nearest_neighbors'))
+   df4