diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 79e312ca12833..1da7e38c7bab7 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -336,3 +336,94 @@ constructors using something similar to the following: See `the NumPy documentation on byte order `__ for more details. + + +Alternative to storing lists in DataFrame Cells +----------------------------------------------- +Storing nested lists/arrays inside a pandas object should be avoided for performance and memory use reasons. Instead, they should be "exploded" into a flat ``DataFrame`` structure. + +Example of exploding nested lists into a DataFrame: + +.. ipython:: python + + df = pd.DataFrame({'name': ['A.J. Price'] * 3, + 'opponent': ['76ers', 'blazers', 'bobcats']}, + columns=['name','opponent']) + df + + nearest_neighbors = [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']]*3 + nearest_neighbors + +Expand the nested lists into the columns of a new DataFrame and concatenate them with the "parent" columns to be included in the final DataFrame. + +.. ipython:: python + + df = pd.concat([df[['name','opponent']], pd.DataFrame(nearest_neighbors)], axis=1) + df + +Create an index with the "parent" columns, leaving only the columns that hold the list elements. +Note that only the "parent" columns from the original df are retained (now as the index) - any other columns in the original df are not part of the new df. + +.. ipython:: python + + df = df.set_index(['name', 'opponent']) + df + +Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. +Note that at this point we have a Series, not a DataFrame. + +.. ipython:: python + + ser = df.stack() + ser + + #. Drop the extraneous index level created by the stack + ser.reset_index(level=2, drop=True, inplace=True) + ser + + #. Create a Dataframe from the Series + df = ser.to_frame('nearest_neighbors') + df + + +Example of exploding a list embedded in a DataFrame: + +.. ipython:: python + + df = pd.DataFrame({'name': ['A.J. 
Price'] * 3, + 'opponent': ['76ers', 'blazers', 'bobcats'], + 'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3}, + columns=['name','opponent','nearest_neighbors']) + df + +Create an index with the "parent" columns to be included in the final DataFrame. + +.. ipython:: python + + df = df.set_index(['name', 'opponent']) + df + +Transform the column with lists into Series, which become columns in a new DataFrame. +Note that only the index from the original df is retained - any other columns in the original df are not part of the new df. + +.. ipython:: python + + df = df.nearest_neighbors.apply(pd.Series) + df + +Stack the new columns as rows; this creates a new index level we'll want to drop in the next step. +Note that at this point we have a Series, not a DataFrame. + +.. ipython:: python + + ser = df.stack() + ser + + #. Drop the extraneous index level created by the stack + ser.reset_index(level=2, drop=True, inplace=True) + ser + + #. Create a Dataframe from the Series + df = ser.to_frame('nearest_neighbors') + df +