@@ -332,3 +332,92 @@ using something similar to the following:
332
332
See `the NumPy documentation on byte order
333
333
<https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
334
334
details.
335
+
336
+
337
+ Alternative to storing lists in DataFrame Cells
338
+ ------------------------------------------------------
339
+ Storing nested lists/arrays inside a pandas object should be avoided for performance and memory use reasons. Instead, they should be "exploded" into a flat ``DataFrame`` structure.
340
+
341
+ Example of exploding nested lists into a DataFrame:
342
+
343
+ .. ipython:: python
344
+
345
+ df = pd.DataFrame({' name' : [' A.J. Price' ] * 3 ,
346
+ ' opponent' : [' 76ers' , ' blazers' , ' bobcats' ],
347
+ ' nearest_neighbors' : [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]] * 3 },
348
+ columns = [' name' ,' opponent' ,' attribute x' ,' nearest_neighbors' ])
349
+ df
350
+
351
+ nearest_neighbors = [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]]* 3
352
+ nearest_neighbors
353
+
354
+ # . Combine the "parent" columns with the nested lists expanded into one column per list element
355
+ df2 = pd.concat([df[[' name' ,' opponent' ]], pd.DataFrame(nearest_neighbors)], axis = 1 )
356
+ df2
357
+
358
+ # . Move the "parent" columns into the index, leaving only the expanded list elements as columns.
359
+ # Note that only the index from the original df is retained -
360
+ # any other columns in the original df are not part of the new df
361
+ df3 = df2.set_index([' name' , ' opponent' ])
362
+ df3
363
+
364
+ # . Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
365
+ # Note that at this point we have a Series, not a Dataframe
366
+ ser = df3.stack()
367
+ ser
368
+
369
+ # . Drop the extraneous index level created by the stack
370
+ ser.reset_index(level = 2 , drop = True , inplace = True )
371
+ ser
372
+
373
+ # . Create a Dataframe from the Series
374
+ df4 = ser.to_frame(' nearest_neighbors' )
375
+ df4
376
+
377
+ # All steps combined into a single chained expression
378
+ df4 = (df2.set_index([' name' , ' opponent' ])
379
+ .stack()
380
+ .reset_index(level = 2 , drop = True )
381
+ .to_frame(' nearest_neighbors' ))
382
+ df4
383
+
384
+ Example of exploding a list embedded in a DataFrame:
385
+
386
+ .. ipython:: python
387
+
388
+ df = pd.DataFrame({' name' : [' A.J. Price' ] * 3 ,
389
+ ' opponent' : [' 76ers' , ' blazers' , ' bobcats' ],
390
+ ' nearest_neighbors' : [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]] * 3 },
391
+ columns = [' name' ,' opponent' ,' attribute x' ,' nearest_neighbors' ])
392
+ df
393
+
394
+ # . Create an index with the "parent" columns to be included in the final Dataframe
395
+ df2 = df.set_index([' name' , ' opponent' ])
396
+ df2
397
+
398
+ # . Transform the column with lists into series, which become columns in a new Dataframe.
399
+ # Note that only the index from the original df is retained -
400
+ # any other columns in the original df are not part of the new df
401
+ df3 = df2.nearest_neighbors.apply(pd.Series)
402
+ df3
403
+
404
+ # . Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
405
+ # Note that at this point we have a Series, not a Dataframe
406
+ ser = df3.stack()
407
+ ser
408
+
409
+ # . Drop the extraneous index level created by the stack
410
+ ser.reset_index(level = 2 , drop = True , inplace = True )
411
+ ser
412
+
413
+ # . Create a Dataframe from the Series
414
+ df4 = ser.to_frame(' nearest_neighbors' )
415
+ df4
416
+
417
+ # All steps combined into a single chained expression
418
+ df4 = (df.set_index([' name' , ' opponent' ])
419
+ .nearest_neighbors.apply(pd.Series)
420
+ .stack()
421
+ .reset_index(level = 2 , drop = True )
422
+ .to_frame(' nearest_neighbors' ))
423
+ df4
0 commit comments