@@ -332,3 +332,97 @@ using something similar to the following:
332
332
See `the NumPy documentation on byte order
333
333
<https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
334
334
details.
335
+
336
+
337
+ Alternative to storing lists in Pandas DataFrame Cells
338
+ ------------------------------------------------------
339
+ Storing nested lists or arrays inside a pandas object should be avoided for performance and memory-use reasons. Instead, they should be "exploded" into a flat DataFrame structure in which each list element occupies its own row.
340
+
341
+ Example of exploding nested lists into a DataFrame:
342
+
343
+ .. ipython:: python
344
+
345
+ from collections import OrderedDict
346
+ df = (pd.DataFrame(OrderedDict([(' name' , [' A.J. Price' ]* 3 ),
347
+ (' opponent' , [' 76ers' , ' blazers' , ' bobcats' ]),
348
+ (' attribute x' , [' A' ,' B' ,' C' ])
349
+ ])
350
+ ))
351
+ df
352
+
353
+ nn = [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]]* 3
354
+ nn
355
+
356
+ # Step 1: Combine the "parent" columns with the nested lists, expanded so that each list element gets its own column
357
+ df2 = pd.concat([df[[' name' ,' opponent' ]], pd.DataFrame(nn)], axis = 1 )
358
+ df2
359
+
360
+ # Step 2: Move the "parent" columns into the index so that only the expanded list values remain as columns.
361
+ # Note that only the index from the original df is retained -
362
+ # any other columns in the original df are not part of the new df
363
+ df3 = df2.set_index([' name' , ' opponent' ])
364
+ df3
365
+
366
+ # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
367
+ # Note that at this point we have a Series, not a Dataframe
368
+ ser = df3.stack()
369
+ ser
370
+
371
+ # Step 4: Drop the extraneous index level created by the stack
372
+ ser.reset_index(level = 2 , drop = True , inplace = True )
373
+ ser
374
+
375
+ # Step 5: Create a Dataframe from the Series
376
+ df4 = ser.to_frame(' nearest_neighbors' )
377
+ df4
378
+
379
+ # All steps combined into a single chained expression
380
+ df4 = (df2.set_index([' name' , ' opponent' ])
381
+ .stack()
382
+ .reset_index(level = 2 , drop = True )
383
+ .to_frame(' nearest_neighbors' ))
384
+ df4
385
+
386
+ Example of exploding a column of lists embedded in a DataFrame:
387
+
388
+ .. ipython:: python
389
+
390
+ df = (pd.DataFrame(OrderedDict([(' name' , [' A.J. Price' ]* 3 ),
391
+ (' opponent' , [' 76ers' , ' blazers' , ' bobcats' ]),
392
+ (' attribute x' , [' A' ,' B' ,' C' ]),
393
+ (' nearest_neighbors' , [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]]* 3 )
394
+ ])
395
+ ))
396
+
397
+ df
398
+
399
+ # Step 1: Create an index with the "parent" columns to be included in the final Dataframe
400
+ df2 = df.set_index([' name' , ' opponent' ])
401
+ df2
402
+
403
+ # Step 2: Transform the column with lists into series, which become columns in a new Dataframe.
404
+ # Note that only the index from the original df is retained -
405
+ # any other columns in the original df are not part of the new df
406
+ df3 = df2.nearest_neighbors.apply(pd.Series)
407
+ df3
408
+
409
+ # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
410
+ # Note that at this point we have a Series, not a Dataframe
411
+ ser = df3.stack()
412
+ ser
413
+
414
+ # Step 4: Drop the extraneous index level created by the stack
415
+ ser.reset_index(level = 2 , drop = True , inplace = True )
416
+ ser
417
+
418
+ # Step 5: Create a Dataframe from the Series
419
+ df4 = ser.to_frame(' nearest_neighbors' )
420
+ df4
421
+
422
+ # All steps combined into a single chained expression
423
+ df4 = (df.set_index([' name' , ' opponent' ])
424
+ .nearest_neighbors.apply(pd.Series)
425
+ .stack()
426
+ .reset_index(level = 2 , drop = True )
427
+ .to_frame(' nearest_neighbors' ))
428
+ df4
0 commit comments