@@ -336,3 +336,97 @@ constructors using something similar to the following:
336
336
See `the NumPy documentation on byte order
337
337
<https://docs.scipy.org/doc/numpy/user/basics.byteswapping.html>`__ for more
338
338
details.
339
+
340
+
341
+ Alternative to storing lists in Pandas DataFrame Cells
342
+ ------------------------------------------------------
343
+ Storing nested lists/arrays inside a pandas object should be avoided for performance and memory-use reasons. Instead, they should be "exploded" into a flat ``DataFrame`` structure.
344
+
345
+ Example of exploding nested lists into a DataFrame:
346
+
347
+ .. ipython:: python
348
+
349
+ from collections import OrderedDict
350
+ df = (pd.DataFrame(OrderedDict([(' name' , [' A.J. Price' ]* 3 ),
351
+ (' opponent' , [' 76ers' , ' blazers' , ' bobcats' ]),
352
+ (' attribute x' , [' A' ,' B' ,' C' ])
353
+ ])
354
+ ))
355
+ df
356
+
357
+ nn = [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]]* 3
358
+ nn
359
+
360
+ # Step 1: Combine the "parent" columns to be included in the final DataFrame with the nested lists, expanded into one column per list element
361
+ df2 = pd.concat([df[[' name' ,' opponent' ]], pd.DataFrame(nn)], axis = 1 )
362
+ df2
363
+
364
+ # Step 2: Move the "parent" columns into the index, so that only the expanded list elements remain as columns.
365
+ # Note that only the index from the original df is retained -
366
+ # any other columns in the original df are not part of the new df
367
+ df3 = df2.set_index([' name' , ' opponent' ])
368
+ df3
369
+
370
+ # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
371
+ # Note that at this point we have a Series, not a Dataframe
372
+ ser = df3.stack()
373
+ ser
374
+
375
+ # Step 4: Drop the extraneous index level created by the stack
376
+ ser.reset_index(level = 2 , drop = True , inplace = True )
377
+ ser
378
+
379
+ # Step 5: Create a Dataframe from the Series
380
+ df4 = ser.to_frame(' nearest_neighbors' )
381
+ df4
382
+
383
+ # All steps combined into a single chained expression
384
+ df4 = (df2.set_index([' name' , ' opponent' ])
385
+ .stack()
386
+ .reset_index(level = 2 , drop = True )
387
+ .to_frame(' nearest_neighbors' ))
388
+ df4
389
+
390
+ Example of exploding a column of lists embedded in a DataFrame:
391
+
392
+ .. ipython:: python
393
+
394
+ df = (pd.DataFrame(OrderedDict([(' name' , [' A.J. Price' ]* 3 ),
395
+ (' opponent' , [' 76ers' , ' blazers' , ' bobcats' ]),
396
+ (' attribute x' , [' A' ,' B' ,' C' ]),
397
+ (' nearest_neighbors' , [[' Zach LaVine' , ' Jeremy Lin' , ' Nate Robinson' , ' Isaia' ]]* 3 )
398
+ ])
399
+ ))
400
+
401
+ df
402
+
403
+ # Step 1: Create an index with the "parent" columns to be included in the final Dataframe
404
+ df2 = df.set_index([' name' , ' opponent' ])
405
+ df2
406
+
407
+ # Step 2: Transform the column with lists into series, which become columns in a new Dataframe.
408
+ # Note that only the index from the original df is retained -
409
+ # any other columns in the original df are not part of the new df
410
+ df3 = df2.nearest_neighbors.apply(pd.Series)
411
+ df3
412
+
413
+ # Step 3: Stack the new columns as rows; this creates a new index level we'll want to drop in the next step.
414
+ # Note that at this point we have a Series, not a Dataframe
415
+ ser = df3.stack()
416
+ ser
417
+
418
+ # Step 4: Drop the extraneous index level created by the stack
419
+ ser.reset_index(level = 2 , drop = True , inplace = True )
420
+ ser
421
+
422
+ # Step 5: Create a Dataframe from the Series
423
+ df4 = ser.to_frame(' nearest_neighbors' )
424
+ df4
425
+
426
+ # All steps combined into a single chained expression
427
+ df4 = (df.set_index([' name' , ' opponent' ])
428
+ .nearest_neighbors.apply(pd.Series)
429
+ .stack()
430
+ .reset_index(level = 2 , drop = True )
431
+ .to_frame(' nearest_neighbors' ))
432
+ df4
0 commit comments