@@ -6441,71 +6441,299 @@ def to_orc(
6441
6441
def stack(self, level=-1, dropna=True):
    """Stack the prescribed level(s) from columns to index

    Return a reshaped DataFrame or Series having a multi-level
    index with one or more new inner-most levels compared to
    the current DataFrame. The new inner-most levels are created
    by pivoting the columns of the current dataframe:

      - if the columns have a single level, the output is a Series;
      - if the columns have multiple levels, the new index
        level(s) is (are) taken from the prescribed level(s) and
        the output is a DataFrame.

    Parameters
    ----------
    level : int, str, or list of int/str, default -1
        Level(s) to stack from the column axis onto the index axis,
        defined as one index or label, or a list of indices or labels.
        ``None`` is accepted as an alias for ``-1`` (the inner-most
        level).
    dropna : bool, default True
        Whether to drop rows in the resulting Frame/Series with missing
        values. When multiple levels are specified, ``dropna=False`` is
        unsupported.

    Returns
    -------
    DataFrame or Series
        Stacked dataframe or series.

    Raises
    ------
    ValueError
        If ``level`` is not an int/str or a list of int/str, or if it
        cannot be resolved either entirely as level names or entirely
        as level positions.
    IndexError
        If a positional level lies outside the range of the column
        levels.
    NotImplementedError
        If more than one level is stacked with ``dropna=False``.

    See Also
    --------
    DataFrame.unstack : Unstack prescribed level(s) from index axis
        onto column axis.
    DataFrame.pivot : Reshape dataframe from long format to wide
        format.
    DataFrame.pivot_table : Create a spreadsheet-style pivot table
        as a DataFrame.

    Notes
    -----
    The function is named by analogy with a collection of books
    being reorganized from being side by side on a horizontal
    position (the columns of the dataframe) to being stacked
    vertically on top of each other (in the index of the
    dataframe).

    Examples
    --------
    **Single level columns**

    >>> import cudf
    >>> df_single_level_cols = cudf.DataFrame([[0, 1], [2, 3]],
    ...                                       index=['cat', 'dog'],
    ...                                       columns=['weight', 'height'])

    Stacking a dataframe with a single level column axis returns a Series:

    >>> df_single_level_cols
         weight  height
    cat       0       1
    dog       2       3
    >>> df_single_level_cols.stack()
    cat  height    1
         weight    0
    dog  height    3
         weight    2
    dtype: int64

    **Multi level columns: simple case**

    >>> import pandas as pd
    >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
    ...                                        ('weight', 'pounds')])
    >>> df_multi_level_cols1 = cudf.DataFrame([[1, 2], [2, 4]],
    ...                                       index=['cat', 'dog'],
    ...                                       columns=multicol1)

    Stacking a dataframe with a multi-level column axis:

    >>> df_multi_level_cols1
        weight
            kg pounds
    cat      1      2
    dog      2      4
    >>> df_multi_level_cols1.stack()
                weight
    cat kg           1
        pounds       2
    dog kg           2
        pounds       4

    **Missing values**

    >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
    ...                                        ('height', 'm')])
    >>> df_multi_level_cols2 = cudf.DataFrame([[1.0, 2.0], [3.0, 4.0]],
    ...                                       index=['cat', 'dog'],
    ...                                       columns=multicol2)

    It is common to have missing values when stacking a dataframe
    with multi-level columns, as the stacked dataframe typically
    has more values than the original dataframe. Missing values
    are filled with NULLs:

    >>> df_multi_level_cols2
        weight height
            kg      m
    cat    1.0    2.0
    dog    3.0    4.0
    >>> df_multi_level_cols2.stack()
           height weight
    cat kg   <NA>    1.0
        m     2.0   <NA>
    dog kg   <NA>    3.0
        m     4.0   <NA>

    **Prescribing the level(s) to be stacked**

    The first parameter controls which level or levels are stacked:

    >>> df_multi_level_cols2.stack(0)
                  kg     m
    cat height  <NA>   2.0
        weight   1.0  <NA>
    dog height  <NA>   4.0
        weight   3.0  <NA>

    >>> df_multi_level_cols2.stack([0, 1])
    cat  height  m     2.0
         weight  kg    1.0
    dog  height  m     4.0
         weight  kg    3.0
    dtype: float64
    """
    # The previous implementation accepted ``level=None`` as "stack the
    # inner-most level"; keep honouring it so existing callers still work.
    if level is None:
        level = -1

    # Normalize `level` to a list of int/str.
    if isinstance(level, (int, str)):
        level = [level]
    elif not (
        isinstance(level, list)
        and all(isinstance(lv, (int, str)) for lv in level)
    ):
        raise ValueError(
            "level must be either an int/str, or a list of int/str."
        )

    if len(level) > 1 and not dropna:
        raise NotImplementedError(
            "When stacking multiple levels, setting `dropna` to False "
            "will generate new column combination that does not exist "
            "in original dataframe. This behavior is unsupported in "
            "cuDF. See pandas deprecation note: "
            "https://github.com/pandas-dev/pandas/issues/53515"
        )

    # Resolve `level` to positional indices into the column levels.
    nlevels = self._data.nlevels
    level_names = self._data.level_names

    level_indices: list[int]
    if all(lv in level_names for lv in level):
        # Every entry matches a column level name: translate to positions.
        level_indices = [level_names.index(lv) for lv in level]
    elif not all(isinstance(lv, int) for lv in level):
        raise ValueError(
            "`level` must either be a list of names or positions, not a "
            "mixture of both."
        )
    else:
        # Must be a list of positions. Reject anything out of range before
        # normalizing: a value below ``-nlevels`` would otherwise remain
        # negative after the shift and be silently interpreted by pandas
        # as a different (valid) level.
        level_indices = []
        for lv in level:
            if not -nlevels <= lv < nlevels:
                raise IndexError(
                    f"Level {lv} is out of range for columns with "
                    f"{nlevels} level(s)."
                )
            level_indices.append(lv + nlevels if lv < 0 else lv)

    # Column levels that are NOT being stacked; they remain on the column
    # axis of the result (if there are any, the result is a DataFrame).
    remaining_level_indices = [
        i for i in range(nlevels) if i not in level_indices
    ]
    has_remaining_levels = bool(remaining_level_indices)

    column_name_idx = self._data.to_pandas_index()
    # Construct new index from the levels specified by `level`
    named_levels = pd.MultiIndex.from_arrays(
        [column_name_idx.get_level_values(lv) for lv in level_indices]
    )

    # Since `level` may only specify a subset of all levels, `unique()` is
    # required to remove duplicates. In pandas, the order of the keys in
    # the specified levels are always sorted.
    unique_named_levels = named_levels.unique().sort_values()

    nrows = self.shape[0]

    # Each index from the original dataframe should repeat by the number
    # of unique values in the named_levels
    repeated_index = self.index.repeat(len(unique_named_levels))

    # Each column name should tile itself by len(df) times
    tiled_index = libcudf.reshape.tile(
        [
            as_column(unique_named_levels.get_level_values(i))
            for i in range(unique_named_levels.nlevels)
        ],
        nrows,
    )

    # Assemble the final index
    new_index_columns = [*repeated_index._columns, *tiled_index]
    index_names = [*self._index.names, *unique_named_levels.names]
    new_index = MultiIndex.from_frame(
        DataFrame._from_data(dict(enumerate(new_index_columns))),
        names=index_names,
    )

    # Map every stacked-level key to the position of its source column;
    # these positions are the input for `interleave_columns`.
    column_idx_df = pd.DataFrame(
        data=range(len(self._data)), index=named_levels
    )

    if has_remaining_levels:
        remaining_level_values = pd.MultiIndex.from_arrays(
            [
                column_name_idx.get_level_values(i)
                for i in remaining_level_indices
            ]
        )
        # When stacking part of the levels, some combinations of keys may
        # not be present in one group but can be present in others.
        # Reindexing with the globally computed `unique_named_levels`
        # assigns -1 to these key combinations, representing an all-null
        # column that is used in the subsequent libcudf call.
        column_indices = [
            grpdf.reindex(unique_named_levels, axis=0, fill_value=-1)
            .sort_index()
            .values
            for _, grpdf in column_idx_df.groupby(by=remaining_level_values)
        ]
    else:
        column_indices = [column_idx_df.sort_index().values]

    # For each of the groups constructed from the remaining levels,
    # invoke `interleave_columns` to stack the values.
    stacked = []
    for column_idx in column_indices:
        # Collect columns based on indices, use None for -1 indices.
        columns = [
            None if i == -1 else self._data.select_by_index(i).columns[0]
            for i in column_idx
        ]

        # Collect datatypes and cast columns as that type
        common_type = np.result_type(
            *(col.dtype for col in columns if col is not None)
        )

        # Build the all-null placeholder lazily and at most once per
        # group, since several missing key combinations can share it.
        all_nulls = functools.cache(
            functools.partial(
                column_empty, nrows, common_type, masked=True
            )
        )

        # homogenize the dtypes of the columns
        homogenized = [
            col.astype(common_type) if col is not None else all_nulls()
            for col in columns
        ]

        stacked.append(libcudf.reshape.interleave_columns(homogenized))

    # Construct the resulting dataframe / series
    if not has_remaining_levels:
        result = Series._from_data(
            data={None: stacked[0]}, index=new_index
        )
    else:
        if remaining_level_values.nlevels == 1:
            remaining_level_values = (
                remaining_level_values.get_level_values(0)
            )
        remaining_level_values = (
            remaining_level_values.unique().sort_values()
        )

        data = ColumnAccessor(
            dict(zip(remaining_level_values, stacked)),
            isinstance(remaining_level_values, pd.MultiIndex),
            remaining_level_values.names,
        )

        result = DataFrame._from_data(data, index=new_index)

    # Only drop rows where every stacked value is missing.
    return result.dropna(how="all") if dropna else result
6511
6739
0 commit comments