|
17 | 17 | Iterable,
|
18 | 18 | Mapping,
|
19 | 19 | NamedTuple,
|
| 20 | + Sequence, |
20 | 21 | TypeVar,
|
21 | 22 | Union,
|
22 | 23 | cast,
|
|
76 | 77 | _transform_template,
|
77 | 78 | warn_dropping_nuisance_columns_deprecated,
|
78 | 79 | )
|
| 80 | +from pandas.core.groupby.grouper import get_grouper |
79 | 81 | from pandas.core.indexes.api import (
|
80 | 82 | Index,
|
81 | 83 | MultiIndex,
|
@@ -1569,6 +1571,193 @@ def func(df):
|
1569 | 1571 |
|
    # Expose the module-level ``boxplot_frame_groupby`` plotting helper
    # (defined elsewhere in pandas) as this groupby object's ``boxplot`` method.
    boxplot = boxplot_frame_groupby
|
1571 | 1573 |
|
| 1574 | + def value_counts( |
| 1575 | + self, |
| 1576 | + subset: Sequence[Hashable] | None = None, |
| 1577 | + normalize: bool = False, |
| 1578 | + sort: bool = True, |
| 1579 | + ascending: bool = False, |
| 1580 | + dropna: bool = True, |
| 1581 | + ) -> DataFrame | Series: |
| 1582 | + """ |
| 1583 | + Return a Series or DataFrame containing counts of unique rows. |
| 1584 | +
|
| 1585 | + .. versionadded:: 1.4.0 |
| 1586 | +
|
| 1587 | + Parameters |
| 1588 | + ---------- |
| 1589 | + subset : list-like, optional |
| 1590 | + Columns to use when counting unique combinations. |
| 1591 | + normalize : bool, default False |
| 1592 | + Return proportions rather than frequencies. |
| 1593 | + sort : bool, default True |
| 1594 | + Sort by frequencies. |
| 1595 | + ascending : bool, default False |
| 1596 | + Sort in ascending order. |
| 1597 | + dropna : bool, default True |
| 1598 | + Don’t include counts of rows that contain NA values. |
| 1599 | +
|
| 1600 | + Returns |
| 1601 | + ------- |
| 1602 | + Series or DataFrame |
| 1603 | + Series if the groupby as_index is True, otherwise DataFrame. |
| 1604 | +
|
| 1605 | + See Also |
| 1606 | + -------- |
| 1607 | + Series.value_counts: Equivalent method on Series. |
| 1608 | + DataFrame.value_counts: Equivalent method on DataFrame. |
| 1609 | + SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. |
| 1610 | +
|
| 1611 | + Notes |
| 1612 | + ----- |
| 1613 | + - If the groupby as_index is True then the returned Series will have a |
| 1614 | + MultiIndex with one level per input column. |
| 1615 | + - If the groupby as_index is False then the returned DataFrame will have an |
| 1616 | + additional column with the value_counts. The column is labelled 'count' or |
| 1617 | + 'proportion', depending on the ``normalize`` parameter. |
| 1618 | +
|
| 1619 | + By default, rows that contain any NA values are omitted from |
| 1620 | + the result. |
| 1621 | +
|
| 1622 | + By default, the result will be in descending order so that the |
| 1623 | + first element of each group is the most frequently-occurring row. |
| 1624 | +
|
| 1625 | + Examples |
| 1626 | + -------- |
| 1627 | + >>> df = pd.DataFrame({ |
| 1628 | + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], |
| 1629 | + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], |
| 1630 | + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] |
| 1631 | + ... }) |
| 1632 | +
|
| 1633 | + >>> df |
| 1634 | + gender education country |
| 1635 | + 0 male low US |
| 1636 | + 1 male medium FR |
| 1637 | + 2 female high US |
| 1638 | + 3 male low FR |
| 1639 | + 4 female high FR |
| 1640 | + 5 male low FR |
| 1641 | +
|
| 1642 | + >>> df.groupby('gender').value_counts() |
| 1643 | + gender education country |
| 1644 | + female high FR 1 |
| 1645 | + US 1 |
| 1646 | + male low FR 2 |
| 1647 | + US 1 |
| 1648 | + medium FR 1 |
| 1649 | + dtype: int64 |
| 1650 | +
|
| 1651 | + >>> df.groupby('gender').value_counts(ascending=True) |
| 1652 | + gender education country |
| 1653 | + female high FR 1 |
| 1654 | + US 1 |
| 1655 | + male low US 1 |
| 1656 | + medium FR 1 |
| 1657 | + low FR 2 |
| 1658 | + dtype: int64 |
| 1659 | +
|
| 1660 | + >>> df.groupby('gender').value_counts(normalize=True) |
| 1661 | + gender education country |
| 1662 | + female high FR 0.50 |
| 1663 | + US 0.50 |
| 1664 | + male low FR 0.50 |
| 1665 | + US 0.25 |
| 1666 | + medium FR 0.25 |
| 1667 | + dtype: float64 |
| 1668 | +
|
| 1669 | + >>> df.groupby('gender', as_index=False).value_counts() |
| 1670 | + gender education country count |
| 1671 | + 0 female high FR 1 |
| 1672 | + 1 female high US 1 |
| 1673 | + 2 male low FR 2 |
| 1674 | + 3 male low US 1 |
| 1675 | + 4 male medium FR 1 |
| 1676 | +
|
| 1677 | + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) |
| 1678 | + gender education country proportion |
| 1679 | + 0 female high FR 0.50 |
| 1680 | + 1 female high US 0.50 |
| 1681 | + 2 male low FR 0.50 |
| 1682 | + 3 male low US 0.25 |
| 1683 | + 4 male medium FR 0.25 |
| 1684 | + """ |
| 1685 | + if self.axis == 1: |
| 1686 | + raise NotImplementedError( |
| 1687 | + "DataFrameGroupBy.value_counts only handles axis=0" |
| 1688 | + ) |
| 1689 | + |
| 1690 | + with self._group_selection_context(): |
| 1691 | + df = self.obj |
| 1692 | + |
| 1693 | + in_axis_names = { |
| 1694 | + grouping.name for grouping in self.grouper.groupings if grouping.in_axis |
| 1695 | + } |
| 1696 | + if isinstance(self._selected_obj, Series): |
| 1697 | + name = self._selected_obj.name |
| 1698 | + keys = [] if name in in_axis_names else [self._selected_obj] |
| 1699 | + else: |
| 1700 | + keys = [ |
| 1701 | + # Can't use .values because the column label needs to be preserved |
| 1702 | + self._selected_obj.iloc[:, idx] |
| 1703 | + for idx, name in enumerate(self._selected_obj.columns) |
| 1704 | + if name not in in_axis_names |
| 1705 | + ] |
| 1706 | + |
| 1707 | + if subset is not None: |
| 1708 | + clashing = set(subset) & set(in_axis_names) |
| 1709 | + if clashing: |
| 1710 | + raise ValueError( |
| 1711 | + f"Keys {clashing} in subset cannot be in " |
| 1712 | + "the groupby column keys" |
| 1713 | + ) |
| 1714 | + |
| 1715 | + groupings = list(self.grouper.groupings) |
| 1716 | + for key in keys: |
| 1717 | + grouper, _, _ = get_grouper( |
| 1718 | + df, |
| 1719 | + key=key, |
| 1720 | + axis=self.axis, |
| 1721 | + sort=self.sort, |
| 1722 | + dropna=dropna, |
| 1723 | + ) |
| 1724 | + groupings += list(grouper.groupings) |
| 1725 | + |
| 1726 | + # Take the size of the overall columns |
| 1727 | + gb = df.groupby( |
| 1728 | + groupings, |
| 1729 | + sort=self.sort, |
| 1730 | + observed=self.observed, |
| 1731 | + dropna=self.dropna, |
| 1732 | + ) |
| 1733 | + result = cast(Series, gb.size()) |
| 1734 | + |
| 1735 | + if normalize: |
| 1736 | + # Normalize the results by dividing by the original group sizes. |
| 1737 | + # We are guaranteed to have the first N levels be the |
| 1738 | + # user-requested grouping. |
| 1739 | + levels = list(range(len(self.grouper.groupings), result.index.nlevels)) |
| 1740 | + indexed_group_size = result.groupby( |
| 1741 | + result.index.droplevel(levels), |
| 1742 | + sort=self.sort, |
| 1743 | + observed=self.observed, |
| 1744 | + dropna=self.dropna, |
| 1745 | + ).transform("sum") |
| 1746 | + |
| 1747 | + result /= indexed_group_size |
| 1748 | + |
| 1749 | + if sort: |
| 1750 | + # Sort the values and then resort by the main grouping |
| 1751 | + index_level = range(len(self.grouper.groupings)) |
| 1752 | + result = result.sort_values(ascending=ascending).sort_index( |
| 1753 | + level=index_level, sort_remaining=False |
| 1754 | + ) |
| 1755 | + |
| 1756 | + if not self.as_index: |
| 1757 | + # Convert to frame |
| 1758 | + result = result.reset_index(name="proportion" if normalize else "count") |
| 1759 | + return result.__finalize__(self.obj, method="value_counts") |
| 1760 | + |
1572 | 1761 |
|
1573 | 1762 | def _wrap_transform_general_frame(
|
1574 | 1763 | obj: DataFrame, group: DataFrame, res: DataFrame | Series
|
|
0 commit comments