From 33a89adff6de821e6de87f48c90efca7693a7ed9 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sat, 3 Oct 2020 13:11:42 +0200 Subject: [PATCH 1/2] blacken groupby.rst --- doc/source/user_guide/groupby.rst | 463 ++++++++++++++++-------------- 1 file changed, 254 insertions(+), 209 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 52342de98de79..9696f14f03b56 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -68,19 +68,23 @@ object (more on what the GroupBy object is later), you may do the following: .. ipython:: python - df = pd.DataFrame([('bird', 'Falconiformes', 389.0), - ('bird', 'Psittaciformes', 24.0), - ('mammal', 'Carnivora', 80.2), - ('mammal', 'Primates', np.nan), - ('mammal', 'Carnivora', 58)], - index=['falcon', 'parrot', 'lion', 'monkey', 'leopard'], - columns=('class', 'order', 'max_speed')) + df = pd.DataFrame( + [ + ("bird", "Falconiformes", 389.0), + ("bird", "Psittaciformes", 24.0), + ("mammal", "Carnivora", 80.2), + ("mammal", "Primates", np.nan), + ("mammal", "Carnivora", 58), + ], + index=["falcon", "parrot", "lion", "monkey", "leopard"], + columns=("class", "order", "max_speed"), + ) df # default is axis=0 - grouped = df.groupby('class') - grouped = df.groupby('order', axis='columns') - grouped = df.groupby(['class', 'order']) + grouped = df.groupby("class") + grouped = df.groupby("order", axis="columns") + grouped = df.groupby(["class", "order"]) The mapping can be specified many different ways: @@ -103,12 +107,14 @@ consider the following ``DataFrame``: .. ipython:: python - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) df On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`. @@ -116,8 +122,8 @@ We could naturally group by either the ``A`` or ``B`` columns, or both: .. ipython:: python - grouped = df.groupby('A') - grouped = df.groupby(['A', 'B']) + grouped = df.groupby("A") + grouped = df.groupby(["A", "B"]) .. versionadded:: 0.24 @@ -126,8 +132,8 @@ but the specified columns .. ipython:: python - df2 = df.set_index(['A', 'B']) - grouped = df2.groupby(level=df2.index.names.difference(['B'])) + df2 = df.set_index(["A", "B"]) + grouped = df2.groupby(level=df2.index.names.difference(["B"])) grouped.sum() These will split the DataFrame on its index (rows). We could also split by the @@ -181,9 +187,9 @@ By default the group keys are sorted during the ``groupby`` operation. You may h .. ipython:: python - df2 = pd.DataFrame({'X': ['B', 'B', 'A', 'A'], 'Y': [1, 2, 3, 4]}) - df2.groupby(['X']).sum() - df2.groupby(['X'], sort=False).sum() + df2 = pd.DataFrame({"X": ["B", "B", "A", "A"], "Y": [1, 2, 3, 4]}) + df2.groupby(["X"]).sum() + df2.groupby(["X"], sort=False).sum() Note that ``groupby`` will preserve the order in which *observations* are sorted *within* each group. @@ -191,10 +197,10 @@ For example, the groups created by ``groupby()`` below are in the order they app .. 
ipython:: python - df3 = pd.DataFrame({'X': ['A', 'B', 'A', 'B'], 'Y': [1, 4, 3, 2]}) - df3.groupby(['X']).get_group('A') + df3 = pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}) + df3.groupby(["X"]).get_group("A") - df3.groupby(['X']).get_group('B') + df3.groupby(["X"]).get_group("B") .. _groupby.dropna: @@ -236,7 +242,7 @@ above example we have: .. ipython:: python - df.groupby('A').groups + df.groupby("A").groups df.groupby(get_letter_type, axis=1).groups Calling the standard Python ``len`` function on the GroupBy object just returns @@ -244,7 +250,7 @@ the length of the ``groups`` dict, so it is largely just a convenience: .. ipython:: python - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) grouped.groups len(grouped) @@ -259,15 +265,14 @@ the length of the ``groups`` dict, so it is largely just a convenience: n = 10 weight = np.random.normal(166, 20, size=n) height = np.random.normal(60, 10, size=n) - time = pd.date_range('1/1/2000', periods=n) - gender = np.random.choice(['male', 'female'], size=n) - df = pd.DataFrame({'height': height, 'weight': weight, - 'gender': gender}, index=time) + time = pd.date_range("1/1/2000", periods=n) + gender = np.random.choice(["male", "female"], size=n) + df = pd.DataFrame({"height": height, "weight": weight, "gender": gender}, index=time) .. ipython:: python df - gb = df.groupby('gender') + gb = df.groupby("gender") .. ipython:: @@ -291,9 +296,11 @@ Let's create a Series with a two-level ``MultiIndex``. .. ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) s = pd.Series(np.random.randn(8), index=index) s @@ -309,7 +316,7 @@ number: .. ipython:: python - s.groupby(level='second').sum() + s.groupby(level="second").sum() The aggregation functions such as ``sum`` will take the level parameter directly. Additionally, the resulting index will be named according to the @@ -317,30 +324,32 @@ chosen level: .. ipython:: python - s.sum(level='second') + s.sum(level="second") Grouping with multiple levels is supported. .. ipython:: python :suppress: - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['doo', 'doo', 'bee', 'bee', 'bop', 'bop', 'bop', 'bop'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["doo", "doo", "bee", "bee", "bop", "bop", "bop", "bop"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = list(zip(*arrays)) - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second', 'third']) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second", "third"]) s = pd.Series(np.random.randn(8), index=index) .. ipython:: python s - s.groupby(level=['first', 'second']).sum() + s.groupby(level=["first", "second"]).sum() Index level names may be supplied as keys. .. ipython:: python - s.groupby(['first', 'second']).sum() + s.groupby(["first", "second"]).sum() More on the ``sum`` function and aggregation later. @@ -352,14 +361,14 @@ objects. .. 
ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] - index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) + index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) - df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3], - 'B': np.arange(8)}, - index=index) + df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 3, 3], "B": np.arange(8)}, index=index) df @@ -368,19 +377,19 @@ the ``A`` column. .. ipython:: python - df.groupby([pd.Grouper(level=1), 'A']).sum() + df.groupby([pd.Grouper(level=1), "A"]).sum() Index levels may also be specified by name. .. ipython:: python - df.groupby([pd.Grouper(level='second'), 'A']).sum() + df.groupby([pd.Grouper(level="second"), "A"]).sum() Index level names may be specified as keys directly to ``groupby``. .. ipython:: python - df.groupby(['second', 'A']).sum() + df.groupby(["second", "A"]).sum() DataFrame column selection in GroupBy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -392,24 +401,26 @@ getting a column from a DataFrame, you can do: .. ipython:: python :suppress: - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) .. ipython:: python - grouped = df.groupby(['A']) - grouped_C = grouped['C'] - grouped_D = grouped['D'] + grouped = df.groupby(["A"]) + grouped_C = grouped["C"] + grouped_D = grouped["D"] This is mainly syntactic sugar for the alternative and much more verbose: .. ipython:: python - df['C'].groupby(df['A']) + df["C"].groupby(df["A"]) Additionally this method avoids recomputing the internal grouping information derived from the passed key. @@ -450,13 +461,13 @@ A single group can be selected using .. ipython:: python - grouped.get_group('bar') + grouped.get_group("bar") Or for an object grouped on multiple columns: .. ipython:: python - df.groupby(['A', 'B']).get_group(('bar', 'one')) + df.groupby(["A", "B"]).get_group(("bar", "one")) .. _groupby.aggregate: @@ -474,10 +485,10 @@ An obvious one is aggregation via the .. ipython:: python - grouped = df.groupby('A') + grouped = df.groupby("A") grouped.aggregate(np.sum) - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) grouped.aggregate(np.sum) As you can see, the result of the aggregation will have the group names as the @@ -487,17 +498,17 @@ changed by using the ``as_index`` option: .. ipython:: python - grouped = df.groupby(['A', 'B'], as_index=False) + grouped = df.groupby(["A", "B"], as_index=False) grouped.aggregate(np.sum) - df.groupby('A', as_index=False).sum() + df.groupby("A", as_index=False).sum() Note that you could use the ``reset_index`` DataFrame function to achieve the same result as the column names are stored in the resulting ``MultiIndex``: .. ipython:: python - df.groupby(['A', 'B']).sum().reset_index() + df.groupby(["A", "B"]).sum().reset_index() Another simple aggregation example is to compute the size of each group. This is included in GroupBy as the ``size`` method. 
It returns a Series whose @@ -559,8 +570,8 @@ aggregation with, outputting a DataFrame: .. ipython:: python - grouped = df.groupby('A') - grouped['C'].agg([np.sum, np.mean, np.std]) + grouped = df.groupby("A") + grouped["C"].agg([np.sum, np.mean, np.std]) On a grouped ``DataFrame``, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: @@ -575,19 +586,21 @@ need to rename, then you can add in a chained operation for a ``Series`` like th .. ipython:: python - (grouped['C'].agg([np.sum, np.mean, np.std]) - .rename(columns={'sum': 'foo', - 'mean': 'bar', - 'std': 'baz'})) + ( + grouped["C"] + .agg([np.sum, np.mean, np.std]) + .rename(columns={"sum": "foo", "mean": "bar", "std": "baz"}) + ) For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python - (grouped.agg([np.sum, np.mean, np.std]) - .rename(columns={'sum': 'foo', - 'mean': 'bar', - 'std': 'baz'})) + ( + grouped.agg([np.sum, np.mean, np.std]).rename( + columns={"sum": "foo", "mean": "bar", "std": "baz"} + ) + ) .. note:: @@ -598,7 +611,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python :okexcept: - grouped['C'].agg(['sum', 'sum']) + grouped["C"].agg(["sum", "sum"]) Pandas *does* allow you to provide multiple lambdas. In this case, pandas @@ -607,8 +620,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python - grouped['C'].agg([lambda x: x.max() - x.min(), - lambda x: x.median() - x.mean()]) + grouped["C"].agg([lambda x: x.max() - x.min(), lambda x: x.median() - x.mean()]) @@ -631,15 +643,19 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", .. ipython:: python - animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], - 'height': [9.1, 6.0, 9.5, 34.0], - 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) animals animals.groupby("kind").agg( - min_height=pd.NamedAgg(column='height', aggfunc='min'), - max_height=pd.NamedAgg(column='height', aggfunc='max'), - average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), + min_height=pd.NamedAgg(column="height", aggfunc="min"), + max_height=pd.NamedAgg(column="height", aggfunc="max"), + average_weight=pd.NamedAgg(column="weight", aggfunc=np.mean), ) @@ -648,9 +664,9 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", .. ipython:: python animals.groupby("kind").agg( - min_height=('height', 'min'), - max_height=('height', 'max'), - average_weight=('weight', np.mean), + min_height=("height", "min"), + max_height=("height", "max"), + average_weight=("weight", np.mean), ) @@ -659,9 +675,11 @@ and unpack the keyword arguments .. ipython:: python - animals.groupby("kind").agg(**{ - 'total weight': pd.NamedAgg(column='weight', aggfunc=sum), - }) + animals.groupby("kind").agg( + **{ + "total weight": pd.NamedAgg(column="weight", aggfunc=sum), + } + ) Additional keyword arguments are not passed through to the aggregation functions. Only pairs of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation functions @@ -680,8 +698,8 @@ no column selection, so the values are just the functions. .. ipython:: python animals.groupby("kind").height.agg( - min_height='min', - max_height='max', + min_height="min", + max_height="max", ) Applying different functions to DataFrame columns @@ -692,8 +710,7 @@ columns of a DataFrame: .. 
ipython:: python - grouped.agg({'C': np.sum, - 'D': lambda x: np.std(x, ddof=1)}) + grouped.agg({"C": np.sum, "D": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be either implemented on GroupBy or available via :ref:`dispatching @@ -701,7 +718,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching .. ipython:: python - grouped.agg({'C': 'sum', 'D': 'std'}) + grouped.agg({"C": "sum", "D": "std"}) .. _groupby.aggregate.cython: @@ -713,8 +730,8 @@ optimized Cython implementations: .. ipython:: python - df.groupby('A').sum() - df.groupby(['A', 'B']).mean() + df.groupby("A").sum() + df.groupby(["A", "B"]).mean() Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above code would work even without the special versions via dispatching (see below). @@ -743,15 +760,14 @@ For example, suppose we wished to standardize the data within each group: .. ipython:: python - index = pd.date_range('10/1/1999', periods=1100) + index = pd.date_range("10/1/1999", periods=1100) ts = pd.Series(np.random.normal(0.5, 2, 1100), index) ts = ts.rolling(window=100, min_periods=100).mean().dropna() ts.head() ts.tail() - transformed = (ts.groupby(lambda x: x.year) - .transform(lambda x: (x - x.mean()) / x.std())) + transformed = ts.groupby(lambda x: x.year).transform(lambda x: (x - x.mean()) / x.std()) We would expect the result to now have mean 0 and standard deviation 1 within each group, which we can easily check: @@ -772,7 +788,7 @@ We can also visually compare the original and transformed data sets. .. ipython:: python - compare = pd.DataFrame({'Original': ts, 'Transformed': transformed}) + compare = pd.DataFrame({"Original": ts, "Transformed": transformed}) @savefig groupby_transform_plot.png compare.plot() @@ -788,8 +804,8 @@ Alternatively, the built-in methods could be used to produce the same outputs. .. ipython:: python - max = ts.groupby(lambda x: x.year).transform('max') - min = ts.groupby(lambda x: x.year).transform('min') + max = ts.groupby(lambda x: x.year).transform("max") + min = ts.groupby(lambda x: x.year).transform("min") max - min @@ -798,7 +814,7 @@ Another common data transform is to replace missing data with the group mean. .. ipython:: python :suppress: - cols = ['A', 'B', 'C'] + cols = ["A", "B", "C"] values = np.random.randn(1000, 3) values[np.random.randint(0, 1000, 100), 0] = np.nan values[np.random.randint(0, 1000, 50), 1] = np.nan @@ -809,7 +825,7 @@ Another common data transform is to replace missing data with the group mean. data_df - countries = np.array(['US', 'UK', 'GR', 'JP']) + countries = np.array(["US", "UK", "GR", "JP"]) key = countries[np.random.randint(0, 4, 1000)] grouped = data_df.groupby(key) @@ -859,11 +875,10 @@ the column B based on the groups of column A. .. ipython:: python - df_re = pd.DataFrame({'A': [1] * 10 + [5] * 10, - 'B': np.arange(20)}) + df_re = pd.DataFrame({"A": [1] * 10 + [5] * 10, "B": np.arange(20)}) df_re - df_re.groupby('A').rolling(4).B.mean() + df_re.groupby("A").rolling(4).B.mean() The ``expanding()`` method will accumulate a given operation @@ -872,7 +887,7 @@ group. .. ipython:: python - df_re.groupby('A').expanding().sum() + df_re.groupby("A").expanding().sum() Suppose you want to use the ``resample()`` method to get a daily @@ -881,13 +896,16 @@ missing values with the ``ffill()`` method. .. 
ipython:: python - df_re = pd.DataFrame({'date': pd.date_range(start='2016-01-01', periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df_re = pd.DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") df_re - df_re.groupby('group').resample('1D').ffill() + df_re.groupby("group").resample("1D").ffill() .. _groupby.filter: @@ -911,8 +929,8 @@ with only a couple members. .. ipython:: python - dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) - dff.groupby('B').filter(lambda x: len(x) > 2) + dff = pd.DataFrame({"A": np.arange(8), "B": list("aabbbbcc")}) + dff.groupby("B").filter(lambda x: len(x) > 2) Alternatively, instead of dropping the offending groups, we can return a like-indexed object where the groups that do not pass the filter are filled with NaNs. .. ipython:: python - dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + dff.groupby("B").filter(lambda x: len(x) > 2, dropna=False) For DataFrames with multiple columns, filters should explicitly specify a column as the filter criterion. .. ipython:: python - dff['C'] = np.arange(8) - dff.groupby('B').filter(lambda x: len(x['C']) > 2) + dff["C"] = np.arange(8) + dff.groupby("B").filter(lambda x: len(x["C"]) > 2) .. note:: @@ -939,7 +957,7 @@ For DataFrames with multiple columns, filters should explicitly specify a column .. ipython:: python - dff.groupby('B').head(2) + dff.groupby("B").head(2) .. _groupby.dispatch: @@ -953,7 +971,7 @@ functions: .. ipython:: python - grouped = df.groupby('A') + grouped = df.groupby("A") grouped.agg(lambda x: x.std()) But, it's rather verbose and can be untidy if you need to pass additional @@ -973,12 +991,14 @@ next). This enables some operations to be carried out rather succinctly: .. ipython:: python - tsdf = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C']) + tsdf = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C"], + ) tsdf.iloc[::2] = np.nan grouped = tsdf.groupby(lambda x: x.year) - grouped.fillna(method='pad') + grouped.fillna(method="pad") In this example, we chopped the collection of time series into yearly chunks then independently called :ref:`fillna <missing_data.fillna>` on the @@ -989,7 +1009,7 @@ The ``nlargest`` and ``nsmallest`` methods work on ``Series`` style groupbys: .. ipython:: python s = pd.Series([9, 8, 7, 5, 19, 1, 4.2, 3.3]) - g = pd.Series(list('abababab')) + g = pd.Series(list("abababab")) gb = s.groupby(g) gb.nlargest(3) gb.nsmallest(3) @@ -1008,10 +1028,10 @@ for both ``aggregate`` and ``transform`` in many standard use cases. However, .. ipython:: python df - grouped = df.groupby('A') + grouped = df.groupby("A") # could also just call .describe() - grouped['C'].apply(lambda x: x.describe()) + grouped["C"].apply(lambda x: x.describe()) The dimension of the returned result can also change: .. ipython:: python @@ -1032,7 +1052,8 @@ that is itself a series, and possibly upcast the result to a DataFrame: .. ipython:: python def f(x): - return pd.Series([x, x ** 2], index=['x', 'x^2']) + return pd.Series([x, x ** 2], index=["x", "x^2"]) + s = pd.Series(np.random.rand(5)) s @@ -1133,7 +1154,7 @@ will be (silently) dropped. Thus, this does not pose any problems: .. 
ipython:: python - df.groupby('A').std() + df.groupby("A").std() Note that ``df.groupby('A').colname.std()`` is more efficient than ``df.groupby('A').std().colname``, so if the result of an aggregation function is only interesting over one column (here ``colname``), it may be filtered @@ -1151,23 +1172,29 @@ is only interesting over one column (here ``colname``), it may be filtered .. ipython:: python from decimal import Decimal + df_dec = pd.DataFrame( - {'id': [1, 2, 1, 2], - 'int_column': [1, 2, 3, 4], - 'dec_column': [Decimal('0.50'), Decimal('0.15'), - Decimal('0.25'), Decimal('0.40')] - } + { + "id": [1, 2, 1, 2], + "int_column": [1, 2, 3, 4], + "dec_column": [ + Decimal("0.50"), + Decimal("0.15"), + Decimal("0.25"), + Decimal("0.40"), + ], + } ) # Decimal columns can be sum'd explicitly by themselves... - df_dec.groupby(['id'])[['dec_column']].sum() + df_dec.groupby(["id"])[["dec_column"]].sum() # ...but cannot be combined with standard data types or they will be excluded - df_dec.groupby(['id'])[['int_column', 'dec_column']].sum() + df_dec.groupby(["id"])[["int_column", "dec_column"]].sum() # Use .agg function to aggregate over standard and "nuisance" data types # at the same time - df_dec.groupby(['id']).agg({'int_column': 'sum', 'dec_column': 'sum'}) + df_dec.groupby(["id"]).agg({"int_column": "sum", "dec_column": "sum"}) .. _groupby.observed: @@ -1182,25 +1209,27 @@ Show all values: .. ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=False).count() + pd.Series([1, 1, 1]).groupby( + pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=False + ).count() Show only the observed values: .. ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=True).count() + pd.Series([1, 1, 1]).groupby( + pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=True + ).count() The returned dtype of the grouped will *always* include *all* of the categories that were grouped. .. ipython:: python - s = pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=False).count() + s = ( + pd.Series([1, 1, 1]) + .groupby(pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=False) + .count() + ) s.index.dtype .. _groupby.missing: @@ -1224,7 +1253,7 @@ can be used as group keys. If so, the order of the levels will be preserved: data = pd.Series(np.random.randn(100)) - factor = pd.qcut(data, [0, .25, .5, .75, 1.]) + factor = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.0]) data.groupby(factor).mean() @@ -1240,19 +1269,23 @@ use the ``pd.Grouper`` to provide this local control. 
import datetime - df = pd.DataFrame({'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [ - datetime.datetime(2013, 1, 1, 13, 0), - datetime.datetime(2013, 1, 1, 13, 5), - datetime.datetime(2013, 10, 1, 20, 0), - datetime.datetime(2013, 10, 2, 10, 0), - datetime.datetime(2013, 10, 1, 20, 0), - datetime.datetime(2013, 10, 2, 10, 0), - datetime.datetime(2013, 12, 2, 12, 0), - datetime.datetime(2013, 12, 2, 14, 0)] - }) + df = pd.DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime.datetime(2013, 1, 1, 13, 0), + datetime.datetime(2013, 1, 1, 13, 5), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 12, 2, 12, 0), + datetime.datetime(2013, 12, 2, 14, 0), + ], + } + ) df @@ -1260,18 +1293,18 @@ Groupby a specific column with the desired frequency. This is like resampling. .. ipython:: python - df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer']).sum() + df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() You have an ambiguous specification in that you have a named index and a column that could be potential groupers. .. ipython:: python - df = df.set_index('Date') - df['Date'] = df.index + pd.offsets.MonthEnd(2) - df.groupby([pd.Grouper(freq='6M', key='Date'), 'Buyer']).sum() + df = df.set_index("Date") + df["Date"] = df.index + pd.offsets.MonthEnd(2) + df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"]).sum() - df.groupby([pd.Grouper(freq='6M', level='Date'), 'Buyer']).sum() + df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"]).sum() Taking the first rows of each group @@ -1281,10 +1314,10 @@ Just like for a DataFrame or Series you can call head and tail on a groupby: .. ipython:: python - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) df - g = df.groupby('A') + g = df.groupby("A") g.head(1) g.tail(1) @@ -1302,8 +1335,8 @@ will return a single row (or no row) per group if you pass an int for n: .. ipython:: python - df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') + df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") g.nth(0) g.nth(-1) @@ -1314,21 +1347,21 @@ If you want to select the nth not-null item, use the ``dropna`` kwarg. For a Dat .. ipython:: python # nth(0) is the same as g.first() - g.nth(0, dropna='any') + g.nth(0, dropna="any") g.first() # nth(-1) is the same as g.last() - g.nth(-1, dropna='any') # NaNs denote group exhausted when using dropna + g.nth(-1, dropna="any") # NaNs denote group exhausted when using dropna g.last() - g.B.nth(0, dropna='all') + g.B.nth(0, dropna="all") As with other methods, passing ``as_index=False``, will achieve a filtration, which returns the grouped row. .. ipython:: python - df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A', as_index=False) + df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A", as_index=False) g.nth(0) g.nth(-1) @@ -1337,8 +1370,8 @@ You can also select multiple rows from each group by specifying multiple nth val .. 
ipython:: python - business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B') - df = pd.DataFrame(1, index=business_dates, columns=['a', 'b']) + business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") + df = pd.DataFrame(1, index=business_dates, columns=["a", "b"]) # get the first, 4th, and last date index for each month df.groupby([df.index.year, df.index.month]).nth([0, 3, -1]) @@ -1350,12 +1383,12 @@ To see the order in which each row appears within its group, use the .. ipython:: python - dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg = pd.DataFrame(list("aaabba"), columns=["A"]) dfg - dfg.groupby('A').cumcount() + dfg.groupby("A").cumcount() - dfg.groupby('A').cumcount(ascending=False) + dfg.groupby("A").cumcount(ascending=False) .. _groupby.ngroup: @@ -1374,12 +1407,12 @@ order they are first observed. .. ipython:: python - dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg = pd.DataFrame(list("aaabba"), columns=["A"]) dfg - dfg.groupby('A').ngroup() + dfg.groupby("A").ngroup() - dfg.groupby('A').ngroup(ascending=False) + dfg.groupby("A").ngroup(ascending=False) Plotting ~~~~~~~~ @@ -1392,8 +1425,8 @@ the values in column 1 where the group is "B" are 3 higher on average. np.random.seed(1234) df = pd.DataFrame(np.random.randn(50, 2)) - df['g'] = np.random.choice(['A', 'B'], size=50) - df.loc[df['g'] == 'B', 1] += 3 + df["g"] = np.random.choice(["A", "B"], size=50) + df.loc[df["g"] == "B", 1] += 3 We can easily visualize this with a boxplot: @@ -1401,7 +1434,7 @@ We can easily visualize this with a boxplot: :okwarning: @savefig groupby_boxplot.png - df.groupby('g').boxplot() + df.groupby("g").boxplot() The result of calling ``boxplot`` is a dictionary whose keys are the values of our grouping column ``g`` ("A" and "B"). The values of the resulting dictionary @@ -1436,20 +1469,26 @@ code more readable. First we set the data: .. ipython:: python n = 1000 - df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), - 'Product': np.random.choice(['Product_1', - 'Product_2'], n), - 'Revenue': (np.random.random(n) * 50 + 10).round(2), - 'Quantity': np.random.randint(1, 10, size=n)}) + df = pd.DataFrame( + { + "Store": np.random.choice(["Store_1", "Store_2"], n), + "Product": np.random.choice(["Product_1", "Product_2"], n), + "Revenue": (np.random.random(n) * 50 + 10).round(2), + "Quantity": np.random.randint(1, 10, size=n), + } + ) df.head(2) Now, to find prices per store/product, we can simply do: .. ipython:: python - (df.groupby(['Store', 'Product']) - .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) - .unstack().round(2)) + ( + df.groupby(["Store", "Product"]) + .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) + .unstack() + .round(2) + ) Piping can also be expressive when you want to deliver a grouped object to some arbitrary function, for example: @@ -1459,7 +1498,8 @@ arbitrary function, for example: def mean(groupby): return groupby.mean() - df.groupby(['Store', 'Product']).pipe(mean) + + df.groupby(["Store", "Product"]).pipe(mean) where ``mean`` takes a GroupBy object and finds the mean of the Revenue and Quantity columns respectively for each Store-Product combination. The ``mean`` function can @@ -1476,8 +1516,7 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on .. 
ipython:: python - df = pd.DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], - 'c': [1, 0, 0], 'd': [2, 3, 4]}) + df = pd.DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "c": [1, 0, 0], "d": [2, 3, 4]}) df df.groupby(df.sum(), axis=1).sum() @@ -1536,16 +1575,22 @@ column index name will be used as the name of the inserted column: .. ipython:: python - df = pd.DataFrame({'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], - 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], - 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], - 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]}) + df = pd.DataFrame( + { + "a": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], + "b": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], + "c": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + "d": [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], + } + ) + def compute_metrics(x): - result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()} - return pd.Series(result, name='metrics') + result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} + return pd.Series(result, name="metrics") + - result = df.groupby('a').apply(compute_metrics) + result = df.groupby("a").apply(compute_metrics) result From bcbac2a3d1adb95afc8cbb961baf29d26a0c5289 Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Sat, 3 Oct 2020 19:35:01 +0200 Subject: [PATCH 2/2] Update doc/source/user_guide/groupby.rst Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> --- doc/source/user_guide/groupby.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 9696f14f03b56..ec64442319a84 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -677,7 +677,7 @@ and unpack the keyword arguments animals.groupby("kind").agg( **{ - "total weight": pd.NamedAgg(column="weight", aggfunc=sum), + "total weight": pd.NamedAgg(column="weight", aggfunc=sum) } )