Skip to content

Commit 662c102

Browse files
committed
Merge branch 'master' into 33200-groupby-quantile
2 parents 5832ba9 + 0db2286 commit 662c102

File tree

174 files changed

+3963
-2756
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

174 files changed

+3963
-2756
lines changed

asv_bench/benchmarks/arithmetic.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def time_series_op_with_fill_value_no_nas(self):
6767
self.ser.add(self.ser, fill_value=4)
6868

6969

70-
class MixedFrameWithSeriesAxis0:
70+
class MixedFrameWithSeriesAxis:
7171
params = [
7272
[
7373
"eq",
@@ -78,7 +78,7 @@ class MixedFrameWithSeriesAxis0:
7878
"gt",
7979
"add",
8080
"sub",
81-
"div",
81+
"truediv",
8282
"floordiv",
8383
"mul",
8484
"pow",
@@ -87,15 +87,19 @@ class MixedFrameWithSeriesAxis0:
8787
param_names = ["opname"]
8888

8989
def setup(self, opname):
90-
arr = np.arange(10 ** 6).reshape(100, -1)
90+
arr = np.arange(10 ** 6).reshape(1000, -1)
9191
df = DataFrame(arr)
9292
df["C"] = 1.0
9393
self.df = df
9494
self.ser = df[0]
95+
self.row = df.iloc[0]
9596

9697
def time_frame_op_with_series_axis0(self, opname):
9798
getattr(self.df, opname)(self.ser, axis=0)
9899

100+
def time_frame_op_with_series_axis1(self, opname):
101+
getattr(operator, opname)(self.df, self.ser)
102+
99103

100104
class Ops:
101105

asv_bench/benchmarks/groupby.py

+58
Original file line numberDiff line numberDiff line change
@@ -660,4 +660,62 @@ def function(values):
660660
self.grouper.transform(function, engine="cython")
661661

662662

663+
class AggEngine:
664+
def setup(self):
665+
N = 10 ** 3
666+
data = DataFrame(
667+
{0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
668+
columns=[0, 1],
669+
)
670+
self.grouper = data.groupby(0)
671+
672+
def time_series_numba(self):
673+
def function(values, index):
674+
total = 0
675+
for i, value in enumerate(values):
676+
if i % 2:
677+
total += value + 5
678+
else:
679+
total += value * 2
680+
return total
681+
682+
self.grouper[1].agg(function, engine="numba")
683+
684+
def time_series_cython(self):
685+
def function(values):
686+
total = 0
687+
for i, value in enumerate(values):
688+
if i % 2:
689+
total += value + 5
690+
else:
691+
total += value * 2
692+
return total
693+
694+
self.grouper[1].agg(function, engine="cython")
695+
696+
def time_dataframe_numba(self):
697+
def function(values, index):
698+
total = 0
699+
for i, value in enumerate(values):
700+
if i % 2:
701+
total += value + 5
702+
else:
703+
total += value * 2
704+
return total
705+
706+
self.grouper.agg(function, engine="numba")
707+
708+
def time_dataframe_cython(self):
709+
def function(values):
710+
total = 0
711+
for i, value in enumerate(values):
712+
if i % 2:
713+
total += value + 5
714+
else:
715+
total += value * 2
716+
return total
717+
718+
self.grouper.agg(function, engine="cython")
719+
720+
663721
from .pandas_vb_common import setup # noqa: F401 isort:skip

asv_bench/benchmarks/stat_ops.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ class FrameOps:
1111
param_names = ["op", "dtype", "axis"]
1212

1313
def setup(self, op, dtype, axis):
14-
if op == "mad" and dtype == "Int64" and axis == 1:
15-
# GH-33036
14+
if op == "mad" and dtype == "Int64":
15+
# GH-33036, GH#33600
1616
raise NotImplementedError
1717
values = np.random.randn(100000, 4)
1818
if dtype == "Int64":

ci/code_checks.sh

+7-1
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,13 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
150150
# Check for imports from pandas._testing instead of `import pandas._testing as tm`
151151
invgrep -R --include="*.py*" -E "from pandas._testing import" pandas/tests
152152
RET=$(($RET + $?)) ; echo $MSG "DONE"
153-
invgrep -R --include="*.py*" -E "from pandas.util import testing as tm" pandas/tests
153+
invgrep -R --include="*.py*" -E "from pandas import _testing as tm" pandas/tests
154+
RET=$(($RET + $?)) ; echo $MSG "DONE"
155+
156+
# No direct imports from conftest
157+
invgrep -R --include="*.py*" -E "conftest import" pandas/tests
158+
RET=$(($RET + $?)) ; echo $MSG "DONE"
159+
invgrep -R --include="*.py*" -E "import conftest" pandas/tests
154160
RET=$(($RET + $?)) ; echo $MSG "DONE"
155161

156162
MSG='Check for use of exec' ; echo $MSG

doc/source/getting_started/index.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,7 @@ data set, a sliding window of the data or grouped by categories. The latter is a
398398
<div class="card-body">
399399

400400
Change the structure of your data table in multiple ways. You can :func:`~pandas.melt` your data table from wide to long/tidy form or :func:`~pandas.pivot`
401-
from long to wide format. With aggregations built-in, a pivot table is created with a sinlge command.
401+
from long to wide format. With aggregations built-in, a pivot table is created with a single command.
402402

403403
.. image:: ../_static/schemas/07_melt.svg
404404
:align: center

doc/source/user_guide/computation.rst

+12-8
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,8 @@ We provide a number of common statistical functions:
318318
:meth:`~Rolling.kurt`, Sample kurtosis (4th moment)
319319
:meth:`~Rolling.quantile`, Sample quantile (value at %)
320320
:meth:`~Rolling.apply`, Generic apply
321-
:meth:`~Rolling.cov`, Unbiased covariance (binary)
322-
:meth:`~Rolling.corr`, Correlation (binary)
321+
:meth:`~Rolling.cov`, Sample covariance (binary)
322+
:meth:`~Rolling.corr`, Sample correlation (binary)
323323

324324
.. _computation.window_variance.caveats:
325325

@@ -341,6 +341,8 @@ We provide a number of common statistical functions:
341341
sample variance under the circumstances would result in a biased estimator
342342
of the variable we are trying to determine.
343343

344+
The same caveats apply when using any of the supported sample statistical methods.
345+
344346
.. _stats.rolling_apply:
345347

346348
Rolling apply
@@ -380,8 +382,8 @@ and their default values are set to ``False``, ``True`` and ``False`` respective
380382
.. note::
381383

382384
In terms of performance, **the first time a function is run using the Numba engine will be slow**
383-
as Numba will have some function compilation overhead. However, ``rolling`` objects will cache
384-
the function and subsequent calls will be fast. In general, the Numba engine is performant with
385+
as Numba will have some function compilation overhead. However, the compiled functions are cached,
386+
and subsequent calls will be fast. In general, the Numba engine is performant with
385387
a larger number of data points (e.g. 1+ million).
386388

387389
.. code-block:: ipython
@@ -870,12 +872,12 @@ Method summary
870872
:meth:`~Expanding.max`, Maximum
871873
:meth:`~Expanding.std`, Sample standard deviation
872874
:meth:`~Expanding.var`, Sample variance
873-
:meth:`~Expanding.skew`, Unbiased skewness (3rd moment)
874-
:meth:`~Expanding.kurt`, Unbiased kurtosis (4th moment)
875+
:meth:`~Expanding.skew`, Sample skewness (3rd moment)
876+
:meth:`~Expanding.kurt`, Sample kurtosis (4th moment)
875877
:meth:`~Expanding.quantile`, Sample quantile (value at %)
876878
:meth:`~Expanding.apply`, Generic apply
877-
:meth:`~Expanding.cov`, Unbiased covariance (binary)
878-
:meth:`~Expanding.corr`, Correlation (binary)
879+
:meth:`~Expanding.cov`, Sample covariance (binary)
880+
:meth:`~Expanding.corr`, Sample correlation (binary)
879881

880882
.. note::
881883

@@ -884,6 +886,8 @@ Method summary
884886
windows. See :ref:`this section <computation.window_variance.caveats>` for more
885887
information.
886888

889+
The same caveats apply when using any of the supported sample statistical methods.
890+
887891
.. currentmodule:: pandas
888892

889893
Aside from not having a ``window`` parameter, these functions have the same

doc/source/user_guide/cookbook.rst

-27
Original file line numberDiff line numberDiff line change
@@ -1333,33 +1333,6 @@ Values can be set to NaT using np.nan, similar to datetime
13331333
y[1] = np.nan
13341334
y
13351335
1336-
Aliasing axis names
1337-
-------------------
1338-
1339-
To globally provide aliases for axis names, one can define these 2 functions:
1340-
1341-
.. ipython:: python
1342-
1343-
def set_axis_alias(cls, axis, alias):
1344-
if axis not in cls._AXIS_NUMBERS:
1345-
raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias))
1346-
cls._AXIS_ALIASES[alias] = axis
1347-
1348-
.. ipython:: python
1349-
1350-
def clear_axis_alias(cls, axis, alias):
1351-
if axis not in cls._AXIS_NUMBERS:
1352-
raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias))
1353-
cls._AXIS_ALIASES.pop(alias, None)
1354-
1355-
.. ipython:: python
1356-
1357-
set_axis_alias(pd.DataFrame, 'columns', 'myaxis2')
1358-
df2 = pd.DataFrame(np.random.randn(3, 2), columns=['c1', 'c2'],
1359-
index=['i1', 'i2', 'i3'])
1360-
df2.sum(axis='myaxis2')
1361-
clear_axis_alias(pd.DataFrame, 'columns', 'myaxis2')
1362-
13631336
Creating example data
13641337
---------------------
13651338

doc/source/user_guide/groupby.rst

+67
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,73 @@ that is itself a series, and possibly upcast the result to a DataFrame:
10211021
the output as well as set the indices.
10221022

10231023

1024+
Numba Accelerated Routines
1025+
--------------------------
1026+
1027+
.. versionadded:: 1.1
1028+
1029+
If `Numba <https://numba.pydata.org/>`__ is installed as an optional dependency, the ``transform`` and
1030+
``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments. The ``engine_kwargs``
1031+
argument is a dictionary of keyword arguments that will be passed into the
1032+
`numba.jit decorator <https://numba.pydata.org/numba-doc/latest/reference/jit-compilation.html#numba.jit>`__.
1033+
These keyword arguments will be applied to the passed function. Currently only ``nogil``, ``nopython``,
1034+
and ``parallel`` are supported, and their default values are set to ``False``, ``True`` and ``False`` respectively.
1035+
1036+
The function signature must start with ``values, index`` **exactly**, because the data belonging to each group
1037+
will be passed into ``values``, and the group index will be passed into ``index``.
1038+
1039+
.. warning::
1040+
1041+
When using ``engine='numba'``, there will be no "fall back" behavior internally. The group
1042+
data and group index will be passed as numpy arrays to the JITed user defined function, and no
1043+
alternative execution attempts will be tried.
1044+
1045+
.. note::
1046+
1047+
In terms of performance, **the first time a function is run using the Numba engine will be slow**
1048+
as Numba will have some function compilation overhead. However, the compiled functions are cached,
1049+
and subsequent calls will be fast. In general, the Numba engine is performant with
1050+
a larger number of data points (e.g. 1+ million).
1051+
1052+
.. code-block:: ipython
1053+
1054+
In [1]: N = 10 ** 3
1055+
1056+
In [2]: data = {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}
1057+
1058+
In [3]: df = pd.DataFrame(data, columns=[0, 1])
1059+
1060+
In [4]: def f_numba(values, index):
1061+
...: total = 0
1062+
...: for i, value in enumerate(values):
1063+
...: if i % 2:
1064+
...: total += value + 5
1065+
...: else:
1066+
...: total += value * 2
1067+
...: return total
1068+
...:
1069+
1070+
In [5]: def f_cython(values):
1071+
...: total = 0
1072+
...: for i, value in enumerate(values):
1073+
...: if i % 2:
1074+
...: total += value + 5
1075+
...: else:
1076+
...: total += value * 2
1077+
...: return total
1078+
...:
1079+
1080+
In [6]: groupby = df.groupby(0)
1081+
# Run the first time, compilation time will affect performance
1082+
In [7]: %timeit -r 1 -n 1 groupby.aggregate(f_numba, engine='numba') # noqa: E225
1083+
2.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1084+
# Function is cached and performance will improve
1085+
In [8]: %timeit groupby.aggregate(f_numba, engine='numba')
1086+
4.93 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1087+
1088+
In [9]: %timeit groupby.aggregate(f_cython, engine='cython')
1089+
18.6 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1090+
10241091
Other useful features
10251092
---------------------
10261093

0 commit comments

Comments
 (0)