Skip to content

Commit f9b71b0

Browse files
authored
Merge branch 'pandas-dev:master' into master
2 parents d95e084 + d60e687 commit f9b71b0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+1410
-847
lines changed

.github/workflows/ci.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323

2424
concurrency:
2525
group: ${{ github.ref }}-checks
26-
cancel-in-progress: true
26+
cancel-in-progress: ${{github.event_name == 'pull_request'}}
2727

2828
steps:
2929
- name: Checkout

.github/workflows/database.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131

3232
concurrency:
3333
group: ${{ github.ref }}-${{ matrix.ENV_FILE }}
34-
cancel-in-progress: true
34+
cancel-in-progress: ${{github.event_name == 'pull_request'}}
3535

3636
services:
3737
mysql:

.github/workflows/posix.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545
TEST_ARGS: ${{ matrix.settings[6] }}
4646
concurrency:
4747
group: ${{ github.ref }}-${{ matrix.settings[0] }}
48-
cancel-in-progress: true
48+
cancel-in-progress: ${{github.event_name == 'pull_request'}}
4949

5050
steps:
5151
- name: Checkout

.github/workflows/pre-commit.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
runs-on: ubuntu-latest
1212
concurrency:
1313
group: ${{ github.ref }}-pre-commit
14-
cancel-in-progress: true
14+
cancel-in-progress: ${{github.event_name == 'pull_request'}}
1515
steps:
1616
- uses: actions/checkout@v2
1717
- uses: actions/setup-python@v2

.github/workflows/python-dev.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ jobs:
2525

2626
concurrency:
2727
group: ${{ github.ref }}-dev
28-
cancel-in-progress: true
28+
cancel-in-progress: ${{github.event_name == 'pull_request'}}
2929

3030
steps:
3131
- uses: actions/checkout@v2

.github/workflows/sdist.yml

+3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ jobs:
2323
fail-fast: false
2424
matrix:
2525
python-version: ["3.8", "3.9"]
26+
concurrency:
27+
group: ${{github.ref}}-${{matrix.python-version}}-sdist
28+
cancel-in-progress: ${{github.event_name == 'pull_request'}}
2629

2730
steps:
2831
- uses: actions/checkout@v2

.pre-commit-config.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,11 @@ repos:
5353
types: [text]
5454
args: [--append-config=flake8/cython-template.cfg]
5555
- repo: https://github.com/PyCQA/isort
56-
rev: 5.9.1
56+
rev: 5.9.2
5757
hooks:
5858
- id: isort
5959
- repo: https://github.com/asottile/pyupgrade
60-
rev: v2.20.0
60+
rev: v2.21.0
6161
hooks:
6262
- id: pyupgrade
6363
args: [--py38-plus]

asv_bench/benchmarks/frame_methods.py

+16
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,22 @@ def time_to_html_mixed(self):
232232
self.df2.to_html()
233233

234234

235+
class ToDict:
236+
params = [["dict", "list", "series", "split", "records", "index"]]
237+
param_names = ["orient"]
238+
239+
def setup(self, orient):
240+
data = np.random.randint(0, 1000, size=(10000, 4))
241+
self.int_df = DataFrame(data)
242+
self.datetimelike_df = self.int_df.astype("timedelta64[ns]")
243+
244+
def time_to_dict_ints(self, orient):
245+
self.int_df.to_dict(orient=orient)
246+
247+
def time_to_dict_datetimelike(self, orient):
248+
self.datetimelike_df.to_dict(orient=orient)
249+
250+
235251
class ToNumpy:
236252
def setup(self):
237253
N = 10000

asv_bench/benchmarks/io/csv.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,8 @@ class ReadCSVFloatPrecision(StringIORewind):
291291

292292
def setup(self, sep, decimal, float_precision):
293293
floats = [
294-
"".join(random.choice(string.digits) for _ in range(28)) for _ in range(15)
294+
"".join([random.choice(string.digits) for _ in range(28)])
295+
for _ in range(15)
295296
]
296297
rows = sep.join([f"0{decimal}" + "{}"] * 3) + "\n"
297298
data = rows * 5
@@ -395,7 +396,7 @@ class ReadCSVCachedParseDates(StringIORewind):
395396
param_names = ["do_cache", "engine"]
396397

397398
def setup(self, do_cache, engine):
398-
data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
399+
data = ("\n".join([f"10/{year}" for year in range(2000, 2100)]) + "\n") * 10
399400
self.StringIO_input = StringIO(data)
400401

401402
def time_read_csv_cached(self, do_cache, engine):

doc/source/development/contributing_environment.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ These packages will automatically be installed by using the ``pandas``
7272

7373
**Windows**
7474

75-
You will need `Build Tools for Visual Studio 2017
75+
You will need `Build Tools for Visual Studio 2019
7676
<https://visualstudio.microsoft.com/downloads/>`_.
7777

7878
.. warning::

doc/source/getting_started/tutorials.rst

+13
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,19 @@ entails.
1818
For the table of contents, see the `pandas-cookbook GitHub
1919
repository <https://github.com/jvns/pandas-cookbook>`_.
2020

21+
pandas workshop by Stefanie Molin
22+
---------------------------------
23+
24+
An introductory workshop by `Stefanie Molin <https://github.com/stefmolin>`_
25+
designed to quickly get you up to speed with pandas using real-world datasets.
26+
It covers getting started with pandas, data wrangling, and data visualization
27+
(with some exposure to matplotlib and seaborn). The
28+
`pandas-workshop GitHub repository <https://github.com/stefmolin/pandas-workshop>`_
29+
features detailed environment setup instructions (including a Binder environment),
30+
slides and notebooks for following along, and exercises to practice the concepts.
31+
There is also a lab with new exercises on a dataset not covered in the workshop for
32+
additional practice.
33+
2134
Learn pandas by Hernan Rojas
2235
----------------------------
2336

doc/source/user_guide/enhancingperf.rst

+59-38
Original file line numberDiff line numberDiff line change
@@ -302,28 +302,63 @@ For more about ``boundscheck`` and ``wraparound``, see the Cython docs on
302302

303303
.. _enhancingperf.numba:
304304

305-
Using Numba
306-
-----------
305+
Numba (JIT compilation)
306+
-----------------------
307307

308-
A recent alternative to statically compiling Cython code, is to use a *dynamic jit-compiler*, Numba.
308+
An alternative to statically compiling Cython code is to use a dynamic just-in-time (JIT) compiler with `Numba <https://numba.pydata.org/>`__.
309309

310-
Numba gives you the power to speed up your applications with high performance functions written directly in Python. With a few annotations, array-oriented and math-heavy Python code can be just-in-time compiled to native machine instructions, similar in performance to C, C++ and Fortran, without having to switch languages or Python interpreters.
310+
Numba allows you to write a pure Python function which can be JIT compiled to native machine instructions, similar in performance to C, C++ and Fortran,
311+
by decorating your function with ``@jit``.
311312

312-
Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool). Numba supports compilation of Python to run on either CPU or GPU hardware, and is designed to integrate with the Python scientific software stack.
313+
Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool).
314+
Numba supports compilation of Python to run on either CPU or GPU hardware and is designed to integrate with the Python scientific software stack.
313315

314316
.. note::
315317

316-
You will need to install Numba. This is easy with ``conda``, by using: ``conda install numba``, see :ref:`installing using miniconda<install.miniconda>`.
318+
The ``@jit`` compilation will add overhead to the runtime of the function, so performance benefits may not be realized, especially when using small data sets.
319+
Consider `caching <https://numba.readthedocs.io/en/stable/developer/caching.html>`__ your function to avoid compilation overhead each time your function is run.
317320

318-
.. note::
321+
Numba can be used in 2 ways with pandas:
322+
323+
#. Specify the ``engine="numba"`` keyword in select pandas methods
324+
#. Define your own Python function decorated with ``@jit`` and pass the underlying NumPy array of :class:`Series` or :class:`DataFrame` (using ``to_numpy()``) into the function
325+
326+
pandas Numba Engine
327+
~~~~~~~~~~~~~~~~~~~
328+
329+
If Numba is installed, one can specify ``engine="numba"`` in select pandas methods to execute the method using Numba.
330+
Methods that support ``engine="numba"`` will also have an ``engine_kwargs`` keyword that accepts a dictionary that allows one to specify
331+
``"nogil"``, ``"nopython"`` and ``"parallel"`` keys with boolean values to pass into the ``@jit`` decorator.
332+
If ``engine_kwargs`` is not specified, it defaults to ``{"nogil": False, "nopython": True, "parallel": False}`` unless otherwise specified.
333+
334+
In terms of performance, **the first time a function is run using the Numba engine will be slow**
335+
as Numba will have some function compilation overhead. However, the JIT compiled functions are cached,
336+
and subsequent calls will be fast. In general, the Numba engine is performant with
337+
a larger amount of data points (e.g. 1+ million).
319338

320-
As of Numba version 0.20, pandas objects cannot be passed directly to Numba-compiled functions. Instead, one must pass the NumPy array underlying the pandas object to the Numba-compiled function as demonstrated below.
339+
.. code-block:: ipython
340+
341+
In [1]: data = pd.Series(range(1_000_000)) # noqa: E225
342+
343+
In [2]: roll = data.rolling(10)
321344
322-
Jit
323-
~~~
345+
In [3]: def f(x):
346+
...: return np.sum(x) + 5
347+
# Run the first time, compilation time will affect performance
348+
In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True)
349+
1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
350+
# Function is cached and performance will improve
351+
In [5]: %timeit roll.apply(f, engine='numba', raw=True)
352+
188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
324353
325-
We demonstrate how to use Numba to just-in-time compile our code. We simply
326-
take the plain Python code from above and annotate with the ``@jit`` decorator.
354+
In [6]: %timeit roll.apply(f, engine='cython', raw=True)
355+
3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
356+
357+
Custom Function Examples
358+
~~~~~~~~~~~~~~~~~~~~~~~~
359+
360+
A custom Python function decorated with ``@jit`` can be used with pandas objects by passing their NumPy array
361+
representations with ``to_numpy()``.
327362

328363
.. code-block:: python
329364
@@ -360,8 +395,6 @@ take the plain Python code from above and annotate with the ``@jit`` decorator.
360395
)
361396
return pd.Series(result, index=df.index, name="result")
362397
363-
Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a
364-
nicer interface by passing/returning pandas objects.
365398
366399
.. code-block:: ipython
367400
@@ -370,19 +403,9 @@ nicer interface by passing/returning pandas objects.
370403
371404
In this example, using Numba was faster than Cython.
372405

373-
Numba as an argument
374-
~~~~~~~~~~~~~~~~~~~~
375-
376-
Additionally, we can leverage the power of `Numba <https://numba.pydata.org/>`__
377-
by calling it as an argument in :meth:`~Rolling.apply`. See :ref:`Computation tools
378-
<window.numba_engine>` for an extensive example.
379-
380-
Vectorize
381-
~~~~~~~~~
382-
383406
Numba can also be used to write vectorized functions that do not require the user to explicitly
384407
loop over the observations of a vector; a vectorized function will be applied to each row automatically.
385-
Consider the following toy example of doubling each observation:
408+
Consider the following example of doubling each observation:
386409

387410
.. code-block:: python
388411
@@ -414,25 +437,23 @@ Consider the following toy example of doubling each observation:
414437
Caveats
415438
~~~~~~~
416439

417-
.. note::
418-
419-
Numba will execute on any function, but can only accelerate certain classes of functions.
420-
421440
Numba is best at accelerating functions that apply numerical functions to NumPy
422-
arrays. When passed a function that only uses operations it knows how to
423-
accelerate, it will execute in ``nopython`` mode.
424-
425-
If Numba is passed a function that includes something it doesn't know how to
426-
work with -- a category that currently includes sets, lists, dictionaries, or
427-
string functions -- it will revert to ``object mode``. In ``object mode``,
428-
Numba will execute but your code will not speed up significantly. If you would
441+
arrays. If you try to ``@jit`` a function that contains unsupported `Python <https://numba.readthedocs.io/en/stable/reference/pysupported.html>`__
442+
or `NumPy <https://numba.readthedocs.io/en/stable/reference/numpysupported.html>`__
443+
code, compilation will revert to `object mode <https://numba.readthedocs.io/en/stable/glossary.html#term-object-mode>`__ which
444+
will most likely not speed up your function. If you would
429445
prefer that Numba throw an error if it cannot compile a function in a way that
430446
speeds up your code, pass Numba the argument
431-
``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). For more on
447+
``nopython=True`` (e.g. ``@jit(nopython=True)``). For more on
432448
troubleshooting Numba modes, see the `Numba troubleshooting page
433449
<https://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#the-compiled-code-is-too-slow>`__.
434450

435-
Read more in the `Numba docs <https://numba.pydata.org/>`__.
451+
Using ``parallel=True`` (e.g. ``@jit(parallel=True)``) may result in a ``SIGABRT`` if the threading layer leads to unsafe
452+
behavior. You can first `specify a safe threading layer <https://numba.readthedocs.io/en/stable/user/threading-layer.html#selecting-a-threading-layer-for-safe-parallel-execution>`__
453+
before running a JIT function with ``parallel=True``.
454+
455+
Generally, if you encounter a segfault (``SIGSEGV``) while using Numba, please report the issue
456+
to the `Numba issue tracker <https://github.com/numba/numba/issues/new/choose>`__.
436457

437458
.. _enhancingperf.eval:
438459

doc/source/user_guide/groupby.rst

+3-51
Original file line numberDiff line numberDiff line change
@@ -1106,11 +1106,9 @@ Numba Accelerated Routines
11061106
.. versionadded:: 1.1
11071107

11081108
If `Numba <https://numba.pydata.org/>`__ is installed as an optional dependency, the ``transform`` and
1109-
``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments. The ``engine_kwargs``
1110-
argument is a dictionary of keyword arguments that will be passed into the
1111-
`numba.jit decorator <https://numba.pydata.org/numba-doc/latest/reference/jit-compilation.html#numba.jit>`__.
1112-
These keyword arguments will be applied to the passed function. Currently only ``nogil``, ``nopython``,
1113-
and ``parallel`` are supported, and their default values are set to ``False``, ``True`` and ``False`` respectively.
1109+
``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments.
1110+
See :ref:`enhancing performance with Numba <enhancingperf.numba>` for general usage of the arguments
1111+
and performance considerations.
11141112

11151113
The function signature must start with ``values, index`` **exactly** as the data belonging to each group
11161114
will be passed into ``values``, and the group index will be passed into ``index``.
@@ -1121,52 +1119,6 @@ will be passed into ``values``, and the group index will be passed into ``index`
11211119
data and group index will be passed as NumPy arrays to the JITed user defined function, and no
11221120
alternative execution attempts will be tried.
11231121

1124-
.. note::
1125-
1126-
In terms of performance, **the first time a function is run using the Numba engine will be slow**
1127-
as Numba will have some function compilation overhead. However, the compiled functions are cached,
1128-
and subsequent calls will be fast. In general, the Numba engine is performant with
1129-
a larger amount of data points (e.g. 1+ million).
1130-
1131-
.. code-block:: ipython
1132-
1133-
In [1]: N = 10 ** 3
1134-
1135-
In [2]: data = {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}
1136-
1137-
In [3]: df = pd.DataFrame(data, columns=[0, 1])
1138-
1139-
In [4]: def f_numba(values, index):
1140-
...: total = 0
1141-
...: for i, value in enumerate(values):
1142-
...: if i % 2:
1143-
...: total += value + 5
1144-
...: else:
1145-
...: total += value * 2
1146-
...: return total
1147-
...:
1148-
1149-
In [5]: def f_cython(values):
1150-
...: total = 0
1151-
...: for i, value in enumerate(values):
1152-
...: if i % 2:
1153-
...: total += value + 5
1154-
...: else:
1155-
...: total += value * 2
1156-
...: return total
1157-
...:
1158-
1159-
In [6]: groupby = df.groupby(0)
1160-
# Run the first time, compilation time will affect performance
1161-
In [7]: %timeit -r 1 -n 1 groupby.aggregate(f_numba, engine='numba') # noqa: E225
1162-
2.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1163-
# Function is cached and performance will improve
1164-
In [8]: %timeit groupby.aggregate(f_numba, engine='numba')
1165-
4.93 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1166-
1167-
In [9]: %timeit groupby.aggregate(f_cython, engine='cython')
1168-
18.6 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1169-
11701122
Other useful features
11711123
---------------------
11721124

doc/source/user_guide/io.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -5699,7 +5699,7 @@ Example of a callable using PostgreSQL `COPY clause
56995699
writer.writerows(data_iter)
57005700
s_buf.seek(0)
57015701

5702-
columns = ', '.join('"{}"'.format(k) for k in keys)
5702+
columns = ', '.join(['"{}"'.format(k) for k in keys])
57035703
if table.schema:
57045704
table_name = '{}.{}'.format(table.schema, table.name)
57055705
else:

doc/source/user_guide/timedeltas.rst

+7-1
Original file line numberDiff line numberDiff line change
@@ -88,13 +88,19 @@ or a list/array of strings:
8888
8989
pd.to_timedelta(["1 days 06:05:01.00003", "15.5us", "nan"])
9090
91-
The ``unit`` keyword argument specifies the unit of the Timedelta:
91+
The ``unit`` keyword argument specifies the unit of the Timedelta if the input
92+
is numeric:
9293

9394
.. ipython:: python
9495
9596
pd.to_timedelta(np.arange(5), unit="s")
9697
pd.to_timedelta(np.arange(5), unit="d")
9798
99+
.. warning::
100+
If a string or array of strings is passed as an input then the ``unit`` keyword
101+
argument will be ignored. If a string without units is passed then the default
102+
unit of nanoseconds is assumed.
103+
98104
.. _timedeltas.limitations:
99105

100106
Timedelta limitations

0 commit comments

Comments
 (0)