feefladder
diff --git a/‎.github/workflows/ci.yml
+1-1 b/‎.github/workflows/ci.yml
+1-1
diff --git a/‎.github/workflows/database.yml
+1-1 b/‎.github/workflows/database.yml
+1-1
diff --git a/‎.github/workflows/posix.yml
+1-1 b/‎.github/workflows/posix.yml
+1-1
diff --git a/‎.github/workflows/pre-commit.yml
+1-1 b/‎.github/workflows/pre-commit.yml
+1-1
diff --git a/‎.github/workflows/python-dev.yml
+1-1 b/‎.github/workflows/python-dev.yml
+1-1
diff --git a/‎.github/workflows/sdist.yml
+3 b/‎.github/workflows/sdist.yml
+3
diff --git a/‎.pre-commit-config.yaml
+2-2 b/‎.pre-commit-config.yaml
+2-2
diff --git a/‎asv_bench/benchmarks/frame_methods.py
+16 b/‎asv_bench/benchmarks/frame_methods.py
+16
diff --git a/‎asv_bench/benchmarks/io/csv.py
+3-2 b/‎asv_bench/benchmarks/io/csv.py
+3-2
diff --git a/‎doc/source/development/contributing_environment.rst
+1-1 b/‎doc/source/development/contributing_environment.rst
+1-1
diff --git a/‎doc/source/getting_started/tutorials.rst
+13 b/‎doc/source/getting_started/tutorials.rst
+13
diff --git a/‎doc/source/user_guide/enhancingperf.rst
+59-38 b/‎doc/source/user_guide/enhancingperf.rst
+59-38
diff --git a/‎doc/source/user_guide/groupby.rst
+3-51 b/‎doc/source/user_guide/groupby.rst
+3-51
diff --git a/‎doc/source/user_guide/io.rst
+1-1 b/‎doc/source/user_guide/io.rst
+1-1
diff --git a/‎doc/source/user_guide/timedeltas.rst
+7-1 b/‎doc/source/user_guide/timedeltas.rst
+7-1
@@ -23,7 +23,7 @@ jobs:
 
     concurrency:
       group: ${{ github.ref }}-checks
-      cancel-in-progress: true
+      cancel-in-progress: ${{github.event_name == 'pull_request'}}
 
     steps:
     - name: Checkout
 
@@ -31,7 +31,7 @@ jobs:
 
     concurrency:
       group: ${{ github.ref }}-${{ matrix.ENV_FILE }}
-      cancel-in-progress: true
+      cancel-in-progress: ${{github.event_name == 'pull_request'}}
 
     services:
       mysql:
 
@@ -45,7 +45,7 @@ jobs:
       TEST_ARGS: ${{ matrix.settings[6] }}
     concurrency:
       group: ${{ github.ref }}-${{ matrix.settings[0] }}
-      cancel-in-progress: true
+      cancel-in-progress: ${{github.event_name == 'pull_request'}}
 
     steps:
     - name: Checkout
 
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     concurrency:
       group: ${{ github.ref }}-pre-commit
-      cancel-in-progress: true
+      cancel-in-progress: ${{github.event_name == 'pull_request'}}
     steps:
     - uses: actions/checkout@v2
     - uses: actions/setup-python@v2
 
@@ -25,7 +25,7 @@ jobs:
 
     concurrency:
       group: ${{ github.ref }}-dev
-      cancel-in-progress: true
+      cancel-in-progress: ${{github.event_name == 'pull_request'}}
 
     steps:
     - uses: actions/checkout@v2
 
@@ -23,6 +23,9 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ["3.8", "3.9"]
+    concurrency:
+      group: ${{github.ref}}-${{matrix.python-version}}-sdist
+      cancel-in-progress: ${{github.event_name == 'pull_request'}}
 
     steps:
     - uses: actions/checkout@v2
 
@@ -53,11 +53,11 @@ repos:
         types: [text]
         args: [--append-config=flake8/cython-template.cfg]
 -   repo: https://github.com/PyCQA/isort
-    rev: 5.9.1
+    rev: 5.9.2
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v2.20.0
+    rev: v2.21.0
     hooks:
     -   id: pyupgrade
         args: [--py38-plus]
 
@@ -232,6 +232,22 @@ def time_to_html_mixed(self):
         self.df2.to_html()
 
 
+class ToDict:
+    params = [["dict", "list", "series", "split", "records", "index"]]
+    param_names = ["orient"]
+
+    def setup(self, orient):
+        data = np.random.randint(0, 1000, size=(10000, 4))
+        self.int_df = DataFrame(data)
+        self.datetimelike_df = self.int_df.astype("timedelta64[ns]")
+
+    def time_to_dict_ints(self, orient):
+        self.int_df.to_dict(orient=orient)
+
+    def time_to_dict_datetimelike(self, orient):
+        self.datetimelike_df.to_dict(orient=orient)
+
+
 class ToNumpy:
     def setup(self):
         N = 10000
 
@@ -291,7 +291,8 @@ class ReadCSVFloatPrecision(StringIORewind):
 
     def setup(self, sep, decimal, float_precision):
         floats = [
-            "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15)
+            "".join([random.choice(string.digits) for _ in range(28)])
+            for _ in range(15)
         ]
         rows = sep.join([f"0{decimal}" + "{}"] * 3) + "\n"
         data = rows * 5
@@ -395,7 +396,7 @@ class ReadCSVCachedParseDates(StringIORewind):
     param_names = ["do_cache", "engine"]
 
     def setup(self, do_cache, engine):
-        data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
+        data = ("\n".join([f"10/{year}" for year in range(2000, 2100)]) + "\n") * 10
         self.StringIO_input = StringIO(data)
 
     def time_read_csv_cached(self, do_cache, engine):
 
@@ -72,7 +72,7 @@ These packages will automatically be installed by using the ``pandas``
 
 **Windows**
 
-You will need `Build Tools for Visual Studio 2017
+You will need `Build Tools for Visual Studio 2019
 <https://visualstudio.microsoft.com/downloads/>`_.
 
 .. warning::
 
@@ -18,6 +18,19 @@ entails.
 For the table of contents, see the `pandas-cookbook GitHub
 repository <https://github.com/jvns/pandas-cookbook>`_.
 
+pandas workshop by Stefanie Molin
+---------------------------------
+
+An introductory workshop by `Stefanie Molin <https://github.com/stefmolin>`_
+designed to quickly get you up to speed with pandas using real-world datasets.
+It covers getting started with pandas, data wrangling, and data visualization
+(with some exposure to matplotlib and seaborn). The
+`pandas-workshop GitHub repository <https://github.com/stefmolin/pandas-workshop>`_
+features detailed environment setup instructions (including a Binder environment),
+slides and notebooks for following along, and exercises to practice the concepts.
+There is also a lab with new exercises on a dataset not covered in the workshop for
+additional practice.
+
 Learn pandas by Hernan Rojas
 ----------------------------
 
 
@@ -302,28 +302,63 @@ For more about ``boundscheck`` and ``wraparound``, see the Cython docs on
 
 .. _enhancingperf.numba:
 
-Using Numba
------------
+Numba (JIT compilation)
+-----------------------
 
-A recent alternative to statically compiling Cython code, is to use a *dynamic jit-compiler*, Numba.
+An alternative to statically compiling Cython code is to use a dynamic just-in-time (JIT) compiler with `Numba <https://numba.pydata.org/>`__.
 
-Numba gives you the power to speed up your applications with high performance functions written directly in Python. With a few annotations, array-oriented and math-heavy Python code can be just-in-time compiled to native machine instructions, similar in performance to C, C++ and Fortran, without having to switch languages or Python interpreters.
+Numba allows you to write a pure Python function which can be JIT compiled to native machine instructions, similar in performance to C, C++ and Fortran,
+by decorating your function with ``@jit``.
 
-Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool). Numba supports compilation of Python to run on either CPU or GPU hardware, and is designed to integrate with the Python scientific software stack.
+Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool).
+Numba supports compilation of Python to run on either CPU or GPU hardware and is designed to integrate with the Python scientific software stack.
 
 .. note::
 
-    You will need to install Numba. This is easy with ``conda``, by using: ``conda install numba``, see :ref:`installing using miniconda<install.miniconda>`.
+    The ``@jit`` compilation will add overhead to the runtime of the function, so performance benefits may not be realized, especially when using small data sets.
+    Consider `caching <https://numba.readthedocs.io/en/stable/developer/caching.html>`__ your function to avoid compilation overhead each time your function is run.
 
-.. note::
+Numba can be used in 2 ways with pandas:
+
+#. Specify the ``engine="numba"`` keyword in select pandas methods
+#. Define your own Python function decorated with ``@jit`` and pass the underlying NumPy array of :class:`Series` or :class:`DataFrame` (using ``to_numpy()``) into the function
+
+pandas Numba Engine
+~~~~~~~~~~~~~~~~~~~
+
+If Numba is installed, one can specify ``engine="numba"`` in select pandas methods to execute the method using Numba.
+Methods that support ``engine="numba"`` will also have an ``engine_kwargs`` keyword that accepts a dictionary that allows one to specify
+``"nogil"``, ``"nopython"`` and ``"parallel"`` keys with boolean values to pass into the ``@jit`` decorator.
+If ``engine_kwargs`` is not specified, it defaults to ``{"nogil": False, "nopython": True, "parallel": False}`` unless otherwise specified.
+
+In terms of performance, **the first time a function is run using the Numba engine will be slow**
+as Numba will have some function compilation overhead. However, the JIT compiled functions are cached,
+and subsequent calls will be fast. In general, the Numba engine is performant with
+a larger number of data points (e.g. 1+ million).
 
-    As of Numba version 0.20, pandas objects cannot be passed directly to Numba-compiled functions. Instead, one must pass the NumPy array underlying the pandas object to the Numba-compiled function as demonstrated below.
+.. code-block:: ipython
+
+   In [1]: data = pd.Series(range(1_000_000))  # noqa: E225
+
+   In [2]: roll = data.rolling(10)
 
-Jit
-~~~
+   In [3]: def f(x):
+      ...:     return np.sum(x) + 5
+   # Run the first time, compilation time will affect performance
+   In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True)
+   1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
+   # Function is cached and performance will improve
+   In [5]: %timeit roll.apply(f, engine='numba', raw=True)
+   188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
 
-We demonstrate how to use Numba to just-in-time compile our code. We simply
-take the plain Python code from above and annotate with the ``@jit`` decorator.
+   In [6]: %timeit roll.apply(f, engine='cython', raw=True)
+   3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+Custom Function Examples
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+A custom Python function decorated with ``@jit`` can be used with pandas objects by passing their NumPy array
+representations with ``to_numpy()``.
 
 .. code-block:: python
 
@@ -360,8 +395,6 @@ take the plain Python code from above and annotate with the ``@jit`` decorator.
        )
        return pd.Series(result, index=df.index, name="result")
 
-Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a
-nicer interface by passing/returning pandas objects.
 
 .. code-block:: ipython
 
@@ -370,19 +403,9 @@ nicer interface by passing/returning pandas objects.
 
 In this example, using Numba was faster than Cython.
 
-Numba as an argument
-~~~~~~~~~~~~~~~~~~~~
-
-Additionally, we can leverage the power of `Numba <https://numba.pydata.org/>`__
-by calling it as an argument in :meth:`~Rolling.apply`. See :ref:`Computation tools
-<window.numba_engine>` for an extensive example.
-
-Vectorize
-~~~~~~~~~
-
 Numba can also be used to write vectorized functions that do not require the user to explicitly
 loop over the observations of a vector; a vectorized function will be applied to each row automatically.
-Consider the following toy example of doubling each observation:
+Consider the following example of doubling each observation:
 
 .. code-block:: python
 
@@ -414,25 +437,23 @@ Consider the following toy example of doubling each observation:
 Caveats
 ~~~~~~~
 
-.. note::
-
-    Numba will execute on any function, but can only accelerate certain classes of functions.
-
 Numba is best at accelerating functions that apply numerical functions to NumPy
-arrays. When passed a function that only uses operations it knows how to
-accelerate, it will execute in ``nopython`` mode.
-
-If Numba is passed a function that includes something it doesn't know how to
-work with -- a category that currently includes sets, lists, dictionaries, or
-string functions -- it will revert to ``object mode``. In ``object mode``,
-Numba will execute but your code will not speed up significantly. If you would
+arrays. If you try to ``@jit`` a function that contains unsupported `Python <https://numba.readthedocs.io/en/stable/reference/pysupported.html>`__
+or `NumPy <https://numba.readthedocs.io/en/stable/reference/numpysupported.html>`__
+code, compilation will revert to `object mode <https://numba.readthedocs.io/en/stable/glossary.html#term-object-mode>`__ which
+will most likely not speed up your function. If you would
 prefer that Numba throw an error if it cannot compile a function in a way that
 speeds up your code, pass Numba the argument
-``nopython=True`` (e.g.  ``@numba.jit(nopython=True)``). For more on
+``nopython=True`` (e.g.  ``@jit(nopython=True)``). For more on
 troubleshooting Numba modes, see the `Numba troubleshooting page
 <https://numba.pydata.org/numba-doc/latest/user/troubleshoot.html#the-compiled-code-is-too-slow>`__.
 
-Read more in the `Numba docs <https://numba.pydata.org/>`__.
+Using ``parallel=True`` (e.g. ``@jit(parallel=True)``) may result in a ``SIGABRT`` if the threading layer leads to unsafe
+behavior. You can first `specify a safe threading layer <https://numba.readthedocs.io/en/stable/user/threading-layer.html#selecting-a-threading-layer-for-safe-parallel-execution>`__
+before running a JIT function with ``parallel=True``.
+
+Generally, if you encounter a segfault (``SIGSEGV``) while using Numba, please report the issue
+to the `Numba issue tracker <https://github.com/numba/numba/issues/new/choose>`__.
 
 .. _enhancingperf.eval:
 
 
@@ -1106,11 +1106,9 @@ Numba Accelerated Routines
 .. versionadded:: 1.1
 
 If `Numba <https://numba.pydata.org/>`__ is installed as an optional dependency, the ``transform`` and
-``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments. The ``engine_kwargs``
-argument is a dictionary of keyword arguments that will be passed into the
-`numba.jit decorator <https://numba.pydata.org/numba-doc/latest/reference/jit-compilation.html#numba.jit>`__.
-These keyword arguments will be applied to the passed function. Currently only ``nogil``, ``nopython``,
-and ``parallel`` are supported, and their default values are set to ``False``, ``True`` and ``False`` respectively.
+``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments.
+See :ref:`enhancing performance with Numba <enhancingperf.numba>` for general usage of the arguments
+and performance considerations.
 
 The function signature must start with ``values, index`` **exactly** as the data belonging to each group
 will be passed into ``values``, and the group index will be passed into ``index``.
@@ -1121,52 +1119,6 @@ will be passed into ``values``, and the group index will be passed into ``index`
    data and group index will be passed as NumPy arrays to the JITed user defined function, and no
    alternative execution attempts will be tried.
 
-.. note::
-
-   In terms of performance, **the first time a function is run using the Numba engine will be slow**
-   as Numba will have some function compilation overhead. However, the compiled functions are cached,
-   and subsequent calls will be fast. In general, the Numba engine is performant with
-   a larger amount of data points (e.g. 1+ million).
-
-.. code-block:: ipython
-
-   In [1]: N = 10 ** 3
-
-   In [2]: data = {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}
-
-   In [3]: df = pd.DataFrame(data, columns=[0, 1])
-
-   In [4]: def f_numba(values, index):
-      ...:     total = 0
-      ...:     for i, value in enumerate(values):
-      ...:         if i % 2:
-      ...:             total += value + 5
-      ...:         else:
-      ...:             total += value * 2
-      ...:     return total
-      ...:
-
-   In [5]: def f_cython(values):
-      ...:     total = 0
-      ...:     for i, value in enumerate(values):
-      ...:         if i % 2:
-      ...:             total += value + 5
-      ...:         else:
-      ...:             total += value * 2
-      ...:     return total
-      ...:
-
-   In [6]: groupby = df.groupby(0)
-   # Run the first time, compilation time will affect performance
-   In [7]: %timeit -r 1 -n 1 groupby.aggregate(f_numba, engine='numba')  # noqa: E225
-   2.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
-   # Function is cached and performance will improve
-   In [8]: %timeit groupby.aggregate(f_numba, engine='numba')
-   4.93 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
-
-   In [9]: %timeit groupby.aggregate(f_cython, engine='cython')
-   18.6 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
-
 Other useful features
 ---------------------
 
 
@@ -5699,7 +5699,7 @@ Example of a callable using PostgreSQL `COPY clause
           writer.writerows(data_iter)
           s_buf.seek(0)
 
-          columns = ', '.join('"{}"'.format(k) for k in keys)
+          columns = ', '.join(['"{}"'.format(k) for k in keys])
           if table.schema:
               table_name = '{}.{}'.format(table.schema, table.name)
           else:
 
@@ -88,13 +88,19 @@ or a list/array of strings:
 
    pd.to_timedelta(["1 days 06:05:01.00003", "15.5us", "nan"])
 
-The ``unit`` keyword argument specifies the unit of the Timedelta:
+The ``unit`` keyword argument specifies the unit of the Timedelta if the input
+is numeric:
 
 .. ipython:: python
 
    pd.to_timedelta(np.arange(5), unit="s")
    pd.to_timedelta(np.arange(5), unit="d")
 
+.. warning::
+    If a string or array of strings is passed as an input then the ``unit`` keyword
+    argument will be ignored. If a string without units is passed then the default
+    unit of nanoseconds is assumed.
+
 .. _timedeltas.limitations:
 
 Timedelta limitations