
Sync Fork from Upstream Repo #154


Merged · 10 commits merged on Mar 24, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.4.rst
@@ -17,6 +17,7 @@ Fixed regressions

- Fixed regression in :meth:`DataFrame.sum` when ``min_count`` greater than the :class:`DataFrame` shape was passed resulted in a ``ValueError`` (:issue:`39738`)
- Fixed regression in :meth:`DataFrame.to_json` raising ``AttributeError`` when run on PyPy (:issue:`39837`)
- Fixed regression in :meth:`DataFrame.where` not returning a copy in the case of an all True condition (:issue:`39595`)
-
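Illustration (not part of the diff): a minimal sketch of the :meth:`DataFrame.where` regression fix noted above (:issue:`39595`). With an all-True condition, ``where`` now returns a copy rather than the original object:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    cond = df > 0               # an all-True condition
    result = df.where(cond)     # with the fix, this is a copy
    result.loc[0, "a"] = 99
    assert df.loc[0, "a"] == 1  # the original frame is unchanged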

.. ---------------------------------------------------------------------------
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -136,6 +136,7 @@ Other enhancements
- :meth:`.Styler.set_table_styles` amended to optionally allow certain css-string input arguments (:issue:`39564`)
- :meth:`.Styler.apply` now more consistently accepts ndarray function returns, i.e. in all cases for ``axis`` is ``0, 1 or None`` (:issue:`39359`)
- :meth:`.Styler.apply` and :meth:`.Styler.applymap` now raise errors if wrong format CSS is passed on render (:issue:`39660`)
- :meth:`.Styler.format` adds keyword argument ``escape`` for optional HTML escaping (:issue:`40437`)
- Builtin highlighting methods in :class:`Styler` have a more consistent signature and css customisability (:issue:`40242`)
- :meth:`Series.loc.__getitem__` and :meth:`Series.loc.__setitem__` with :class:`MultiIndex` now raising helpful error message when indexer has too many dimensions (:issue:`35349`)
- :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
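Illustration (not part of the diff): a hedged sketch of the new ``escape`` keyword in :meth:`.Styler.format` (:issue:`40437`), assuming the 1.3.0 boolean form of the argument:

    import pandas as pd

    df = pd.DataFrame({"raw": ["<b>bold</b>", "a & b"]})
    # escape=True HTML-escapes cell text before any other formatting
    html = df.style.format(escape=True).render()
    assert "&lt;b&gt;" in html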
@@ -631,6 +632,7 @@ Groupby/resample/rolling
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would incorrectly raise a ``ValueError`` when providing ``times`` (:issue:`40164`)
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`)
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
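Illustration (not part of the diff): an assumed reproduction of the ``asfreq`` bug noted above (:issue:`39805`). Before the fix, an unsorted index could cause rows to be silently dropped:

    import pandas as pd

    idx = pd.to_datetime(["2021-01-03", "2021-01-01", "2021-01-02"])  # unsorted
    s = pd.Series([3, 1, 2], index=idx)
    out = s.asfreq("D")  # previously could drop rows; the fix keeps them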

Reshaping
^^^^^^^^^
@@ -649,6 +651,7 @@ Reshaping
- Allow :class:`Index` to be passed to the :func:`numpy.all` function (:issue:`40180`)
- Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`)
- Bug in :func:`to_datetime` raising error when input sequence contains unhashable items (:issue:`39756`)
- Bug in :meth:`Series.explode` preserving index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`)
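Illustration (not part of the diff): with the :meth:`Series.explode` fix above (:issue:`40487`), ``ignore_index=True`` now resets the index even when the values are scalars and nothing actually explodes:

    import pandas as pd

    s = pd.Series([1, 2, 3], index=[10, 20, 30])  # scalars, nothing to explode
    out = s.explode(ignore_index=True)
    assert list(out.index) == [0, 1, 2]  # RangeIndex, not the original labels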

Sparse
^^^^^^
162 changes: 89 additions & 73 deletions pandas/_libs/algos.pyx

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions pandas/_libs/algos_take_helper.pxi.in
@@ -8,6 +8,32 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
# take_1d, take_2d
# ----------------------------------------------------------------------


@cython.wraparound(False)
@cython.boundscheck(False)
def take_1d_intp_intp(
const intp_t[:] values,
const intp_t[:] indexer,
intp_t[::1] out,
intp_t fill_value=-1,
):
cdef:
Py_ssize_t i, n, idx
intp_t fv

n = indexer.shape[0]

fv = fill_value

with nogil:
for i in range(n):
idx = indexer[i]
if idx == -1:
out[i] = fv
else:
out[i] = values[idx]
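
For orientation (not part of the diff), the semantics of ``take_1d_intp_intp`` match this pure-NumPy sketch; ``take_1d_reference`` is a hypothetical name used only here:

    import numpy as np

    def take_1d_reference(values, indexer, fill_value=-1):
        # Entries of -1 in `indexer` mark missing positions and receive
        # `fill_value`; all other positions gather from `values`.
        out = np.empty(len(indexer), dtype=np.intp)
        mask = indexer == -1
        out[mask] = fill_value
        out[~mask] = values[indexer[~mask]]
        return out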


{{py:

# c_type_in, c_type_out
146 changes: 60 additions & 86 deletions pandas/_libs/groupby.pyx
@@ -37,6 +37,7 @@ from pandas._libs.util cimport (
)

from pandas._libs.algos import (
ensure_platform_int,
groupsort_indexer,
rank_1d,
take_2d_axis1_float64_float64,
@@ -111,7 +112,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
"""
cdef:
Py_ssize_t i, j, N, K, ngroups, size
ndarray[int64_t] _counts
ndarray[intp_t] _counts
ndarray[float64_t, ndim=2] data
ndarray[intp_t] indexer
float64_t* ptr
@@ -121,7 +122,7 @@
ngroups = len(counts)
N, K = (<object>values).shape

indexer, _counts = groupsort_indexer(labels, ngroups)
indexer, _counts = groupsort_indexer(ensure_platform_int(labels), ngroups)
counts[:] = _counts[1:]

data = np.empty((K, N), dtype=np.float64)
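
Note (not part of the diff): ``ensure_platform_int`` coerces integer labels to ``np.intp`` so they can be fed to platform-int kernels such as ``groupsort_indexer``; a small sketch of the call:

    import numpy as np
    from pandas._libs.algos import ensure_platform_int

    labels = np.array([0, 1, 0, 1], dtype=np.int64)
    assert ensure_platform_int(labels).dtype == np.intp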
@@ -1127,18 +1128,40 @@ ctypedef fused groupby_t:

@cython.wraparound(False)
@cython.boundscheck(False)
def group_max(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
Py_ssize_t min_count=-1):
cdef group_min_max(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
Py_ssize_t min_count=-1,
bint compute_max=True):
"""
Only aggregates on axis=0
Compute minimum/maximum of columns of `values`, in row groups `labels`.

Parameters
----------
out : array
Array to store result in.
counts : int64 array
Input as a zeroed array, populated by group sizes during algorithm
values : array
Values to find column-wise min/max of.
labels : int64 array
Labels to group by.
min_count : Py_ssize_t, default -1
The minimum number of non-NA group elements, NA result if threshold
is not met
compute_max : bint, default True
True to compute group-wise max, False to compute min

Notes
-----
This method modifies the `out` parameter, rather than returning an object.
`counts` is modified to hold group sizes
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
groupby_t val, count, nan_val
ndarray[groupby_t, ndim=2] maxx
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
groupby_t val, nan_val
ndarray[groupby_t, ndim=2] group_min_or_max
bint runtime_error = False
int64_t[:, ::1] nobs

@@ -1150,18 +1173,17 @@ def group_max(groupby_t[:, ::1] out,
min_count = max(min_count, 1)
nobs = np.zeros((<object>out).shape, dtype=np.int64)

maxx = np.empty_like(out)
group_min_or_max = np.empty_like(out)
if groupby_t is int64_t:
# Note: evaluated at compile-time
maxx[:] = -_int64_max
group_min_or_max[:] = -_int64_max if compute_max else _int64_max
nan_val = NPY_NAT
elif groupby_t is uint64_t:
# NB: We do not define nan_val because there is no such thing
# for uint64_t. We carefully avoid having to reference it in this
# case.
maxx[:] = 0
group_min_or_max[:] = 0 if compute_max else np.iinfo(np.uint64).max
else:
maxx[:] = -np.inf
group_min_or_max[:] = -np.inf if compute_max else np.inf
nan_val = NAN

N, K = (<object>values).shape
@@ -1179,20 +1201,23 @@
if not _treat_as_na(val, True):
# TODO: Sure we always want is_datetimelike=True?
nobs[lab, j] += 1
if val > maxx[lab, j]:
maxx[lab, j] = val
if compute_max:
if val > group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
else:
if val < group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val

for i in range(ncounts):
for i in range(ngroups):
for j in range(K):
if nobs[i, j] < min_count:
if groupby_t is uint64_t:
runtime_error = True
break
else:
out[i, j] = nan_val
else:
out[i, j] = maxx[i, j]
out[i, j] = group_min_or_max[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
@@ -1202,75 +1227,24 @@

@cython.wraparound(False)
@cython.boundscheck(False)
def group_min(groupby_t[:, ::1] out,
def group_max(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
groupby_t val, count, nan_val
ndarray[groupby_t, ndim=2] minx
bint runtime_error = False
int64_t[:, ::1] nobs

# TODO(cython 3.0):
# Instead of `labels.shape[0]` use `len(labels)`
if not len(values) == labels.shape[0]:
raise AssertionError("len(index) != len(labels)")

min_count = max(min_count, 1)
nobs = np.zeros((<object>out).shape, dtype=np.int64)

minx = np.empty_like(out)
if groupby_t is int64_t:
minx[:] = _int64_max
nan_val = NPY_NAT
elif groupby_t is uint64_t:
# NB: We do not define nan_val because there is no such thing
# for uint64_t. We carefully avoid having to reference it in this
# case.
minx[:] = np.iinfo(np.uint64).max
else:
minx[:] = np.inf
nan_val = NAN
"""See group_min_max.__doc__"""
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True)

N, K = (<object>values).shape

with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue

counts[lab] += 1
for j in range(K):
val = values[i, j]

if not _treat_as_na(val, True):
# TODO: Sure we always want is_datetimelike=True?
nobs[lab, j] += 1
if val < minx[lab, j]:
minx[lab, j] = val

for i in range(ncounts):
for j in range(K):
if nobs[i, j] < min_count:
if groupby_t is uint64_t:
runtime_error = True
break
else:
out[i, j] = nan_val
else:
out[i, j] = minx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")
@cython.wraparound(False)
@cython.boundscheck(False)
def group_min(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
Py_ssize_t min_count=-1):
"""See group_min_max.__doc__"""
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False)
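
For orientation (not part of the diff), a hedged pure-NumPy sketch of what ``group_min_max`` computes; the name and the NaN handling are simplifications of the Cython kernel:

    import numpy as np

    def group_min_max_reference(values, labels, ngroups, compute_max=True):
        # values: (N, K) float array; labels: (N,) group ids, -1 means "skip row"
        op = np.fmax if compute_max else np.fmin   # fmax/fmin ignore NaNs
        init = -np.inf if compute_max else np.inf
        out = np.full((ngroups, values.shape[1]), init)
        counts = np.zeros(ngroups, dtype=np.int64)
        for i, lab in enumerate(labels):
            if lab < 0:
                continue
            counts[lab] += 1
            out[lab] = op(out[lab], values[i])
        return out, counts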


@cython.boundscheck(False)
50 changes: 50 additions & 0 deletions pandas/_libs/internals.pyx
@@ -455,3 +455,53 @@ def get_blkno_placements(blknos, group: bool = True):

for blkno, indexer in get_blkno_indexers(blknos, group):
yield blkno, BlockPlacement(indexer)


@cython.freelist(64)
cdef class Block:
"""
Defining __init__ in a cython class significantly improves performance.
"""
cdef:
public BlockPlacement _mgr_locs
readonly int ndim
public object values

def __cinit__(self, values, placement: BlockPlacement, ndim: int):
"""
Parameters
----------
values : np.ndarray or ExtensionArray
We assume maybe_coerce_values has already been called.
placement : BlockPlacement
ndim : int
1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
"""
self._mgr_locs = placement
self.ndim = ndim
self.values = values

cpdef __reduce__(self):
# We have to do some gymnastics b/c "ndim" is keyword-only
from functools import partial

from pandas.core.internals.blocks import new_block

args = (self.values, self.mgr_locs.indexer)
func = partial(new_block, ndim=self.ndim)
return func, args

cpdef __setstate__(self, state):
from pandas.core.construction import extract_array

self.mgr_locs = BlockPlacement(state[0])
self.values = extract_array(state[1], extract_numpy=True)
if len(state) > 2:
# we stored ndim
self.ndim = state[2]
else:
# older pickle
from pandas.core.internals.api import maybe_infer_ndim

ndim = maybe_infer_ndim(self.values, self.mgr_locs)
self.ndim = ndim
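
Illustration (not part of the diff): a hedged sketch of what ``__reduce__``/``__setstate__`` enable, assuming the pandas >= 1.3 internals API where ``make_block`` lives in ``pandas.core.internals.api``:

    import pickle

    import numpy as np
    from pandas.core.internals.api import make_block

    blk = make_block(np.arange(6).reshape(2, 3), placement=[0, 1])
    restored = pickle.loads(pickle.dumps(blk))  # round-trips via __reduce__
    assert (restored.values == blk.values).all()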