diff --git a/doc/source/whatsnew/v1.2.4.rst b/doc/source/whatsnew/v1.2.4.rst index c7bc337239faf..45d131327630e 100644 --- a/doc/source/whatsnew/v1.2.4.rst +++ b/doc/source/whatsnew/v1.2.4.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sum` when ``min_count`` greater than the :class:`DataFrame` shape was passed resulted in a ``ValueError`` (:issue:`39738`) - Fixed regression in :meth:`DataFrame.to_json` raising ``AttributeError`` when run on PyPy (:issue:`39837`) +- Fixed regression in :meth:`DataFrame.where` not returning a copy in the case of an all True condition (:issue:`39595`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 4e7067da3cc72..512e6e6cbb391 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -136,6 +136,7 @@ Other enhancements - :meth:`.Styler.set_table_styles` amended to optionally allow certain css-string input arguments (:issue:`39564`) - :meth:`.Styler.apply` now more consistently accepts ndarray function returns, i.e. in all cases for ``axis`` is ``0, 1 or None`` (:issue:`39359`) - :meth:`.Styler.apply` and :meth:`.Styler.applymap` now raise errors if wrong format CSS is passed on render (:issue:`39660`) +- :meth:`.Styler.format` adds keyword argument ``escape`` for optional HTML escaping (:issue:`40437`) - Builtin highlighting methods in :class:`Styler` have a more consistent signature and css customisability (:issue:`40242`) - :meth:`Series.loc.__getitem__` and :meth:`Series.loc.__setitem__` with :class:`MultiIndex` now raising helpful error message when indexer has too many dimensions (:issue:`35349`) - :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files. @@ -631,6 +632,7 @@ Groupby/resample/rolling - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would incorrectly raise a ``ValueError`` when providing ``times`` (:issue:`40164`) - Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`) - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`) +- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) Reshaping ^^^^^^^^^ @@ -649,6 +651,7 @@ Reshaping - Allow :class:`Index` to be passed to the :func:`numpy.all` function (:issue:`40180`) - Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`) - Bug in :func:`to_datetime` raising error when input sequence contains unhashable items (:issue:`39756`) +- Bug in :meth:`Series.explode` preserving index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`) Sparse ^^^^^^ diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 94bd8b49777cf..122a014604bf0 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -191,7 +191,7 @@ def is_lexsorted(list_of_arrays: list) -> bint: @cython.boundscheck(False) @cython.wraparound(False) -def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): +def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups): """ Compute a 1-d indexer. 
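For readers less familiar with this routine, the sketch below is a pure-NumPy illustration (not the Cython implementation above) of what groupsort_indexer computes: a stable, counting-sort-style indexer that orders positions group by group, plus per-group counts. It also hints at why the output dtype moves to platform int here: np.intp indexers feed ndarray.take and fancy indexing without an extra cast. The helper name is hypothetical.

import numpy as np

def groupsort_indexer_sketch(codes, ngroups):
    # Tally group sizes; label -1 (missing) lands in slot 0, so NA rows sort first.
    counts = np.zeros(ngroups + 1, dtype=np.intp)
    np.add.at(counts, np.asarray(codes) + 1, 1)
    # Running start offset of each group within the sorted result.
    where = np.zeros(ngroups + 1, dtype=np.intp)
    where[1:] = counts[:-1].cumsum()
    indexer = np.zeros(len(codes), dtype=np.intp)
    for i, lab in enumerate(codes):
        indexer[where[lab + 1]] = i
        where[lab + 1] += 1
    return indexer, counts

indexer, counts = groupsort_indexer_sketch(np.array([1, 0, 1, 2, 0], dtype=np.intp), 3)
# indexer -> [1, 4, 0, 2, 3]  (positions of group 0, then group 1, then group 2)
# counts  -> [0, 2, 2, 1]     (NA slot first, then one count per group)
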
@@ -200,7 +200,7 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): Parameters ---------- - index: int64 ndarray + index: np.ndarray[np.intp] Mappings from group -> position. ngroups: int64 Number of groups. @@ -209,7 +209,7 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): ------- ndarray[intp_t, ndim=1] Indexer - ndarray[int64_t, ndim=1] + ndarray[intp_t, ndim=1] Group Counts Notes @@ -218,13 +218,12 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): """ cdef: Py_ssize_t i, loc, label, n - ndarray[int64_t] counts, where - ndarray[intp_t] indexer + ndarray[intp_t] indexer, where, counts - counts = np.zeros(ngroups + 1, dtype=np.int64) + counts = np.zeros(ngroups + 1, dtype=np.intp) n = len(index) indexer = np.zeros(n, dtype=np.intp) - where = np.zeros(ngroups + 1, dtype=np.int64) + where = np.zeros(ngroups + 1, dtype=np.intp) with nogil: @@ -995,15 +994,19 @@ def rank_1d( cdef: TiebreakEnumType tiebreak Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0 ndarray[int64_t, ndim=1] lexsort_indexer ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff, check_labels + bint keep_na, at_end, next_val_diff, check_labels, group_changed rank_t nan_fill_val tiebreak = tiebreakers[ties_method] + if tiebreak == TIEBREAK_FIRST: + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + keep_na = na_option == 'keep' N = len(values) @@ -1011,6 +1014,7 @@ def rank_1d( assert len(labels) == N out = np.empty(N) grp_sizes = np.ones(N) + # If all 0 labels, can short-circuit later label # comparisons check_labels = np.any(labels) @@ -1032,6 +1036,12 @@ def rank_1d( else: mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) + # If `na_option == 'top'`, we want to assign the lowest rank + # to NaN regardless of ascending/descending. So if ascending, + # fill with lowest value of type to end up with lowest rank. + # If descending, fill with highest value since descending + # will flip the ordering to still end up with lowest rank. + # Symmetric logic applies to `na_option == 'bottom'` if ascending ^ (na_option == 'top'): if rank_t is object: nan_fill_val = Infinity() @@ -1074,36 +1084,36 @@ def rank_1d( if rank_t is object: for i in range(N): at_end = i == N - 1 + # dups and sum_ranks will be incremented each loop where # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers + # when either of those change. Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 + next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], + masked_vals[lexsort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) + # Update out only when there is a transition of values or labels. 
# When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if not at_end: - next_val_diff = are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) - else: - next_val_diff = True - - if (next_val_diff - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) - or (check_labels - and (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) - ): - # if keep_na, check for missing values and assign back + if (next_val_diff or group_changed + or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): + + # If keep_na, check for missing values and assign back # to the result where appropriate if keep_na and mask[lexsort_indexer[i]]: + grp_na_count = dups for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = NaN - grp_na_count = dups elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = sum_ranks / dups @@ -1113,37 +1123,41 @@ def rank_1d( elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = i - grp_start + 1 + + # With n as the previous rank in the group and m as the number + # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, + # then rankings should be n + 1, n + 2 ... n + m elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - if ascending: - out[lexsort_indexer[j]] = j + 1 - grp_start - else: - out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + out[lexsort_indexer[j]] = j + 1 - grp_start + + # If TIEBREAK_FIRST and descending, the ranking should be + # n + m, n + (m - 1) ... n + 1. This is equivalent to + # (i - dups + 1) + (i - j + 1) - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = grp_vals_seen - # look forward to the next value (using the sorting in _as) + # Look forward to the next value (using the sorting in lexsort_indexer) # if the value does not equal the current value then we need to # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality + # coming up. The conditional also needs to handle nan equality # and the end of iteration if next_val_diff or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 - grp_tie_count += 1 # Similar to the previous conditional, check now if we are # moving to a new group. If so, keep track of the index where # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). 
Also be # sure to reset any of the items helping to calculate dups - if (at_end or - (check_labels - and (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]]))): + if group_changed: if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ @@ -1151,46 +1165,45 @@ def rank_1d( else: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ - (grp_tie_count - (grp_na_count > 0)) + (grp_vals_seen - 1 - (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 - grp_tie_count = 0 grp_start = i + 1 grp_vals_seen = 1 else: with nogil: for i in range(N): at_end = i == N - 1 + # dups and sum_ranks will be incremented each loop where # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers + # when either of those change. Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] + != masked_vals[lexsort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) + # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if not at_end: - next_val_diff = (masked_vals[lexsort_indexer[i]] - != masked_vals[lexsort_indexer[i+1]]) - else: - next_val_diff = True - - if (next_val_diff - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) - or (check_labels - and (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) - ): - # if keep_na, check for missing values and assign back + if (next_val_diff or group_changed + or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): + + # If keep_na, check for missing values and assign back # to the result where appropriate if keep_na and mask[lexsort_indexer[i]]: + grp_na_count = dups for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = NaN - grp_na_count = dups elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = sum_ranks / dups @@ -1200,37 +1213,41 @@ def rank_1d( elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = i - grp_start + 1 + + # With n as the previous rank in the group and m as the number + # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, + # then rankings should be n + 1, n + 2 ... n + m elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - if ascending: - out[lexsort_indexer[j]] = j + 1 - grp_start - else: - out[lexsort_indexer[j]] = \ - (2 * i - j - dups + 2 - grp_start) + out[lexsort_indexer[j]] = j + 1 - grp_start + + # If TIEBREAK_FIRST and descending, the ranking should be + # n + m, n + (m - 1) ... n + 1. 
This is equivalent to + # (i - dups + 1) + (i - j + 1) - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start elif tiebreak == TIEBREAK_DENSE: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = grp_vals_seen - # look forward to the next value (using the sorting in + # Look forward to the next value (using the sorting in # lexsort_indexer) if the value does not equal the current - # value then we need to reset the dups and sum_ranks, - # knowing that a new value is coming up. the conditional - # also needs to handle nan equality and the end of iteration + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration if next_val_diff or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 - grp_tie_count += 1 # Similar to the previous conditional, check now if we are # moving to a new group. If so, keep track of the index where # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). Also be # sure to reset any of the items helping to calculate dups - if at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])): + if group_changed: if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ @@ -1238,10 +1255,9 @@ def rank_1d( else: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ - (grp_tie_count - (grp_na_count > 0)) + (grp_vals_seen - 1 - (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 - grp_tie_count = 0 grp_start = i + 1 grp_vals_seen = 1 diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index cdf4ef3b119d2..929cb86c41036 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -8,6 +8,32 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # take_1d, take_2d # ---------------------------------------------------------------------- + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_intp_intp( + const intp_t[:] values, + const intp_t[:] indexer, + intp_t[::1] out, + intp_t fill_value=-1, +): + cdef: + Py_ssize_t i, n, idx + intp_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i in range(n): + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + {{py: # c_type_in, c_type_out diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f09a6c04aecbf..9766b82b1e9d5 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -37,6 +37,7 @@ from pandas._libs.util cimport ( ) from pandas._libs.algos import ( + ensure_platform_int, groupsort_indexer, rank_1d, take_2d_axis1_float64_float64, @@ -111,7 +112,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts + ndarray[intp_t] _counts ndarray[float64_t, ndim=2] data ndarray[intp_t] indexer float64_t* ptr @@ -121,7 +122,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, ngroups = len(counts) N, K = (values).shape - indexer, _counts = 
groupsort_indexer(labels, ngroups) + indexer, _counts = groupsort_indexer(ensure_platform_int(labels), ngroups) counts[:] = _counts[1:] data = np.empty((K, N), dtype=np.float64) @@ -1127,18 +1128,40 @@ ctypedef fused groupby_t: @cython.wraparound(False) @cython.boundscheck(False) -def group_max(groupby_t[:, ::1] out, - int64_t[::1] counts, - ndarray[groupby_t, ndim=2] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): +cdef group_min_max(groupby_t[:, ::1] out, + int64_t[::1] counts, + ndarray[groupby_t, ndim=2] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1, + bint compute_max=True): """ - Only aggregates on axis=0 + Compute minimum/maximum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store result in. + counts : int64 array + Input as a zeroed array, populated by group sizes during algorithm + values : array + Values to find column-wise min/max of. + labels : int64 array + Labels to group by. + min_count : Py_ssize_t, default -1 + The minimum number of non-NA group elements, NA result if threshold + is not met + compute_max : bint, default True + True to compute group-wise max, False to compute min + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. + `counts` is modified to hold group sizes """ cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] maxx + Py_ssize_t i, j, N, K, lab, ngroups = len(counts) + groupby_t val, nan_val + ndarray[groupby_t, ndim=2] group_min_or_max bint runtime_error = False int64_t[:, ::1] nobs @@ -1150,18 +1173,17 @@ def group_max(groupby_t[:, ::1] out, min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) - maxx = np.empty_like(out) + group_min_or_max = np.empty_like(out) if groupby_t is int64_t: - # Note: evaluated at compile-time - maxx[:] = -_int64_max + group_min_or_max[:] = -_int64_max if compute_max else _int64_max nan_val = NPY_NAT elif groupby_t is uint64_t: # NB: We do not define nan_val because there is no such thing - # for uint64_t. We carefully avoid having to reference it in this - # case. - maxx[:] = 0 + # for uint64_t. We carefully avoid having to reference it in this + # case. + group_min_or_max[:] = 0 if compute_max else np.iinfo(np.uint64).max else: - maxx[:] = -np.inf + group_min_or_max[:] = -np.inf if compute_max else np.inf nan_val = NAN N, K = (values).shape @@ -1179,20 +1201,23 @@ def group_max(groupby_t[:, ::1] out, if not _treat_as_na(val, True): # TODO: Sure we always want is_datetimelike=True? 
nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val + if compute_max: + if val > group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + else: + if val < group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val - for i in range(ncounts): + for i in range(ngroups): for j in range(K): if nobs[i, j] < min_count: if groupby_t is uint64_t: runtime_error = True break else: - out[i, j] = nan_val else: - out[i, j] = maxx[i, j] + out[i, j] = group_min_or_max[i, j] if runtime_error: # We cannot raise directly above because that is within a nogil @@ -1202,75 +1227,24 @@ def group_max(groupby_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_min(groupby_t[:, ::1] out, +def group_max(groupby_t[:, ::1] out, int64_t[::1] counts, ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] minx - bint runtime_error = False - int64_t[:, ::1] nobs - - # TODO(cython 3.0): - # Instead of `labels.shape[0]` use `len(labels)` - if not len(values) == labels.shape[0]: - raise AssertionError("len(index) != len(labels)") - - min_count = max(min_count, 1) - nobs = np.zeros((out).shape, dtype=np.int64) - - minx = np.empty_like(out) - if groupby_t is int64_t: - minx[:] = _int64_max - nan_val = NPY_NAT - elif groupby_t is uint64_t: - # NB: We do not define nan_val because there is no such thing - # for uint64_t. We carefully avoid having to reference it in this - # case. - minx[:] = np.iinfo(np.uint64).max - else: - minx[:] = np.inf - nan_val = NAN + """See group_min_max.__doc__""" + group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True) - N, K = (values).shape - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] < min_count: - if groupby_t is uint64_t: - runtime_error = True - break - else: - out[i, j] = nan_val - else: - out[i, j] = minx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min(groupby_t[:, ::1] out, + int64_t[::1] counts, + ndarray[groupby_t, ndim=2] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """See group_min_max.__doc__""" + group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False) @cython.boundscheck(False) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 5352ca53e1b54..31b6935e9b2ba 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -455,3 +455,53 @@ def get_blkno_placements(blknos, group: bool = True): for blkno, indexer in get_blkno_indexers(blknos, group): yield blkno, BlockPlacement(indexer) + + +@cython.freelist(64) +cdef class Block: + """ + Defining __init__ in a cython class significantly improves performance. 
+ """ + cdef: + public BlockPlacement _mgr_locs + readonly int ndim + public object values + + def __cinit__(self, values, placement: BlockPlacement, ndim: int): + """ + Parameters + ---------- + values : np.ndarray or ExtensionArray + We assume maybe_coerce_values has already been called. + placement : BlockPlacement + ndim : int + 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame + """ + self._mgr_locs = placement + self.ndim = ndim + self.values = values + + cpdef __reduce__(self): + # We have to do some gymnastics b/c "ndim" is keyword-only + from functools import partial + + from pandas.core.internals.blocks import new_block + + args = (self.values, self.mgr_locs.indexer) + func = partial(new_block, ndim=self.ndim) + return func, args + + cpdef __setstate__(self, state): + from pandas.core.construction import extract_array + + self.mgr_locs = BlockPlacement(state[0]) + self.values = extract_array(state[1], extract_numpy=True) + if len(state) > 2: + # we stored ndim + self.ndim = state[2] + else: + # older pickle + from pandas.core.internals.api import maybe_infer_ndim + + ndim = maybe_infer_ndim(self.values, self.mgr_locs) + self.ndim = ndim diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index c2947de943e1a..7888a15a7cb26 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -21,10 +21,9 @@ from numpy cimport ( cnp.import_array() from pandas._libs.algos import ( - ensure_int64, - ensure_platform_int, groupsort_indexer, take_1d_int64_int64, + take_1d_intp_intp, ) @@ -34,16 +33,16 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, cdef: Py_ssize_t i, j, k, count = 0 ndarray[intp_t] left_sorter, right_sorter - ndarray[int64_t] left_count, right_count - ndarray[int64_t] left_indexer, right_indexer - int64_t lc, rc + ndarray[intp_t] left_count, right_count + ndarray[intp_t] left_indexer, right_indexer + intp_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 Py_ssize_t offset # NA group in location 0 - left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) - right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -58,8 +57,8 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, left_pos = left_count[0] right_pos = right_count[0] - left_indexer = np.empty(count, dtype=np.int64) - right_indexer = np.empty(count, dtype=np.int64) + left_indexer = np.empty(count, dtype=np.intp) + right_indexer = np.empty(count, dtype=np.intp) with nogil: for i in range(1, max_groups + 1): @@ -85,17 +84,17 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count + ndarray[intp_t] left_count, right_count ndarray[intp_t] rev, left_sorter, right_sorter - ndarray[int64_t] left_indexer, right_indexer - int64_t lc, rc + ndarray[intp_t] left_indexer, right_indexer + intp_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 Py_ssize_t offset # NA group in location 0 - left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) - right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) 
with nogil: # First pass, determine size of result set, do not use the NA group @@ -109,8 +108,8 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, left_pos = left_count[0] right_pos = right_count[0] - left_indexer = np.empty(count, dtype=np.int64) - right_indexer = np.empty(count, dtype=np.int64) + left_indexer = np.empty(count, dtype=np.intp) + right_indexer = np.empty(count, dtype=np.intp) with nogil: for i in range(1, max_groups + 1): @@ -142,11 +141,10 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, # this is a short-cut to avoid groupsort_indexer # otherwise, the `else` path also works in this case rev = np.empty(len(left), dtype=np.intp) - rev.put(ensure_platform_int(left_sorter), np.arange(len(left))) + rev.put(left_sorter, np.arange(len(left))) else: rev, _ = groupsort_indexer(left_indexer, len(left)) - rev = ensure_platform_int(rev) right_indexer = right_indexer.take(rev) left_indexer = left_indexer.take(rev) @@ -159,16 +157,16 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, cdef: Py_ssize_t i, j, k, count = 0 ndarray[intp_t] left_sorter, right_sorter - ndarray[int64_t] left_count, right_count - ndarray[int64_t] left_indexer, right_indexer - int64_t lc, rc - int64_t left_pos = 0, right_pos = 0 + ndarray[intp_t] left_count, right_count + ndarray[intp_t] left_indexer, right_indexer + intp_t lc, rc + intp_t left_pos = 0, right_pos = 0 Py_ssize_t offset, position = 0 # NA group in location 0 - left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups) - right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups) + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) with nogil: # First pass, determine size of result set, do not use the NA group @@ -185,8 +183,8 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, left_pos = left_count[0] right_pos = right_count[0] - left_indexer = np.empty(count, dtype=np.int64) - right_indexer = np.empty(count, dtype=np.int64) + left_indexer = np.empty(count, dtype=np.intp) + right_indexer = np.empty(count, dtype=np.intp) with nogil: for i in range(1, max_groups + 1): @@ -217,31 +215,29 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, _get_result_indexer(right_sorter, right_indexer)) -cdef ndarray[int64_t] _get_result_indexer( - ndarray[intp_t] sorter, ndarray[int64_t] indexer +cdef ndarray[intp_t] _get_result_indexer( + ndarray[intp_t] sorter, ndarray[intp_t] indexer ): if len(sorter) > 0: # cython-only equivalent to # `res = algos.take_nd(sorter, indexer, fill_value=-1)` - res = np.empty(len(indexer), dtype=np.int64) - take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1) - # FIXME: sorter is intp_t, not int64_t, opposite for indexer; - # will this break on 32bit builds? 
+ res = np.empty(len(indexer), dtype=np.intp) + take_1d_intp_intp(sorter, indexer, res, -1) else: # length-0 case - res = np.empty(len(indexer), dtype=np.int64) + res = np.empty(len(indexer), dtype=np.intp) res[:] = -1 return res -def ffill_indexer(const int64_t[:] indexer): +def ffill_indexer(const intp_t[:] indexer): cdef: Py_ssize_t i, n = len(indexer) - ndarray[int64_t] result - int64_t val, last_obs + ndarray[intp_t] result + intp_t val, last_obs - result = np.empty(n, dtype=np.int64) + result = np.empty(n, dtype=np.intp) last_obs = -1 for i in range(n): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1398db6960cc8..3c88590991d77 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1981,9 +1981,9 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ categories = self.categories r, counts = libalgos.groupsort_indexer( - self.codes.astype("int64", copy=False), categories.size + ensure_platform_int(self.codes), categories.size ) - counts = counts.cumsum() + counts = ensure_int64(counts).cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) return dict(zip(categories, _result)) diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 97bffb35c28d9..58da2570015b5 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -8,6 +8,7 @@ import numpy as np from pandas._libs import lib +from pandas._typing import ArrayLike is_bool = lib.is_bool @@ -420,3 +421,28 @@ def is_dataclass(item): return is_dataclass(item) and not isinstance(item, type) except ImportError: return False + + +def is_inferred_bool_dtype(arr: ArrayLike) -> bool: + """ + Check if this is a ndarray[bool] or an ndarray[object] of bool objects. + + Parameters + ---------- + arr : np.ndarray or ExtensionArray + + Returns + ------- + bool + + Notes + ----- + This does not include the special treatment is_bool_dtype uses for + Categorical. + """ + dtype = arr.dtype + if dtype == np.dtype(bool): + return True + elif dtype == np.dtype("object"): + return lib.is_bool_array(arr.ravel("K")) + return False diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e5e7b446d9cb2..094f4a67d2e61 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4154,7 +4154,7 @@ def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray: return np.empty(0, dtype=np.intp) if len(labels) == 1: - return get_group_index_sorter(labels[0]) + return get_group_index_sorter(ensure_platform_int(labels[0])) # find indexers of beginning of each set of # same-key labels w.r.t all but last level @@ -4224,7 +4224,7 @@ def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray: if level == 0: # outer most level, take the fast route ngroups = 1 + new_lev_codes.max() left_indexer, counts = libalgos.groupsort_indexer( - ensure_int64(new_lev_codes), ngroups + new_lev_codes, ngroups ) # missing values are placed first; drop them! 
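The is_inferred_bool_dtype helper added in pandas/core/dtypes/inference.py above is internal API; the calls below are an illustrative sketch of its contract (bool dtype, or object dtype holding only bool objects) rather than a supported public interface.

import numpy as np
from pandas.core.dtypes.inference import is_inferred_bool_dtype  # internal helper

is_inferred_bool_dtype(np.array([True, False]))                # True: bool dtype
is_inferred_bool_dtype(np.array([True, False], dtype=object))  # True: object array of bools
is_inferred_bool_dtype(np.array([True, 1], dtype=object))      # False: mixed object contents
is_inferred_bool_dtype(np.array([1, 0]))                       # False: integer dtype
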
diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index aab8273b1e213..d6b76510c68ab 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -59,13 +59,13 @@ def make_block( if not isinstance(placement, BlockPlacement): placement = BlockPlacement(placement) - ndim = _maybe_infer_ndim(values, placement, ndim) + ndim = maybe_infer_ndim(values, placement, ndim) check_ndim(values, placement, ndim) values = maybe_coerce_values(values) return klass(values, ndim=ndim, placement=placement) -def _maybe_infer_ndim(values, placement: BlockPlacement, ndim: Optional[int]) -> int: +def maybe_infer_ndim(values, placement: BlockPlacement, ndim: Optional[int]) -> int: """ If `ndim` is not provided, infer it from placment and values. """ diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 9da019fb2ef95..99a1706c671b1 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -33,7 +33,6 @@ ) from pandas.core.dtypes.common import ( ensure_int64, - is_bool_dtype, is_datetime64_ns_dtype, is_dtype_equal, is_extension_array_dtype, @@ -50,6 +49,7 @@ ABCPandasArray, ABCSeries, ) +from pandas.core.dtypes.inference import is_inferred_bool_dtype from pandas.core.dtypes.missing import ( array_equals, isna, @@ -676,10 +676,7 @@ def get_bool_data(self, copy: bool = False) -> ArrayManager: copy : bool, default False Whether to copy the blocks """ - return self._get_data_subset( - lambda arr: is_bool_dtype(arr.dtype) - or (is_object_dtype(arr.dtype) and lib.is_bool_array(arr)) - ) + return self._get_data_subset(is_inferred_bool_dtype) def get_numeric_data(self, copy: bool = False) -> ArrayManager: """ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 32cecec01b8be..7d8dcb34ed582 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -13,6 +13,7 @@ Union, cast, ) +import warnings import numpy as np @@ -68,6 +69,7 @@ ABCPandasArray, ABCSeries, ) +from pandas.core.dtypes.inference import is_inferred_bool_dtype from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, @@ -145,7 +147,7 @@ def newfunc(self, *args, **kwargs) -> List[Block]: return cast(F, newfunc) -class Block(PandasObject): +class Block(libinternals.Block, PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas data structure @@ -155,53 +157,13 @@ class Block(PandasObject): values: Union[np.ndarray, ExtensionArray] - __slots__ = ["_mgr_locs", "values", "ndim"] + __slots__ = () is_numeric = False - is_bool = False is_object = False is_extension = False _can_consolidate = True _validate_ndim = True - @classmethod - def _simple_new( - cls, values: ArrayLike, placement: BlockPlacement, ndim: int - ) -> Block: - """ - Fastpath constructor, does *no* validation - """ - obj = object.__new__(cls) - obj.ndim = ndim - obj.values = values - obj._mgr_locs = placement - return obj - - def __init__(self, values, placement: BlockPlacement, ndim: int): - """ - Parameters - ---------- - values : np.ndarray or ExtensionArray - We assume maybe_coerce_values has already been called. 
- placement : BlockPlacement (or castable) - ndim : int - 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame - """ - assert isinstance(ndim, int) - assert isinstance(placement, BlockPlacement) - self.ndim = ndim - self._mgr_locs = placement - self.values = values - - @property - def _holder(self): - """ - The array-like that can hold the underlying values. - - None for 'Block', overridden by subclasses that don't - use an ndarray. - """ - return None - @final @property def _consolidate_key(self): @@ -228,7 +190,22 @@ def _can_hold_na(self) -> bool: @final @property def is_categorical(self) -> bool: - return self._holder is Categorical + warnings.warn( + "Block.is_categorical is deprecated and will be removed in a " + "future version. Use isinstance(block.values, Categorical) " + "instead. See https://github.com/pandas-dev/pandas/issues/40226", + DeprecationWarning, + stacklevel=2, + ) + return isinstance(self.values, Categorical) + + @final + @property + def is_bool(self) -> bool: + """ + We can be bool if a) we are bool dtype or b) object dtype with bool objects. + """ + return is_inferred_bool_dtype(self.values) @final def external_values(self): @@ -279,7 +256,6 @@ def mgr_locs(self) -> BlockPlacement: @mgr_locs.setter def mgr_locs(self, new_mgr_locs: BlockPlacement): - assert isinstance(new_mgr_locs, BlockPlacement) self._mgr_locs = new_mgr_locs @final @@ -324,16 +300,6 @@ def __repr__(self) -> str: def __len__(self) -> int: return len(self.values) - @final - def __getstate__(self): - return self.mgr_locs.indexer, self.values - - @final - def __setstate__(self, state): - self.mgr_locs = libinternals.BlockPlacement(state[0]) - self.values = extract_array(state[1], extract_numpy=True) - self.ndim = self.values.ndim - def _slice(self, slicer): """ return a slice of my values """ @@ -354,7 +320,7 @@ def getitem_block(self, slicer) -> Block: if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") - return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) + return type(self)(new_values, new_mgr_locs, self.ndim) @final def getitem_block_index(self, slicer: slice) -> Block: @@ -366,7 +332,7 @@ def getitem_block_index(self, slicer: slice) -> Block: # error: Invalid index type "Tuple[ellipsis, slice]" for # "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]" new_values = self.values[..., slicer] # type: ignore[index] - return type(self)._simple_new(new_values, self._mgr_locs, ndim=self.ndim) + return type(self)(new_values, self._mgr_locs, ndim=self.ndim) @final def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block: @@ -380,7 +346,7 @@ def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block: if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") - return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) + return type(self)(new_values, new_mgr_locs, self.ndim) @property def shape(self) -> Shape: @@ -798,8 +764,10 @@ def _replace_list( """ See BlockManager._replace_list docstring. 
""" + values = self.values + # TODO: dont special-case Categorical - if self.is_categorical and len(algos.unique(dest_list)) == 1: + if isinstance(values, Categorical) and len(algos.unique(dest_list)) == 1: # We likely got here by tiling value inside NDFrame.replace, # so un-tile here return self.replace(src_list, dest_list[0], inplace, regex) @@ -814,17 +782,17 @@ def _replace_list( src_len = len(pairs) - 1 - if self.is_object: + if values.dtype == _dtype_obj: # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations - mask = ~isna(self.values) + mask = ~isna(values) masks = [ - compare_or_regex_search(self.values, s[0], regex=regex, mask=mask) + compare_or_regex_search(values, s[0], regex=regex, mask=mask) for s in pairs ] else: # GH#38086 faster if we know we dont need to check for regex - masks = [missing.mask_missing(self.values, s[0]) for s in pairs] + masks = [missing.mask_missing(values, s[0]) for s in pairs] # error: Argument 1 to "extract_bool_array" has incompatible type # "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray, @@ -1326,7 +1294,8 @@ def where(self, other, cond, errors="raise") -> List[Block]: if noop: # TODO: avoid the downcasting at the end in this case? - result = values + # GH-39595: Always return a copy + result = values.copy() else: # see if we can operate on the entire block, or need item-by-item # or if we are a single block (ndim == 1) @@ -1503,11 +1472,6 @@ def putmask(self, mask, new) -> List[Block]: new_values[mask] = new return [self.make_block(values=new_values)] - @property - def _holder(self): - # For extension blocks, the holder is values-dependent. - return type(self.values) - @property def is_view(self) -> bool: """Extension arrays are never treated as views.""" @@ -1713,7 +1677,7 @@ def where(self, other, cond, errors="raise") -> List[Block]: # NotImplementedError for class not implementing `__setitem__` # TypeError for SparseArray, which implements just to raise # a TypeError - result = self._holder._from_sequence( + result = type(self.values)._from_sequence( np.where(cond, self.values, other), dtype=dtype ) @@ -1785,10 +1749,6 @@ def _can_hold_element(self, element: Any) -> bool: # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" return can_hold_element(self.dtype, element) # type: ignore[arg-type] - @property - def is_bool(self): - return self.dtype.kind == "b" - class NDArrayBackedExtensionBlock(HybridMixin, Block): """ @@ -1903,10 +1863,6 @@ class DatetimeLikeBlockMixin(NDArrayBackedExtensionBlock): def array_values(self): return ensure_wrapped_if_datetimelike(self.values) - @property - def _holder(self): - return type(self.array_values()) - class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () @@ -1920,7 +1876,7 @@ def set_inplace(self, locs, values): self.values[locs] = values -class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): +class DatetimeTZBlock(ExtensionBlock, DatetimeLikeBlockMixin): """ implement a datetime64 block with a tz attribute """ values: DatetimeArray @@ -1955,14 +1911,6 @@ class ObjectBlock(Block): values: np.ndarray - @property - def is_bool(self): - """ - we can be a bool if we have only bool values but are of type - object - """ - return lib.is_bool_array(self.values.ravel("K")) - @maybe_split def reduce(self, func, ignore_failures: bool = False) -> List[Block]: """ diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index b40e2d90869ec..d5e549ec874da 100644 --- a/pandas/core/internals/concat.py +++ 
b/pandas/core/internals/concat.py @@ -39,6 +39,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import ( + Categorical, DatetimeArray, ExtensionArray, ) @@ -367,7 +368,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: # preserve these for validation in concat_compat return self.block.values - if self.block.is_bool and not self.block.is_categorical: + if self.block.is_bool and not isinstance(self.block.values, Categorical): # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2308f9edb4328..abfd6932d7b21 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1986,7 +1986,7 @@ def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): new_obj.index = _asfreq_compat(obj.index, freq) else: - dti = date_range(obj.index[0], obj.index[-1], freq=freq) + dti = date_range(obj.index.min(), obj.index.max(), freq=freq) dti.name = obj.index.name new_obj = obj.reindex(dti, method=method, fill_value=fill_value) if normalize: diff --git a/pandas/core/series.py b/pandas/core/series.py index 83eb4c38bc163..27042f7de9dc1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3857,7 +3857,8 @@ def explode(self, ignore_index: bool = False) -> Series: dtype: object """ if not len(self) or not is_object_dtype(self): - return self.copy() + result = self.copy() + return result.reset_index(drop=True) if ignore_index else result values, counts = reshape.explode(np.asarray(self._values)) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 88fcc13502439..02c41538ca123 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -604,7 +604,7 @@ def get_group_index_sorter( (alpha + beta * ngroups) < (count * np.log(count)) # type: ignore[operator] ) if do_groupsort: - sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups) + sorter, _ = algos.groupsort_indexer(ensure_platform_int(group_index), ngroups) # sorter _should_ already be intp, but mypy is not yet able to verify else: sorter = group_index.argsort(kind="mergesort") diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 282a0dba8ac03..9250d861740fc 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -48,6 +48,7 @@ from pandas.core.indexes.api import Index jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") +from markupsafe import escape as escape_func # markupsafe is jinja2 dependency BaseFormatter = Union[str, Callable] ExtFormatter = Union[BaseFormatter, Dict[Any, Optional[BaseFormatter]]] @@ -113,6 +114,12 @@ class Styler: .. versionadded:: 1.2.0 + escape : bool, default False + Replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` in cell display + strings with HTML-safe sequences. + + ... 
versionadded:: 1.3.0 + Attributes ---------- env : Jinja2 jinja2.Environment @@ -169,6 +176,7 @@ def __init__( cell_ids: bool = True, na_rep: Optional[str] = None, uuid_len: int = 5, + escape: bool = False, ): # validate ordered args if isinstance(data, pd.Series): @@ -202,7 +210,7 @@ def __init__( ] = defaultdict(lambda: partial(_default_formatter, precision=def_precision)) self.precision = precision # can be removed on set_precision depr cycle self.na_rep = na_rep # can be removed on set_na_rep depr cycle - self.format(formatter=None, precision=precision, na_rep=na_rep) + self.format(formatter=None, precision=precision, na_rep=na_rep, escape=escape) def _repr_html_(self) -> str: """ @@ -544,6 +552,7 @@ def format( subset: Optional[Union[slice, Sequence[Any]]] = None, na_rep: Optional[str] = None, precision: Optional[int] = None, + escape: bool = False, ) -> Styler: """ Format the text display value of cells. @@ -567,6 +576,12 @@ def format( .. versionadded:: 1.3.0 + escape : bool, default False + Replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` in cell display + string with HTML-safe sequences. Escaping is done before ``formatter``. + + .. versionadded:: 1.3.0 + Returns ------- self : Styler @@ -606,7 +621,7 @@ def format( 0 MISS 1.000 A 1 2.000 MISS 3.000 - Using a format specification on consistent column dtypes + Using a ``formatter`` specification on consistent column dtypes >>> df.style.format('{:.2f}', na_rep='MISS', subset=[0,1]) 0 1 2 @@ -629,15 +644,34 @@ def format( 0 MISS 1.00 A 1 2.0 PASS 3.00 - Using a callable formatting function + Using a callable ``formatter`` function. >>> func = lambda s: 'STRING' if isinstance(s, str) else 'FLOAT' >>> df.style.format({0: '{:.1f}', 2: func}, precision=4, na_rep='MISS') 0 1 2 0 MISS 1.0000 STRING 1 2.0 MISS FLOAT - """ - if all((formatter is None, subset is None, precision is None, na_rep is None)): + + Using a ``formatter`` with HTML ``escape`` and ``na_rep``. + + >>> df = pd.DataFrame([['
', '"A&B"', None]]) + >>> s = df.style.format('{0}', escape=True, na_rep="NA") + >>> s.render() + ... + <div></div> + "A&B" + NA + ... + """ + if all( + ( + formatter is None, + subset is None, + precision is None, + na_rep is None, + escape is False, + ) + ): self._display_funcs.clear() return self # clear the formatter / revert to default and avoid looping @@ -655,7 +689,7 @@ def format( except KeyError: format_func = None format_func = _maybe_wrap_formatter( - format_func, na_rep=na_rep, precision=precision + format_func, na_rep=na_rep, precision=precision, escape=escape ) for row, value in data[[col]].itertuples(): @@ -2192,6 +2226,7 @@ def _maybe_wrap_formatter( formatter: Optional[BaseFormatter] = None, na_rep: Optional[str] = None, precision: Optional[int] = None, + escape: bool = False, ) -> Callable: """ Allows formatters to be expressed as str, callable or None, where None returns @@ -2208,10 +2243,19 @@ def _maybe_wrap_formatter( else: raise TypeError(f"'formatter' expected str or callable, got {type(formatter)}") + def _str_escape(x, escape: bool): + """if escaping: only use on str, else return input""" + if escape and isinstance(x, str): + return escape_func(x) + else: + return x + + display_func = lambda x: formatter_func(partial(_str_escape, escape=escape)(x)) + if na_rep is None: - return formatter_func + return display_func else: - return lambda x: na_rep if pd.isna(x) else formatter_func(x) + return lambda x: na_rep if pd.isna(x) else display_func(x) def _maybe_convert_css_to_tuples(style: CSSProperties) -> CSSList: diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index bc84d7c70b01c..574fa46d10f67 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -692,3 +692,20 @@ def test_where_try_cast_deprecated(frame_or_series): with tm.assert_produces_warning(FutureWarning): # try_cast keyword deprecated obj.where(mask, -1, try_cast=False) + + +def test_where_copies_with_noop(frame_or_series): + # GH-39595 + result = frame_or_series([1, 2, 3, 4]) + expected = result.copy() + col = result[0] if frame_or_series is DataFrame else result + + where_res = result.where(col < 5) + where_res *= 2 + + tm.assert_equal(result, expected) + + where_res = result.where(col > 5, [1, 2, 3, 4]) + where_res *= 2 + + tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index 8a32841466b18..0d28af5ed7be9 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -91,3 +91,15 @@ def test_asfreq_with_date_object_index(self, frame_or_series): result = ts2.asfreq("4H", method="ffill") expected = ts.asfreq("4H", method="ffill") tm.assert_equal(result, expected) + + def test_asfreq_with_unsorted_index(self, frame_or_series): + # GH#39805 + # Test that rows are not dropped when the datetime index is out of order + index = to_datetime(["2021-01-04", "2021-01-02", "2021-01-03", "2021-01-01"]) + result = frame_or_series(range(4), index=index) + + expected = result.reindex(sorted(index)) + expected.index = expected.index._with_freq("infer") + + result = result.asfreq("D") + tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index d24320ad17709..672ab20fb9791 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1163,6 +1163,9 @@ def test_any_all_object_bool_only(self): 
df._consolidate_inplace() df["C"] = Series([True, True]) + # Categorical of bools is _not_ considered booly + df["D"] = df["C"].astype("category") + # The underlying bug is in DataFrame._get_bool_data, so we check # that while we're here res = df._get_bool_data() diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ef1c3ec0c2860..fc06b85b1f954 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -27,11 +27,7 @@ ) import pandas._testing as tm import pandas.core.algorithms as algos -from pandas.core.arrays import ( - DatetimeArray, - SparseArray, - TimedeltaArray, -) +from pandas.core.arrays import SparseArray from pandas.core.internals import ( BlockManager, SingleBlockManager, @@ -320,6 +316,12 @@ def test_split(self): for res, exp in zip(result, expected): assert_block_equal(res, exp) + def test_is_categorical_deprecated(self): + # GH#40571 + blk = self.fblock + with tm.assert_produces_warning(DeprecationWarning): + blk.is_categorical + class TestBlockManager: def test_attrs(self): @@ -1302,21 +1304,6 @@ def test_should_store_categorical(self): assert not blk.should_store(np.asarray(cat)) -@pytest.mark.parametrize( - "typestr, holder", - [ - ("category", Categorical), - ("M8[ns]", DatetimeArray), - ("M8[ns, US/Central]", DatetimeArray), - ("m8[ns]", TimedeltaArray), - ("sparse", SparseArray), - ], -) -def test_holder(typestr, holder, block_maker): - blk = create_block(typestr, [1], maker=block_maker) - assert blk._holder is holder - - def test_validate_ndim(block_maker): values = np.array([1.0, 2.0]) placement = slice(2) diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index a377074e5484e..d5b6724fd15e6 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -648,6 +648,36 @@ def test_format_clear(self): self.styler.format() assert (0, 0) not in self.styler._display_funcs # formatter cleared to default + def test_format_escape(self): + df = DataFrame([['<>&"']]) + s = Styler(df, uuid_len=0).format("X&{0}>X", escape=False) + expected = 'X&<>&">X' + assert expected in s.render() + + # only the value should be escaped before passing to the formatter + s = Styler(df, uuid_len=0).format("X&{0}>X", escape=True) + ex = 'X&<>&">X' + assert ex in s.render() + + def test_format_escape_na_rep(self): + # tests the na_rep is not escaped + df = DataFrame([['<>&"', None]]) + s = Styler(df, uuid_len=0).format("X&{0}>X", escape=True, na_rep="&") + ex = 'X&<>&">X' + expected2 = '&' + assert ex in s.render() + assert expected2 in s.render() + + def test_format_escape_floats(self): + # test given formatter for number format is not impacted by escape + s = self.df.style.format("{:.1f}", escape=True) + for expected in [">0.0<", ">1.0<", ">-1.2<", ">-0.6<"]: + assert expected in s.render() + # tests precision of floats is not impacted by escape + s = self.df.style.format(precision=1, escape=True) + for expected in [">0<", ">1<", ">-1.2<", ">-0.6<"]: + assert expected in s.render() + def test_nonunique_raises(self): df = DataFrame([[1, 2]], columns=["A", "A"]) msg = "style is not supported for non-unique indices." 
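As a usage sketch of the escape keyword exercised by the tests above (using the bool signature added in this patch; the exact rendered markup is omitted), escaping is applied to cell values before the formatter runs, while na_rep is emitted as-is:

import pandas as pd

df = pd.DataFrame([['<div>"A&B"</div>', None]])
styler = df.style.format("{0}", escape=True, na_rep="NA")
html = styler.render()
# The first cell is rendered with &lt;, &gt;, &amp; and &quot; entities;
# the missing value is rendered literally as NA.
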
diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py index f5426c71511bb..eeb66f8941260 100644 --- a/pandas/tests/libs/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -264,8 +264,8 @@ def test_left_outer_join_bug(): lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False) - exp_lidx = np.arange(len(left), dtype=np.int64) - exp_ridx = -np.ones(len(left), dtype=np.int64) + exp_lidx = np.arange(len(left), dtype=np.intp) + exp_ridx = -np.ones(len(left), dtype=np.intp) exp_ridx[left == 1] = 1 exp_ridx[left == 3] = 0 diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index 1f0fbd1cc5ecb..c73737dad89aa 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -134,3 +134,11 @@ def test_explode_sets(): result = s.explode().sort_values() expected = pd.Series(["a", "b", "c"], index=[1, 1, 1]) tm.assert_series_equal(result, expected) + + +def test_explode_scalars_can_ignore_index(): + # https://github.com/pandas-dev/pandas/issues/40487 + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + result = s.explode(ignore_index=True) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c8df18ddaeebe..cd800b3f3a452 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2116,8 +2116,8 @@ def test_is_lexsorted(): def test_groupsort_indexer(): - a = np.random.randint(0, 1000, 100).astype(np.int64) - b = np.random.randint(0, 1000, 100).astype(np.int64) + a = np.random.randint(0, 1000, 100).astype(np.intp) + b = np.random.randint(0, 1000, 100).astype(np.intp) result = libalgos.groupsort_indexer(a, 1000)[0]
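
To close, a brief sketch of how two of the behavior fixes covered by the new tests look from the public API; the inline comments describe the intended post-fix behavior.

import pandas as pd

# GH 40487: Series.explode on scalar values now honors ignore_index=True.
s = pd.Series([1, 2, 3], index=["a", "b", "c"])
s.explode(ignore_index=True)   # index is reset to 0, 1, 2

# GH 39595: where() with an all-True condition returns a copy, so mutating
# the result no longer writes back into the original object.
df = pd.DataFrame([1, 2, 3, 4])
res = df.where(df[0] < 5)      # condition holds everywhere
res *= 2                       # df is left unchanged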