Skip to content

Commit 173f5e1

Browse files
authored
Merge pull request #154 from pandas-dev/master
Sync Fork from Upstream Repo
2 parents 30aa926 + 1f16f49 commit 173f5e1

26 files changed

+469
-329
lines changed

doc/source/whatsnew/v1.2.4.rst

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Fixed regressions
1717

1818
- Fixed regression in :meth:`DataFrame.sum` raising a ``ValueError`` when ``min_count`` greater than the :class:`DataFrame` shape was passed (:issue:`39738`)
1919
- Fixed regression in :meth:`DataFrame.to_json` raising ``AttributeError`` when run on PyPy (:issue:`39837`)
20+
- Fixed regression in :meth:`DataFrame.where` not returning a copy in the case of an all True condition (:issue:`39595`)
2021
-
2122

2223
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.3.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ Other enhancements
136136
- :meth:`.Styler.set_table_styles` amended to optionally allow certain css-string input arguments (:issue:`39564`)
137137
- :meth:`.Styler.apply` now more consistently accepts ndarray function returns, i.e. in all cases where ``axis`` is ``0``, ``1`` or ``None`` (:issue:`39359`)
138138
- :meth:`.Styler.apply` and :meth:`.Styler.applymap` now raise errors if incorrectly formatted CSS is passed on render (:issue:`39660`)
139+
- :meth:`.Styler.format` adds keyword argument ``escape`` for optional HTML escaping (:issue:`40437`)
139140
- Builtin highlighting methods in :class:`Styler` have a more consistent signature and css customisability (:issue:`40242`)
140141
- :meth:`Series.loc.__getitem__` and :meth:`Series.loc.__setitem__` with :class:`MultiIndex` now raise a helpful error message when the indexer has too many dimensions (:issue:`35349`)
141142
- :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
@@ -631,6 +632,7 @@ Groupby/resample/rolling
631632
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would incorrectly raise a ``ValueError`` when providing ``times`` (:issue:`40164`)
632633
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`)
633634
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
635+
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
634636

635637
Reshaping
636638
^^^^^^^^^
@@ -649,6 +651,7 @@ Reshaping
649651
- Allow :class:`Index` to be passed to the :func:`numpy.all` function (:issue:`40180`)
650652
- Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`)
651653
- Bug in :func:`to_datetime` raising error when input sequence contains unhashable items (:issue:`39756`)
654+
- Bug in :meth:`Series.explode` preserving index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`)
652655

653656
Sparse
654657
^^^^^^

pandas/_libs/algos.pyx

+89-73
Large diffs are not rendered by default.

pandas/_libs/algos_take_helper.pxi.in

+26
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,32 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
88
# take_1d, take_2d
99
# ----------------------------------------------------------------------
1010

11+
12+
@cython.wraparound(False)
13+
@cython.boundscheck(False)
14+
def take_1d_intp_intp(
15+
const intp_t[:] values,
16+
const intp_t[:] indexer,
17+
intp_t[::1] out,
18+
intp_t fill_value=-1,
19+
):
20+
cdef:
21+
Py_ssize_t i, n, idx
22+
intp_t fv
23+
24+
n = indexer.shape[0]
25+
26+
fv = fill_value
27+
28+
with nogil:
29+
for i in range(n):
30+
idx = indexer[i]
31+
if idx == -1:
32+
out[i] = fv
33+
else:
34+
out[i] = values[idx]
35+
36+
1137
{{py:
1238

1339
# c_type_in, c_type_out

pandas/_libs/groupby.pyx

+60-86
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ from pandas._libs.util cimport (
3737
)
3838

3939
from pandas._libs.algos import (
40+
ensure_platform_int,
4041
groupsort_indexer,
4142
rank_1d,
4243
take_2d_axis1_float64_float64,
@@ -111,7 +112,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
111112
"""
112113
cdef:
113114
Py_ssize_t i, j, N, K, ngroups, size
114-
ndarray[int64_t] _counts
115+
ndarray[intp_t] _counts
115116
ndarray[float64_t, ndim=2] data
116117
ndarray[intp_t] indexer
117118
float64_t* ptr
@@ -121,7 +122,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
121122
ngroups = len(counts)
122123
N, K = (<object>values).shape
123124

124-
indexer, _counts = groupsort_indexer(labels, ngroups)
125+
indexer, _counts = groupsort_indexer(ensure_platform_int(labels), ngroups)
125126
counts[:] = _counts[1:]
126127

127128
data = np.empty((K, N), dtype=np.float64)
@@ -1127,18 +1128,40 @@ ctypedef fused groupby_t:
11271128

11281129
@cython.wraparound(False)
11291130
@cython.boundscheck(False)
1130-
def group_max(groupby_t[:, ::1] out,
1131-
int64_t[::1] counts,
1132-
ndarray[groupby_t, ndim=2] values,
1133-
const int64_t[:] labels,
1134-
Py_ssize_t min_count=-1):
1131+
cdef group_min_max(groupby_t[:, ::1] out,
1132+
int64_t[::1] counts,
1133+
ndarray[groupby_t, ndim=2] values,
1134+
const int64_t[:] labels,
1135+
Py_ssize_t min_count=-1,
1136+
bint compute_max=True):
11351137
"""
1136-
Only aggregates on axis=0
1138+
Compute minimum/maximum of columns of `values`, in row groups `labels`.
1139+
1140+
Parameters
1141+
----------
1142+
out : array
1143+
Array to store result in.
1144+
counts : int64 array
1145+
Input as a zeroed array, populated by group sizes during algorithm
1146+
values : array
1147+
Values to find column-wise min/max of.
1148+
labels : int64 array
1149+
Labels to group by.
1150+
min_count : Py_ssize_t, default -1
1151+
The minimum number of non-NA group elements, NA result if threshold
1152+
is not met
1153+
compute_max : bint, default True
1154+
True to compute group-wise max, False to compute min
1155+
1156+
Notes
1157+
-----
1158+
This method modifies the `out` parameter, rather than returning an object.
1159+
`counts` is modified to hold group sizes
11371160
"""
11381161
cdef:
1139-
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
1140-
groupby_t val, count, nan_val
1141-
ndarray[groupby_t, ndim=2] maxx
1162+
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
1163+
groupby_t val, nan_val
1164+
ndarray[groupby_t, ndim=2] group_min_or_max
11421165
bint runtime_error = False
11431166
int64_t[:, ::1] nobs
11441167

@@ -1150,18 +1173,17 @@ def group_max(groupby_t[:, ::1] out,
11501173
min_count = max(min_count, 1)
11511174
nobs = np.zeros((<object>out).shape, dtype=np.int64)
11521175

1153-
maxx = np.empty_like(out)
1176+
group_min_or_max = np.empty_like(out)
11541177
if groupby_t is int64_t:
1155-
# Note: evaluated at compile-time
1156-
maxx[:] = -_int64_max
1178+
group_min_or_max[:] = -_int64_max if compute_max else _int64_max
11571179
nan_val = NPY_NAT
11581180
elif groupby_t is uint64_t:
11591181
# NB: We do not define nan_val because there is no such thing
1160-
# for uint64_t. We carefully avoid having to reference it in this
1161-
# case.
1162-
maxx[:] = 0
1182+
# for uint64_t. We carefully avoid having to reference it in this
1183+
# case.
1184+
group_min_or_max[:] = 0 if compute_max else np.iinfo(np.uint64).max
11631185
else:
1164-
maxx[:] = -np.inf
1186+
group_min_or_max[:] = -np.inf if compute_max else np.inf
11651187
nan_val = NAN
11661188

11671189
N, K = (<object>values).shape
@@ -1179,20 +1201,23 @@ def group_max(groupby_t[:, ::1] out,
11791201
if not _treat_as_na(val, True):
11801202
# TODO: Sure we always want is_datetimelike=True?
11811203
nobs[lab, j] += 1
1182-
if val > maxx[lab, j]:
1183-
maxx[lab, j] = val
1204+
if compute_max:
1205+
if val > group_min_or_max[lab, j]:
1206+
group_min_or_max[lab, j] = val
1207+
else:
1208+
if val < group_min_or_max[lab, j]:
1209+
group_min_or_max[lab, j] = val
11841210

1185-
for i in range(ncounts):
1211+
for i in range(ngroups):
11861212
for j in range(K):
11871213
if nobs[i, j] < min_count:
11881214
if groupby_t is uint64_t:
11891215
runtime_error = True
11901216
break
11911217
else:
1192-
11931218
out[i, j] = nan_val
11941219
else:
1195-
out[i, j] = maxx[i, j]
1220+
out[i, j] = group_min_or_max[i, j]
11961221

11971222
if runtime_error:
11981223
# We cannot raise directly above because that is within a nogil
@@ -1202,75 +1227,24 @@ def group_max(groupby_t[:, ::1] out,
12021227

12031228
@cython.wraparound(False)
12041229
@cython.boundscheck(False)
1205-
def group_min(groupby_t[:, ::1] out,
1230+
def group_max(groupby_t[:, ::1] out,
12061231
int64_t[::1] counts,
12071232
ndarray[groupby_t, ndim=2] values,
12081233
const int64_t[:] labels,
12091234
Py_ssize_t min_count=-1):
1210-
"""
1211-
Only aggregates on axis=0
1212-
"""
1213-
cdef:
1214-
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
1215-
groupby_t val, count, nan_val
1216-
ndarray[groupby_t, ndim=2] minx
1217-
bint runtime_error = False
1218-
int64_t[:, ::1] nobs
1219-
1220-
# TODO(cython 3.0):
1221-
# Instead of `labels.shape[0]` use `len(labels)`
1222-
if not len(values) == labels.shape[0]:
1223-
raise AssertionError("len(index) != len(labels)")
1224-
1225-
min_count = max(min_count, 1)
1226-
nobs = np.zeros((<object>out).shape, dtype=np.int64)
1227-
1228-
minx = np.empty_like(out)
1229-
if groupby_t is int64_t:
1230-
minx[:] = _int64_max
1231-
nan_val = NPY_NAT
1232-
elif groupby_t is uint64_t:
1233-
# NB: We do not define nan_val because there is no such thing
1234-
# for uint64_t. We carefully avoid having to reference it in this
1235-
# case.
1236-
minx[:] = np.iinfo(np.uint64).max
1237-
else:
1238-
minx[:] = np.inf
1239-
nan_val = NAN
1235+
"""See group_min_max.__doc__"""
1236+
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True)
12401237

1241-
N, K = (<object>values).shape
12421238

1243-
with nogil:
1244-
for i in range(N):
1245-
lab = labels[i]
1246-
if lab < 0:
1247-
continue
1248-
1249-
counts[lab] += 1
1250-
for j in range(K):
1251-
val = values[i, j]
1252-
1253-
if not _treat_as_na(val, True):
1254-
# TODO: Sure we always want is_datetimelike=True?
1255-
nobs[lab, j] += 1
1256-
if val < minx[lab, j]:
1257-
minx[lab, j] = val
1258-
1259-
for i in range(ncounts):
1260-
for j in range(K):
1261-
if nobs[i, j] < min_count:
1262-
if groupby_t is uint64_t:
1263-
runtime_error = True
1264-
break
1265-
else:
1266-
out[i, j] = nan_val
1267-
else:
1268-
out[i, j] = minx[i, j]
1269-
1270-
if runtime_error:
1271-
# We cannot raise directly above because that is within a nogil
1272-
# block.
1273-
raise RuntimeError("empty group with uint64_t")
1239+
@cython.wraparound(False)
1240+
@cython.boundscheck(False)
1241+
def group_min(groupby_t[:, ::1] out,
1242+
int64_t[::1] counts,
1243+
ndarray[groupby_t, ndim=2] values,
1244+
const int64_t[:] labels,
1245+
Py_ssize_t min_count=-1):
1246+
"""See group_min_max.__doc__"""
1247+
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False)
12741248

12751249

12761250
@cython.boundscheck(False)

pandas/_libs/internals.pyx

+50
Original file line numberDiff line numberDiff line change
@@ -455,3 +455,53 @@ def get_blkno_placements(blknos, group: bool = True):
455455

456456
for blkno, indexer in get_blkno_indexers(blknos, group):
457457
yield blkno, BlockPlacement(indexer)
458+
459+
460+
@cython.freelist(64)
461+
cdef class Block:
462+
"""
463+
Defining __init__ in a cython class significantly improves performance.
464+
"""
465+
cdef:
466+
public BlockPlacement _mgr_locs
467+
readonly int ndim
468+
public object values
469+
470+
def __cinit__(self, values, placement: BlockPlacement, ndim: int):
471+
"""
472+
Parameters
473+
----------
474+
values : np.ndarray or ExtensionArray
475+
We assume maybe_coerce_values has already been called.
476+
placement : BlockPlacement
477+
ndim : int
478+
1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
479+
"""
480+
self._mgr_locs = placement
481+
self.ndim = ndim
482+
self.values = values
483+
484+
cpdef __reduce__(self):
485+
# We have to do some gymnastics b/c "ndim" is keyword-only
486+
from functools import partial
487+
488+
from pandas.core.internals.blocks import new_block
489+
490+
args = (self.values, self.mgr_locs.indexer)
491+
func = partial(new_block, ndim=self.ndim)
492+
return func, args
493+
494+
cpdef __setstate__(self, state):
495+
from pandas.core.construction import extract_array
496+
497+
self.mgr_locs = BlockPlacement(state[0])
498+
self.values = extract_array(state[1], extract_numpy=True)
499+
if len(state) > 2:
500+
# we stored ndim
501+
self.ndim = state[2]
502+
else:
503+
# older pickle
504+
from pandas.core.internals.api import maybe_infer_ndim
505+
506+
ndim = maybe_infer_ndim(self.values, self.mgr_locs)
507+
self.ndim = ndim

0 commit comments

Comments
 (0)