
Commit a15bd69

Merge branch 'main' into fix-scatter-norm-keyword

2 parents 0da0941 + 017a645

31 files changed, +521 -396 lines

.github/workflows/sdist.yml (+2)

@@ -9,11 +9,13 @@ on:
     branches:
       - main
       - 1.4.x
+    types: [labeled, opened, synchronize, reopened]
   paths-ignore:
     - "doc/**"
 
 jobs:
   build:
+    if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
     runs-on: ubuntu-latest
     timeout-minutes: 60
     defaults:

asv_bench/benchmarks/indexing.py (+61, -21)

@@ -13,7 +13,6 @@
     CategoricalIndex,
     DataFrame,
     Float64Index,
-    IndexSlice,
     Int64Index,
     IntervalIndex,
     MultiIndex,

@@ -200,28 +199,69 @@ def time_take(self, index):
 
 
 class MultiIndexing:
-    def setup(self):
-        mi = MultiIndex.from_product([range(1000), range(1000)])
-        self.s = Series(np.random.randn(1000000), index=mi)
-        self.df = DataFrame(self.s)
-
-        n = 100000
-        with warnings.catch_warnings(record=True):
-            self.mdt = DataFrame(
-                {
-                    "A": np.random.choice(range(10000, 45000, 1000), n),
-                    "B": np.random.choice(range(10, 400), n),
-                    "C": np.random.choice(range(1, 150), n),
-                    "D": np.random.choice(range(10000, 45000), n),
-                    "x": np.random.choice(range(400), n),
-                    "y": np.random.choice(range(25), n),
-                }
-            )
-        self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000]
-        self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index()
+    params = [True, False]
+    param_names = ["unique_levels"]
+
+    def setup(self, unique_levels):
+        self.ndim = 2
+        if unique_levels:
+            mi = MultiIndex.from_arrays([range(1000000)] * self.ndim)
+        else:
+            mi = MultiIndex.from_product([range(1000)] * self.ndim)
+        self.df = DataFrame(np.random.randn(len(mi)), index=mi)
+
+        self.tgt_slice = slice(200, 800)
+        self.tgt_null_slice = slice(None)
+        self.tgt_list = list(range(0, 1000, 10))
+        self.tgt_scalar = 500
+
+        bool_indexer = np.zeros(len(mi), dtype=np.bool_)
+        bool_indexer[slice(0, len(mi), 100)] = True
+        self.tgt_bool_indexer = bool_indexer
+
+    def time_loc_partial_key_slice(self, unique_levels):
+        self.df.loc[self.tgt_slice, :]
+
+    def time_loc_partial_key_null_slice(self, unique_levels):
+        self.df.loc[self.tgt_null_slice, :]
+
+    def time_loc_partial_key_list(self, unique_levels):
+        self.df.loc[self.tgt_list, :]
+
+    def time_loc_partial_key_scalar(self, unique_levels):
+        self.df.loc[self.tgt_scalar, :]
+
+    def time_loc_partial_bool_indexer(self, unique_levels):
+        self.df.loc[self.tgt_bool_indexer, :]
+
+    def time_loc_all_slices(self, unique_levels):
+        target = tuple([self.tgt_slice] * self.ndim)
+        self.df.loc[target, :]
+
+    def time_loc_all_null_slices(self, unique_levels):
+        target = tuple([self.tgt_null_slice] * self.ndim)
+        self.df.loc[target, :]
+
+    def time_loc_all_lists(self, unique_levels):
+        target = tuple([self.tgt_list] * self.ndim)
+        self.df.loc[target, :]
+
+    def time_loc_all_scalars(self, unique_levels):
+        target = tuple([self.tgt_scalar] * self.ndim)
+        self.df.loc[target, :]
+
+    def time_loc_all_bool_indexers(self, unique_levels):
+        target = tuple([self.tgt_bool_indexer] * self.ndim)
+        self.df.loc[target, :]
+
+    def time_loc_slice_plus_null_slice(self, unique_levels):
+        target = (self.tgt_slice, self.tgt_null_slice)
+        self.df.loc[target, :]
 
-    def time_index_slice(self):
-        self.mdt.loc[self.idx, :]
+    def time_loc_null_slice_plus_slice(self, unique_levels):
+        target = (self.tgt_null_slice, self.tgt_slice)
+        self.df.loc[target, :]
 
 
 class IntervalIndexing:
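
The rewritten benchmark is parameterized over unique_levels, presumably because .loc can take different paths for a MultiIndex whose level combinations are all unique versus one built as a product with repeated level values. A minimal standalone sketch of the two index shapes and one of the timed lookups, mirroring the benchmark with plain pandas API rather than adding anything to it:

    import numpy as np
    from pandas import DataFrame, MultiIndex

    # unique_levels=True: one distinct (i, i) key per row
    mi_unique = MultiIndex.from_arrays([range(1_000_000)] * 2)
    # unique_levels=False: 1000 x 1000 product, each level value repeats
    mi_product = MultiIndex.from_product([range(1000)] * 2)

    df = DataFrame(np.random.randn(len(mi_product)), index=mi_product)
    # the partial-key lookup exercised by time_loc_partial_key_slice
    df.loc[slice(200, 800), :]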

ci/run_tests.sh (-4)

@@ -30,10 +30,6 @@ if [[ "$PATTERN" ]]; then
   PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\""
 fi
 
-if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then
-  PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/plotting/"
-fi
-
 echo $PYTEST_CMD
 sh -c "$PYTEST_CMD"

doc/source/development/contributing_environment.rst (+18, -9)

@@ -26,14 +26,28 @@ with a full pandas development environment.
 
 **Docker Commands**
 
-Pass your GitHub username in the ``DockerFile`` to use your own fork::
+Build the Docker image::
 
     # Build the image pandas-yourname-env
     docker build --tag pandas-yourname-env .
-    # Run a container and bind your local forked repo, pandas-yourname, to the container
-    docker run -it --rm -v path-to-pandas-yourname:/home/pandas-yourname pandas-yourname-env
+    # Or build the image by passing your GitHub username to use your own fork
+    docker build --build-arg gh_username=yourname --tag pandas-yourname-env .
 
-Even easier, you can integrate Docker with the following IDEs:
+Run Container::
+
+    # Run a container and bind your local repo to the container
+    docker run -it -w /home/pandas --rm -v path-to-local-pandas-repo:/home/pandas pandas-yourname-env
+
+.. note::
+    If you bind your local repo for the first time, you have to build the C extensions afterwards.
+    Run the following command inside the container::
+
+        python setup.py build_ext -j 4
+
+    You need to rebuild the C extensions anytime the Cython code in ``pandas/_libs`` changes.
+    This most frequently occurs when changing or merging branches.
+
+*Even easier, you can integrate Docker with the following IDEs:*
 
 **Visual Studio Code**

@@ -47,11 +61,6 @@ Enable Docker support and use the Services tool window to build and manage images as well as
 run and interact with containers.
 See https://www.jetbrains.com/help/pycharm/docker.html for details.
 
-Note that you might need to rebuild the C extensions if/when you merge with upstream/main using::
-
-    python setup.py build_ext -j 4
-
-
 Creating an environment without Docker
 ---------------------------------------

doc/source/whatsnew/v1.5.0.rst (+1, -1)

@@ -255,7 +255,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.duplicated` when subset consists of only one column (:issue:`45236`)
 - Performance improvement in :meth:`.GroupBy.transform` when broadcasting values for user-defined functions (:issue:`45708`)
 - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`)
-- Performance improvement in :meth:`MultiIndex.get_locs` (:issue:`45681`)
+- Performance improvement in :meth:`MultiIndex.get_locs` (:issue:`45681`, :issue:`46040`)
 - Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)
 - Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)

pandas/_libs/groupby.pyi (+2)

@@ -111,6 +111,8 @@ def group_nth(
     counts: np.ndarray,  # int64_t[::1]
     values: np.ndarray,  # ndarray[rank_t, ndim=2]
     labels: np.ndarray,  # const int64_t[:]
+    mask: npt.NDArray[np.bool_] | None,
+    result_mask: npt.NDArray[np.bool_] | None,
     min_count: int = ...,  # int64_t
     rank: int = ...,  # int64_t
 ) -> None: ...

pandas/_libs/groupby.pyx (+20, -3)

@@ -1066,6 +1066,8 @@ def group_nth(iu_64_floating_obj_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[iu_64_floating_obj_t, ndim=2] values,
               const intp_t[::1] labels,
+              const uint8_t[:, :] mask,
+              uint8_t[:, ::1] result_mask=None,
               int64_t min_count=-1,
               int64_t rank=1,
               ) -> None:

@@ -1078,6 +1080,8 @@ def group_nth(iu_64_floating_obj_t[:, ::1] out,
         ndarray[iu_64_floating_obj_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
         bint runtime_error = False
+        bint uses_mask = mask is not None
+        bint isna_entry
 
     # TODO(cython3):
     # Instead of `labels.shape[0]` use `len(labels)`

@@ -1104,7 +1108,12 @@ def group_nth(iu_64_floating_obj_t[:, ::1] out,
             for j in range(K):
                 val = values[i, j]
 
-                if not checknull(val):
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                else:
+                    isna_entry = checknull(val)
+
+                if not isna_entry:
                     # NB: use _treat_as_na here once
                     # conditional-nogil is available.
                     nobs[lab, j] += 1

@@ -1129,16 +1138,24 @@ def group_nth(iu_64_floating_obj_t[:, ::1] out,
             for j in range(K):
                 val = values[i, j]
 
-                if not _treat_as_na(val, True):
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                else:
+                    isna_entry = _treat_as_na(val, True)
                     # TODO: Sure we always want is_datetimelike=True?
+
+                if not isna_entry:
                     nobs[lab, j] += 1
                     if nobs[lab, j] == rank:
                         resx[lab, j] = val
 
     for i in range(ncounts):
         for j in range(K):
             if nobs[i, j] < min_count:
-                if iu_64_floating_obj_t is int64_t:
+                if uses_mask:
+                    result_mask[i, j] = True
+                elif iu_64_floating_obj_t is int64_t:
+                    # TODO: only if datetimelike?
                     out[i, j] = NPY_NAT
                 elif iu_64_floating_obj_t is uint64_t:
                     runtime_error = True

pandas/core/groupby/ops.py (+2, -2)

@@ -140,7 +140,7 @@ def __init__(self, kind: str, how: str):
 
     # "group_any" and "group_all" are also support masks, but don't go
    # through WrappedCythonOp
-    _MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax", "min", "max", "last"}
+    _MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax", "min", "max", "last", "first"}
 
     _cython_arity = {"ohlc": 4}  # OHLC

@@ -532,7 +532,7 @@ def _call_cython_op(
                     result_mask=result_mask,
                     is_datetimelike=is_datetimelike,
                 )
-            elif self.how in ["last"]:
+            elif self.how in ["first", "last"]:
                 func(
                     out=result,
                     counts=counts,
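
Together with the group_nth changes above, this routes "first" on masked (nullable) dtypes through the Cython fast path: NA entries are detected via the validity mask instead of per-value checknull, and groups falling below min_count set result_mask rather than writing a sentinel. A minimal sketch of the user-facing behavior this path serves, using only public pandas API:

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "b", "b"],
            "val": pd.array([pd.NA, 1, 2, pd.NA], dtype="Int64"),  # masked dtype
        }
    )

    # groupby-first skips NA values; with Int64 this now hits the masked path
    print(df.groupby("key")["val"].first())
    # key
    # a    1
    # b    2
    # Name: val, dtype: Int64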

pandas/core/indexes/multi.py (+14, -6)

@@ -2762,7 +2762,7 @@ def _partial_tup_index(self, tup: tuple, side="left"):
             if lab not in lev and not isna(lab):
                 # short circuit
                 try:
-                    loc = lev.searchsorted(lab, side=side)
+                    loc = algos.searchsorted(lev, lab, side=side)
                 except TypeError as err:
                     # non-comparable e.g. test_slice_locs_with_type_mismatch
                     raise TypeError(f"Level type mismatch: {lab}") from err

@@ -2771,7 +2771,7 @@ def _partial_tup_index(self, tup: tuple, side="left"):
                     raise TypeError(f"Level type mismatch: {lab}")
                 if side == "right" and loc >= 0:
                     loc -= 1
-                return start + section.searchsorted(loc, side=side)
+                return start + algos.searchsorted(section, loc, side=side)
 
             idx = self._get_loc_single_level_index(lev, lab)
             if isinstance(idx, slice) and k < n - 1:

@@ -2780,13 +2780,21 @@ def _partial_tup_index(self, tup: tuple, side="left"):
                 start = idx.start
                 end = idx.stop
             elif k < n - 1:
-                end = start + section.searchsorted(idx, side="right")
-                start = start + section.searchsorted(idx, side="left")
+                # error: Incompatible types in assignment (expression has type
+                # "Union[ndarray[Any, dtype[signedinteger[Any]]]
+                end = start + algos.searchsorted(  # type: ignore[assignment]
+                    section, idx, side="right"
+                )
+                # error: Incompatible types in assignment (expression has type
+                # "Union[ndarray[Any, dtype[signedinteger[Any]]]
+                start = start + algos.searchsorted(  # type: ignore[assignment]
+                    section, idx, side="left"
+                )
             elif isinstance(idx, slice):
                 idx = idx.start
-                return start + section.searchsorted(idx, side=side)
+                return start + algos.searchsorted(section, idx, side=side)
             else:
-                return start + section.searchsorted(idx, side=side)
+                return start + algos.searchsorted(section, idx, side=side)
 
     def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
         """

pandas/plotting/_core.py (+13, -6)

@@ -1793,12 +1793,19 @@ def _load_backend(backend: str) -> types.ModuleType:
     found_backend = False
 
     eps = entry_points()
-    if "pandas_plotting_backends" in eps:
-        for entry_point in eps["pandas_plotting_backends"]:
-            found_backend = entry_point.name == backend
-            if found_backend:
-                module = entry_point.load()
-                break
+    key = "pandas_plotting_backends"
+    # entry_points lost dict API ~ PY 3.10
+    # https://github.com/python/importlib_metadata/issues/298
+    if hasattr(eps, "select"):
+        # error: "Dict[str, Tuple[EntryPoint, ...]]" has no attribute "select"
+        entry = eps.select(group=key)  # type: ignore[attr-defined]
+    else:
+        entry = eps.get(key, ())
+    for entry_point in entry:
+        found_backend = entry_point.name == backend
+        if found_backend:
+            module = entry_point.load()
+            break
 
     if not found_backend:
         # Fall back to unregistered, module name approach.
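
The same compatibility shim in isolation: importlib.metadata.entry_points() returns a selectable object exposing .select() on Python 3.10+, but a plain dict of group name to entry-point tuples on 3.8/3.9, which is what the hasattr check distinguishes. A standalone sketch; the group name is pandas' real entry-point group, while the helper name is ours:

    from importlib.metadata import entry_points

    def iter_plotting_backends(group="pandas_plotting_backends"):
        eps = entry_points()
        if hasattr(eps, "select"):
            # Python >= 3.10: EntryPoints/SelectableGroups expose .select()
            return eps.select(group=group)
        # Python 3.8/3.9: dict interface, {group_name: (EntryPoint, ...)}
        return eps.get(group, ())

    for ep in iter_plotting_backends():
        print(ep.name, "->", ep.value)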
