
Commit c2e75be

Merge branch 'master' into ref-gbop

2 parents eeba093 + 7127b84

File tree: 24 files changed, +308 -146 lines

asv_bench/benchmarks/groupby.py

+13 -1

@@ -480,7 +480,19 @@ class GroupByCythonAgg:
     param_names = ["dtype", "method"]
     params = [
         ["float64"],
-        ["sum", "prod", "min", "max", "mean", "median", "var", "first", "last"],
+        [
+            "sum",
+            "prod",
+            "min",
+            "max",
+            "mean",
+            "median",
+            "var",
+            "first",
+            "last",
+            "any",
+            "all",
+        ],
     ]
 
     def setup(self, dtype, method):
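The list above is the asv parametrization: each (dtype, method) pair becomes one benchmark case, and this merge adds "any" and "all" to the timed Cython aggregations. A rough, illustrative sketch of what each case exercises (not the actual benchmark code; the real setup in asv_bench/benchmarks/groupby.py may build the frame differently, and the sizes here are assumptions):

import numpy as np
import pandas as pd

# Hypothetical stand-in for GroupByCythonAgg's setup/time pair.
N = 100_000
df = pd.DataFrame(np.random.randn(N, 10), columns=list("abcdefghij"))
df["key"] = np.random.randint(0, 100, size=N)

for method in ["sum", "prod", "min", "max", "mean", "median", "var",
               "first", "last", "any", "all"]:
    # asv times each Cython-backed aggregation as a separate case
    df.groupby("key").agg(method)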

doc/source/whatsnew/v1.3.0.rst

+4

@@ -217,6 +217,9 @@ Other enhancements
 - :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`)
 - :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`)
 - :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
+- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
+- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
+-
 
 .. ---------------------------------------------------------------------------

@@ -787,6 +790,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
 - Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
 - Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
+- Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)
 - Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementations bounds (:issue:`40767`)
 
 Reshaping
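A short usage sketch of the behaviour described by the new entries above (illustrative only, not part of the commit): with a nullable "boolean" column, groupby any/all now return BooleanDtype and apply Kleene logic to pd.NA.

import pandas as pd

df = pd.DataFrame(
    {
        "key": ["a", "a", "b", "b"],
        "val": pd.array([True, pd.NA, False, pd.NA], dtype="boolean"),
    }
)

result = df.groupby("key")["val"].any(skipna=False)
# Kleene logic: group "a" is True | NA -> True; group "b" is False | NA -> <NA>
print(result)        # a: True, b: <NA>
print(result.dtype)  # boolean (BooleanDtype rather than plain bool/object)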

pandas/_libs/groupby.pyx

+25 -8

@@ -388,40 +388,47 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_any_all(uint8_t[::1] out,
-                  const uint8_t[::1] values,
+def group_any_all(int8_t[::1] out,
+                  const int8_t[::1] values,
                   const intp_t[:] labels,
                   const uint8_t[::1] mask,
                   str val_test,
-                  bint skipna) -> None:
+                  bint skipna,
+                  bint nullable) -> None:
     """
-    Aggregated boolean values to show truthfulness of group elements.
+    Aggregated boolean values to show truthfulness of group elements. If the
+    input is a nullable type (nullable=True), the result will be computed
+    using Kleene logic.
 
     Parameters
     ----------
-    out : np.ndarray[np.uint8]
+    out : np.ndarray[np.int8]
         Values into which this method will write its results.
     labels : np.ndarray[np.intp]
         Array containing unique label for each group, with its
         ordering matching up to the corresponding record in `values`
-    values : np.ndarray[np.uint8]
+    values : np.ndarray[np.int8]
         Containing the truth value of each element.
     mask : np.ndarray[np.uint8]
         Indicating whether a value is na or not.
     val_test : {'any', 'all'}
         String object dictating whether to use any or all truth testing
     skipna : bool
         Flag to ignore nan values during truth testing
+    nullable : bool
+        Whether or not the input is a nullable type. If True, the
+        result will be computed using Kleene logic
 
     Notes
     -----
     This method modifies the `out` parameter rather than returning an object.
-    The returned values will either be 0 or 1 (False or True, respectively).
+    The returned values will either be 0, 1 (False or True, respectively), or
+    -1 to signify a masked position in the case of a nullable input.
     """
     cdef:
         Py_ssize_t i, N = len(labels)
        intp_t lab
-        uint8_t flag_val
+        int8_t flag_val
 
     if val_test == 'all':
         # Because the 'all' value of an empty iterable in Python is True we can

@@ -444,6 +451,16 @@ def group_any_all(uint8_t[::1] out,
             if lab < 0 or (skipna and mask[i]):
                 continue
 
+            if nullable and mask[i]:
+                # Set the position as masked if `out[lab] != flag_val`, which
+                # would indicate True/False has not yet been seen for any/all,
+                # so by Kleene logic the result is currently unknown
+                if out[lab] != flag_val:
+                    out[lab] = -1
+                continue
+
+            # If True and 'any' or False and 'all', the result is
+            # already determined
             if values[i] == flag_val:
                 out[lab] = flag_val
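A pure-Python/NumPy sketch of the Kleene-style "any" path implemented above (illustrative only; the real kernel is the Cython group_any_all in this diff). Output codes: 1 for True, 0 for False, -1 for a masked/unknown group result.

import numpy as np

def group_any_sketch(values, labels, mask, skipna=True, nullable=False):
    ngroups = labels.max() + 1
    out = np.zeros(ngroups, dtype=np.int8)  # 'any' of an empty group is False
    flag_val = 1                            # seeing a True decides 'any'
    for i in range(len(labels)):
        lab = labels[i]
        if lab < 0 or (skipna and mask[i]):
            continue
        if nullable and mask[i]:
            # no True seen yet for this group, so its result is unknown
            if out[lab] != flag_val:
                out[lab] = -1
            continue
        if values[i] == flag_val:
            out[lab] = flag_val
    return out

values = np.array([1, 0, 0, 0], dtype=np.int8)   # truth value of each element
labels = np.array([0, 0, 1, 1])                  # group id of each element
mask = np.array([0, 1, 1, 0], dtype=np.uint8)    # 1 marks an NA position
print(group_any_sketch(values, labels, mask, skipna=False, nullable=True))
# [ 1 -1]: group 0 saw a True; group 1 saw only NA and False, so it is unknown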

pandas/core/arrays/datetimes.py

+6 -6

@@ -190,7 +190,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
     _infer_matches = ("datetime", "datetime64", "date")
 
     # define my properties & methods for delegation
-    _bool_ops = [
+    _bool_ops: list[str] = [
         "is_month_start",
         "is_month_end",
         "is_quarter_start",

@@ -199,8 +199,8 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
         "is_year_end",
         "is_leap_year",
     ]
-    _object_ops = ["freq", "tz"]
-    _field_ops = [
+    _object_ops: list[str] = ["freq", "tz"]
+    _field_ops: list[str] = [
         "year",
         "month",
         "day",

@@ -220,9 +220,9 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
         "microsecond",
         "nanosecond",
     ]
-    _other_ops = ["date", "time", "timetz"]
-    _datetimelike_ops = _field_ops + _object_ops + _bool_ops + _other_ops
-    _datetimelike_methods = [
+    _other_ops: list[str] = ["date", "time", "timetz"]
+    _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + _other_ops
+    _datetimelike_methods: list[str] = [
         "to_period",
         "tz_localize",
         "tz_convert",

pandas/core/arrays/interval.py

+1 -1

@@ -87,7 +87,7 @@
 
 IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray")
 
-_interval_shared_docs = {}
+_interval_shared_docs: dict[str, str] = {}
 
 _shared_docs_kwargs = {
     "klass": "IntervalArray",

pandas/core/arrays/period.py

+3 -3

@@ -157,9 +157,9 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps):
 
     # Names others delegate to us
     _other_ops: list[str] = []
-    _bool_ops = ["is_leap_year"]
-    _object_ops = ["start_time", "end_time", "freq"]
-    _field_ops = [
+    _bool_ops: list[str] = ["is_leap_year"]
+    _object_ops: list[str] = ["start_time", "end_time", "freq"]
+    _field_ops: list[str] = [
         "year",
         "month",
         "day",

pandas/core/generic.py

+2 -2

@@ -1797,9 +1797,9 @@ def _drop_labels_or_levels(self, keys, axis: int = 0):
 
         Parameters
         ----------
-        keys: str or list of str
+        keys : str or list of str
            labels or levels to drop
-        axis: int, default 0
+        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)
 
        Returns

pandas/core/groupby/base.py

+11 -6

@@ -3,8 +3,9 @@
 hold the allowlist of methods that are exposed on the
 SeriesGroupBy and the DataFrameGroupBy objects.
 """
+from __future__ import annotations
+
 import collections
-from typing import List
 
 from pandas._typing import final
 

@@ -19,7 +20,7 @@
 
 
 class ShallowMixin(PandasObject):
-    _attributes: List[str] = []
+    _attributes: list[str] = []
 
     @final
     def _shallow_copy(self, obj, **kwargs):

@@ -39,7 +40,7 @@ class GotItemMixin(PandasObject):
     Provide the groupby facilities to the mixed object.
     """
 
-    _attributes: List[str]
+    _attributes: list[str]
 
     @final
     def _gotitem(self, key, ndim, subset=None):

@@ -106,12 +107,16 @@ def _gotitem(self, key, ndim, subset=None):
     | plotting_methods
 )
 
-series_apply_allowlist = (
+series_apply_allowlist: frozenset[str] = (
     common_apply_allowlist
-    | {"nlargest", "nsmallest", "is_monotonic_increasing", "is_monotonic_decreasing"}
+    | frozenset(
+        {"nlargest", "nsmallest", "is_monotonic_increasing", "is_monotonic_decreasing"}
+    )
 ) | frozenset(["dtype", "unique"])
 
-dataframe_apply_allowlist = common_apply_allowlist | frozenset(["dtypes", "corrwith"])
+dataframe_apply_allowlist: frozenset[str] = common_apply_allowlist | frozenset(
+    ["dtypes", "corrwith"]
+)
 
 # cythonized transformations or canned "agg+broadcast", which do not
 # require postprocessing of the result by transform.
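A minimal, hypothetical sketch (not pandas code) of the annotation style adopted above: with from __future__ import annotations, annotations are stored as strings, so builtin generics such as list[str] and frozenset[str] can replace typing.List/typing.FrozenSet in annotations even on Python versions without runtime PEP 585 support.

from __future__ import annotations


class AllowlistHolder:
    # builtin generics in annotations; never evaluated at runtime
    _attributes: list[str] = []
    _allowed: frozenset[str] = frozenset({"sum", "mean"})


# Annotations are kept as plain strings, e.g. '_attributes' -> 'list[str]'
print(AllowlistHolder.__annotations__)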

pandas/core/groupby/generic.py

-6

@@ -177,9 +177,6 @@ def pinner(cls):
 class SeriesGroupBy(GroupBy[Series]):
     _apply_allowlist = base.series_apply_allowlist
 
-    # Defined as a cache_readonly in SelectionMixin
-    _obj_with_exclusions: Series
-
     def _iterate_slices(self) -> Iterable[Series]:
         yield self._selected_obj
 

@@ -930,9 +927,6 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None):
 @pin_allowlisted_properties(DataFrame, base.dataframe_apply_allowlist)
 class DataFrameGroupBy(GroupBy[DataFrame]):
 
-    # Defined as a cache_readonly in SelectionMixin
-    _obj_with_exclusions: DataFrame
-
     _apply_allowlist = base.dataframe_apply_allowlist
 
     _agg_examples_doc = dedent(
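The deleted lines re-declared _obj_with_exclusions even though, per the removed comment, the attribute is already provided as a cache_readonly on SelectionMixin. A minimal, hypothetical illustration (not pandas code) of why a bare class-level annotation adds nothing at runtime, using functools.cached_property as a stand-in for pandas' cache_readonly; the typing-level motivation for the removal may be separate.

from functools import cached_property


class Base:
    @cached_property
    def _obj_with_exclusions(self) -> str:
        return "computed lazily in the base class"


class Child(Base):
    _obj_with_exclusions: str  # annotation only: no assignment, no new attribute


print(Child()._obj_with_exclusions)              # inherited cached property still runs
print("_obj_with_exclusions" in Child.__dict__)  # False: nothing was shadowed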

pandas/core/groupby/groupby.py

+29 -6

@@ -77,6 +77,8 @@ class providing the base-class of operations.
 from pandas.core import nanops
 import pandas.core.algorithms as algorithms
 from pandas.core.arrays import (
+    BaseMaskedArray,
+    BooleanArray,
     Categorical,
     ExtensionArray,
 )

@@ -1413,24 +1415,34 @@ def _bool_agg(self, val_test, skipna):
         Shared func to call any / all Cython GroupBy implementations.
         """
 
-        def objs_to_bool(vals: np.ndarray) -> tuple[np.ndarray, type]:
+        def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
             if is_object_dtype(vals):
                 vals = np.array([bool(x) for x in vals])
+            elif isinstance(vals, BaseMaskedArray):
+                vals = vals._data.astype(bool, copy=False)
             else:
                 vals = vals.astype(bool)
 
-            return vals.view(np.uint8), bool
+            return vals.view(np.int8), bool
 
-        def result_to_bool(result: np.ndarray, inference: type) -> np.ndarray:
-            return result.astype(inference, copy=False)
+        def result_to_bool(
+            result: np.ndarray,
+            inference: type,
+            nullable: bool = False,
+        ) -> ArrayLike:
+            if nullable:
+                return BooleanArray(result.astype(bool, copy=False), result == -1)
+            else:
+                return result.astype(inference, copy=False)
 
         return self._get_cythonized_result(
             "group_any_all",
             aggregate=True,
             numeric_only=False,
-            cython_dtype=np.dtype(np.uint8),
+            cython_dtype=np.dtype(np.int8),
             needs_values=True,
             needs_mask=True,
+            needs_nullable=True,
             pre_processing=objs_to_bool,
             post_processing=result_to_bool,
             val_test=val_test,

@@ -2613,6 +2625,7 @@ def _get_cythonized_result(
         needs_counts: bool = False,
         needs_values: bool = False,
         needs_2d: bool = False,
+        needs_nullable: bool = False,
         min_count: int | None = None,
         needs_mask: bool = False,
         needs_ngroups: bool = False,

@@ -2649,6 +2662,9 @@
             signature
         needs_ngroups : bool, default False
             Whether number of groups is part of the Cython call signature
+        needs_nullable : bool, default False
+            Whether a bool specifying if the input is nullable is part
+            of the Cython call signature
         result_is_index : bool, default False
             Whether the result of the Cython operation is an index of
             values to be retrieved, instead of the actual values themselves

@@ -2664,7 +2680,8 @@
             Function to be applied to result of Cython function. Should accept
             an array of values as the first argument and type inferences as its
             second argument, i.e. the signature should be
-            (ndarray, Type).
+            (ndarray, Type). If `needs_nullable=True`, a third argument should be
+            `nullable`, to allow for processing specific to nullable values.
         **kwargs : dict
             Extra arguments to be passed back to Cython funcs
 

@@ -2739,6 +2756,12 @@
             if needs_ngroups:
                 func = partial(func, ngroups)
 
+            if needs_nullable:
+                is_nullable = isinstance(values, BaseMaskedArray)
+                func = partial(func, nullable=is_nullable)
+                if post_processing:
+                    post_processing = partial(post_processing, nullable=is_nullable)
+
             func(**kwargs)  # Call func to modify indexer values in place
 
             if needs_2d: