Skip to content

Commit 4d9b6f7

Browse files
authored
ENH: 2D support for MaskedArray (#38992)
1 parent 0638f7f commit 4d9b6f7

16 files changed

+374
-93
lines changed

pandas/_libs/algos.pyx

+4-3
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,7 @@ def pad_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):
637637

638638
@cython.boundscheck(False)
639639
@cython.wraparound(False)
640-
def pad_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limit=None):
640+
def pad_2d_inplace(numeric_object_t[:, :] values, uint8_t[:, :] mask, limit=None):
641641
cdef:
642642
Py_ssize_t i, j, N, K
643643
numeric_object_t val
@@ -656,10 +656,11 @@ def pad_2d_inplace(numeric_object_t[:, :] values, const uint8_t[:, :] mask, limi
656656
val = values[j, 0]
657657
for i in range(N):
658658
if mask[j, i]:
659-
if fill_count >= lim:
659+
if fill_count >= lim or i == 0:
660660
continue
661661
fill_count += 1
662662
values[j, i] = val
663+
mask[j, i] = False
663664
else:
664665
fill_count = 0
665666
val = values[j, i]
@@ -759,7 +760,7 @@ def backfill_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):
759760

760761

761762
def backfill_2d_inplace(numeric_object_t[:, :] values,
762-
const uint8_t[:, :] mask,
763+
uint8_t[:, :] mask,
763764
limit=None):
764765
pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit)
765766

pandas/core/array_algos/masked_reductions.py

+51-14
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33
for missing values.
44
"""
55

6-
from typing import Callable
6+
from typing import (
7+
Callable,
8+
Optional,
9+
)
710

811
import numpy as np
912

@@ -19,6 +22,7 @@ def _sumprod(
1922
*,
2023
skipna: bool = True,
2124
min_count: int = 0,
25+
axis: Optional[int] = None,
2226
):
2327
"""
2428
Sum or product for a 1D or 2D masked array.
@@ -36,36 +40,55 @@ def _sumprod(
3640
min_count : int, default 0
3741
The required number of valid values to perform the operation. If fewer than
3842
``min_count`` non-NA values are present the result will be NA.
43+
axis : int, optional, default None
3944
"""
4045
if not skipna:
41-
if mask.any() or check_below_min_count(values.shape, None, min_count):
46+
if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count):
4247
return libmissing.NA
4348
else:
44-
return func(values)
49+
return func(values, axis=axis)
4550
else:
46-
if check_below_min_count(values.shape, mask, min_count):
51+
if check_below_min_count(values.shape, mask, min_count) and (
52+
axis is None or values.ndim == 1
53+
):
4754
return libmissing.NA
48-
return func(values, where=~mask)
55+
56+
return func(values, where=~mask, axis=axis)
4957

5058

5159
def sum(
52-
values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0
60+
values: np.ndarray,
61+
mask: np.ndarray,
62+
*,
63+
skipna: bool = True,
64+
min_count: int = 0,
65+
axis: Optional[int] = None,
5366
):
5467
return _sumprod(
55-
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count
68+
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
5669
)
5770

5871

5972
def prod(
60-
values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0
73+
values: np.ndarray,
74+
mask: np.ndarray,
75+
*,
76+
skipna: bool = True,
77+
min_count: int = 0,
78+
axis: Optional[int] = None,
6179
):
6280
return _sumprod(
63-
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count
81+
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
6482
)
6583

6684

6785
def _minmax(
68-
func: Callable, values: np.ndarray, mask: np.ndarray, *, skipna: bool = True
86+
func: Callable,
87+
values: np.ndarray,
88+
mask: np.ndarray,
89+
*,
90+
skipna: bool = True,
91+
axis: Optional[int] = None,
6992
):
7093
"""
7194
Reduction for 1D masked array.
@@ -80,6 +103,7 @@ def _minmax(
80103
Boolean numpy array (True values indicate missing values).
81104
skipna : bool, default True
82105
Whether to skip NA.
106+
axis : int, optional, default None
83107
"""
84108
if not skipna:
85109
if mask.any() or not values.size:
@@ -96,14 +120,27 @@ def _minmax(
96120
return libmissing.NA
97121

98122

99-
def min(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True):
100-
return _minmax(np.min, values=values, mask=mask, skipna=skipna)
123+
def min(
124+
values: np.ndarray,
125+
mask: np.ndarray,
126+
*,
127+
skipna: bool = True,
128+
axis: Optional[int] = None,
129+
):
130+
return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis)
101131

102132

103-
def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True):
104-
return _minmax(np.max, values=values, mask=mask, skipna=skipna)
133+
def max(
134+
values: np.ndarray,
135+
mask: np.ndarray,
136+
*,
137+
skipna: bool = True,
138+
axis: Optional[int] = None,
139+
):
140+
return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)
105141

106142

143+
# TODO: axis kwarg
107144
def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True):
108145
if not values.size or mask.all():
109146
return libmissing.NA

pandas/core/arrays/_mixins.py

-21
Original file line numberDiff line numberDiff line change
@@ -298,27 +298,6 @@ def _wrap_reduction_result(self, axis: int | None, result):
298298
return self._box_func(result)
299299
return self._from_backing_data(result)
300300

301-
# ------------------------------------------------------------------------
302-
303-
def __repr__(self) -> str:
304-
if self.ndim == 1:
305-
return super().__repr__()
306-
307-
from pandas.io.formats.printing import format_object_summary
308-
309-
# the short repr has no trailing newline, while the truncated
310-
# repr does. So we include a newline in our template, and strip
311-
# any trailing newlines from format_object_summary
312-
lines = [
313-
format_object_summary(x, self._formatter(), indent_for_name=False).rstrip(
314-
", \n"
315-
)
316-
for x in self
317-
]
318-
data = ",\n".join(lines)
319-
class_name = f"<{type(self).__name__}>"
320-
return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}"
321-
322301
# ------------------------------------------------------------------------
323302
# __array_function__ methods
324303

pandas/core/arrays/base.py

+19
Original file line numberDiff line numberDiff line change
@@ -1209,6 +1209,9 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike:
12091209
# ------------------------------------------------------------------------
12101210

12111211
def __repr__(self) -> str:
1212+
if self.ndim > 1:
1213+
return self._repr_2d()
1214+
12121215
from pandas.io.formats.printing import format_object_summary
12131216

12141217
# the short repr has no trailing newline, while the truncated
@@ -1220,6 +1223,22 @@ def __repr__(self) -> str:
12201223
class_name = f"<{type(self).__name__}>\n"
12211224
return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}"
12221225

1226+
def _repr_2d(self) -> str:
1227+
from pandas.io.formats.printing import format_object_summary
1228+
1229+
# the short repr has no trailing newline, while the truncated
1230+
# repr does. So we include a newline in our template, and strip
1231+
# any trailing newlines from format_object_summary
1232+
lines = [
1233+
format_object_summary(x, self._formatter(), indent_for_name=False).rstrip(
1234+
", \n"
1235+
)
1236+
for x in self
1237+
]
1238+
data = ",\n".join(lines)
1239+
class_name = f"<{type(self).__name__}>"
1240+
return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}"
1241+
12231242
def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
12241243
"""
12251244
Formatting function for scalar values.

pandas/core/arrays/boolean.py

+141-4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
npt,
2222
type_t,
2323
)
24+
from pandas.compat.numpy import function as nv
2425

2526
from pandas.core.dtypes.common import (
2627
is_bool_dtype,
@@ -245,10 +246,8 @@ def coerce_to_array(
245246
if mask_values is not None:
246247
mask = mask | mask_values
247248

248-
if values.ndim != 1:
249-
raise ValueError("values must be a 1D list-like")
250-
if mask.ndim != 1:
251-
raise ValueError("mask must be a 1D list-like")
249+
if values.shape != mask.shape:
250+
raise ValueError("values.shape and mask.shape must match")
252251

253252
return values, mask
254253

@@ -447,6 +446,144 @@ def _values_for_argsort(self) -> np.ndarray:
447446
data[self._mask] = -1
448447
return data
449448

449+
def any(self, *, skipna: bool = True, axis: int | None = 0, **kwargs):
450+
"""
451+
Return whether any element is True.
452+
453+
Returns False unless there is at least one element that is True.
454+
By default, NAs are skipped. If ``skipna=False`` is specified and
455+
missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
456+
is used as for logical operations.
457+
458+
Parameters
459+
----------
460+
skipna : bool, default True
461+
Exclude NA values. If the entire array is NA and `skipna` is
462+
True, then the result will be False, as for an empty array.
463+
If `skipna` is False, the result will still be True if there is
464+
at least one element that is True, otherwise NA will be returned
465+
if there are NA's present.
466+
axis : int or None, default 0
467+
**kwargs : any, default None
468+
Additional keywords have no effect but might be accepted for
469+
compatibility with NumPy.
470+
471+
Returns
472+
-------
473+
bool or :attr:`pandas.NA`
474+
475+
See Also
476+
--------
477+
numpy.any : Numpy version of this method.
478+
BooleanArray.all : Return whether all elements are True.
479+
480+
Examples
481+
--------
482+
The result indicates whether any element is True (and by default
483+
skips NAs):
484+
485+
>>> pd.array([True, False, True]).any()
486+
True
487+
>>> pd.array([True, False, pd.NA]).any()
488+
True
489+
>>> pd.array([False, False, pd.NA]).any()
490+
False
491+
>>> pd.array([], dtype="boolean").any()
492+
False
493+
>>> pd.array([pd.NA], dtype="boolean").any()
494+
False
495+
496+
With ``skipna=False``, the result can be NA if this is logically
497+
required (whether ``pd.NA`` is True or False influences the result):
498+
499+
>>> pd.array([True, False, pd.NA]).any(skipna=False)
500+
True
501+
>>> pd.array([False, False, pd.NA]).any(skipna=False)
502+
<NA>
503+
"""
504+
kwargs.pop("axis", None)
505+
nv.validate_any((), kwargs)
506+
507+
values = self._data.copy()
508+
np.putmask(values, self._mask, False)
509+
result = values.any(axis=axis)
510+
511+
if skipna:
512+
return result
513+
else:
514+
if result or self.size == 0 or not self._mask.any():
515+
return result
516+
else:
517+
return self.dtype.na_value
518+
519+
def all(self, *, skipna: bool = True, axis: int | None = 0, **kwargs):
520+
"""
521+
Return whether all elements are True.
522+
523+
Returns True unless there is at least one element that is False.
524+
By default, NAs are skipped. If ``skipna=False`` is specified and
525+
missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
526+
is used as for logical operations.
527+
528+
Parameters
529+
----------
530+
skipna : bool, default True
531+
Exclude NA values. If the entire array is NA and `skipna` is
532+
True, then the result will be True, as for an empty array.
533+
If `skipna` is False, the result will still be False if there is
534+
at least one element that is False, otherwise NA will be returned
535+
if there are NA's present.
536+
axis : int or None, default 0
537+
**kwargs : any, default None
538+
Additional keywords have no effect but might be accepted for
539+
compatibility with NumPy.
540+
541+
Returns
542+
-------
543+
bool or :attr:`pandas.NA`
544+
545+
See Also
546+
--------
547+
numpy.all : Numpy version of this method.
548+
BooleanArray.any : Return whether any element is True.
549+
550+
Examples
551+
--------
552+
The result indicates whether all elements are True (and by default
553+
skips NAs):
554+
555+
>>> pd.array([True, True, pd.NA]).all()
556+
True
557+
>>> pd.array([True, False, pd.NA]).all()
558+
False
559+
>>> pd.array([], dtype="boolean").all()
560+
True
561+
>>> pd.array([pd.NA], dtype="boolean").all()
562+
True
563+
564+
With ``skipna=False``, the result can be NA if this is logically
565+
required (whether ``pd.NA`` is True or False influences the result):
566+
567+
>>> pd.array([True, True, pd.NA]).all(skipna=False)
568+
<NA>
569+
>>> pd.array([True, False, pd.NA]).all(skipna=False)
570+
False
571+
"""
572+
kwargs.pop("axis", None)
573+
nv.validate_all((), kwargs)
574+
575+
values = self._data.copy()
576+
np.putmask(values, self._mask, True)
577+
result = values.all(axis=axis)
578+
579+
if skipna:
580+
return result
581+
else:
582+
if not result or self.size == 0 or not self._mask.any():
583+
return result
584+
else:
585+
return self.dtype.na_value
586+
450587
def _logical_method(self, other, op):
451588

452589
assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}

0 commit comments

Comments
 (0)