
Commit 263b0f9

Merge commit with 2 parents: b1bb9da + b2dda5a

15 files changed, +134 −86 lines changed


doc/source/user_guide/groupby.rst

+3-5
@@ -87,11 +87,9 @@ The mapping can be specified many different ways:
 * A Python function, to be called on each of the axis labels.
 * A list or NumPy array of the same length as the selected axis.
 * A dict or ``Series``, providing a ``label -> group name`` mapping.
-* For ``DataFrame`` objects, a string indicating a column to be used to group.
-  Of course ``df.groupby('A')`` is just syntactic sugar for
-  ``df.groupby(df['A'])``, but it makes life simpler.
-* For ``DataFrame`` objects, a string indicating an index level to be used to
-  group.
+* For ``DataFrame`` objects, a string indicating either a column name or
+  an index level name to be used to group.
+* ``df.groupby('A')`` is just syntactic sugar for ``df.groupby(df['A'])``.
 * A list of any of the above things.

 Collectively we refer to the grouping objects as the **keys**. For example,
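The rewritten bullets fold the column-name and index-level-name cases into one. A minimal sketch of that documented behavior (the frame below is made up for illustration):

    import pandas as pd

    df = pd.DataFrame(
        {"A": ["foo", "bar", "foo", "bar"], "B": [1, 2, 3, 4]},
        index=pd.Index(["x", "x", "y", "y"], name="lvl"),
    )

    print(df.groupby("A").sum())      # 'A' resolves to the column
    print(df.groupby("lvl").sum())    # 'lvl' resolves to the index level name
    print(df.groupby(df["A"]).sum())  # the syntactic-sugar equivalent of df.groupby('A')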

doc/source/whatsnew/v1.2.0.rst

+2-2
@@ -100,8 +100,8 @@ For example:

 Other enhancements
 ^^^^^^^^^^^^^^^^^^
-
 - Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`)
+- :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`)
 - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
 - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`)
 -
@@ -334,7 +334,7 @@ Sparse
 ExtensionArray
 ^^^^^^^^^^^^^^

--
+- Fixed Bug where :class:`DataFrame` column set to scalar extension type via a dict instantion was considered an object type rather than the extension type (:issue:`35965`)
 -

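The :issue:`35965` entry is the user-visible side of the construction.py/cast.py changes further down. A small check of that behavior, assuming a pandas build that includes this commit:

    import pandas as pd

    # A scalar extension value passed through a dict used to come back as object dtype;
    # with this commit the Period keeps its period[M] extension dtype.
    df = pd.DataFrame({"A": pd.Period("2020-01", freq="M")}, index=[0, 1])
    print(df.dtypes)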

pandas/_libs/lib.pyx

+7-1
@@ -2377,14 +2377,17 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def map_infer(ndarray arr, object f, bint convert=True):
+def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.

     Parameters
     ----------
     arr : ndarray
     f : function
+    convert : bint
+    ignore_na : bint
+        If True, NA values will not have f applied

     Returns
     -------
@@ -2398,6 +2401,9 @@ def map_infer(ndarray arr, object f, bint convert=True):
     n = len(arr)
     result = np.empty(n, dtype=object)
     for i in range(n):
+        if ignore_na and checknull(arr[i]):
+            result[i] = arr[i]
+            continue
         val = f(arr[i])

         if cnp.PyArray_IsZeroDim(val):
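The new ``ignore_na`` flag short-circuits the loop for missing values. A pure-Python sketch of the same logic (the real function is Cython and uses ``checknull`` rather than ``pd.isna``):

    import numpy as np
    import pandas as pd

    def map_infer_sketch(arr, f, ignore_na=False):
        # mirror of the loop above: NA values are copied through untouched
        result = np.empty(len(arr), dtype=object)
        for i, val in enumerate(arr):
            if ignore_na and pd.isna(val):
                result[i] = val
                continue
            result[i] = f(val)
        return result

    print(map_infer_sketch(np.array([1.0, np.nan], dtype=object), str, ignore_na=True))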

pandas/core/arrays/_mixins.py

+9-2
@@ -6,7 +6,7 @@
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly, doc

-from pandas.core.algorithms import searchsorted, take, unique
+from pandas.core.algorithms import take, unique
 from pandas.core.array_algos.transforms import shift
 from pandas.core.arrays.base import ExtensionArray

@@ -102,6 +102,9 @@ def T(self: _T) -> _T:

     # ------------------------------------------------------------------------

+    def _values_for_argsort(self):
+        return self._ndarray
+
     def copy(self: _T) -> _T:
         new_data = self._ndarray.copy()
         return self._from_backing_data(new_data)
@@ -135,7 +138,11 @@ def _concat_same_type(cls, to_concat, axis: int = 0):

     @doc(ExtensionArray.searchsorted)
     def searchsorted(self, value, side="left", sorter=None):
-        return searchsorted(self._ndarray, value, side=side, sorter=sorter)
+        value = self._validate_searchsorted_value(value)
+        return self._ndarray.searchsorted(value, side=side, sorter=sorter)
+
+    def _validate_searchsorted_value(self, value):
+        return value

     @doc(ExtensionArray.shift)
     def shift(self, periods=1, fill_value=None, axis=0):
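``searchsorted`` now validates the value through a hook and defers to the backing ndarray. A self-contained sketch of that pattern (the class here is illustrative, not pandas' ``NDArrayBackedExtensionArray``):

    import numpy as np

    class BackedArraySketch:
        def __init__(self, values):
            self._ndarray = np.asarray(values)

        def _validate_searchsorted_value(self, value):
            # base class is a no-op; subclasses coerce value to the backing dtype
            return value

        def searchsorted(self, value, side="left", sorter=None):
            value = self._validate_searchsorted_value(value)
            return self._ndarray.searchsorted(value, side=side, sorter=sorter)

    print(BackedArraySketch([1, 3, 5, 7]).searchsorted(4))  # 2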

pandas/core/arrays/categorical.py

+11-26
@@ -12,7 +12,7 @@
 from pandas._libs import NaT, algos as libalgos, hashtable as htable, lib
 from pandas._typing import ArrayLike, Dtype, Ordered, Scalar
 from pandas.compat.numpy import function as nv
-from pandas.util._decorators import cache_readonly, deprecate_kwarg, doc
+from pandas.util._decorators import cache_readonly, deprecate_kwarg
 from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs

 from pandas.core.dtypes.cast import (
@@ -45,12 +45,7 @@
 import pandas.core.algorithms as algorithms
 from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
-from pandas.core.base import (
-    ExtensionArray,
-    NoNewAttributesMixin,
-    PandasObject,
-    _shared_docs,
-)
+from pandas.core.base import ExtensionArray, NoNewAttributesMixin, PandasObject
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array, sanitize_array
 from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing
@@ -63,6 +58,7 @@

 def _cat_compare_op(op):
     opname = f"__{op.__name__}__"
+    fill_value = True if op is operator.ne else False

     @unpack_zerodim_and_defer(opname)
     def func(self, other):
@@ -97,26 +93,23 @@ def func(self, other):
             else:
                 other_codes = other._codes

-            f = getattr(self._codes, opname)
-            ret = f(other_codes)
+            ret = op(self._codes, other_codes)
             mask = (self._codes == -1) | (other_codes == -1)
             if mask.any():
-                # In other series, the leads to False, so do that here too
-                if opname == "__ne__":
-                    ret[(self._codes == -1) & (other_codes == -1)] = True
-                else:
-                    ret[mask] = False
+                ret[mask] = fill_value
             return ret

         if is_scalar(other):
             if other in self.categories:
                 i = self.categories.get_loc(other)
-                ret = getattr(self._codes, opname)(i)
+                ret = op(self._codes, i)

                 if opname not in {"__eq__", "__ge__", "__gt__"}:
-                    # check for NaN needed if we are not equal or larger
+                    # GH#29820 performance trick; get_loc will always give i>=0,
+                    # so in the cases (__ne__, __le__, __lt__) the setting
+                    # here is a no-op, so can be skipped.
                     mask = self._codes == -1
-                    ret[mask] = False
+                    ret[mask] = fill_value
                 return ret
             else:
                 return ops.invalid_comparison(self, other, op)
@@ -1315,11 +1308,6 @@ def memory_usage(self, deep=False):
         """
         return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep)

-    @doc(_shared_docs["searchsorted"], klass="Categorical")
-    def searchsorted(self, value, side="left", sorter=None):
-        value = self._validate_searchsorted_value(value)
-        return self.codes.searchsorted(value, side=side, sorter=sorter)
-
     def isna(self):
         """
         Detect missing values
@@ -1428,9 +1416,6 @@ def check_for_ordered(self, op):
                 "Categorical to an ordered one\n"
             )

-    def _values_for_argsort(self):
-        return self._codes
-
     def argsort(self, ascending=True, kind="quicksort", **kwargs):
         """
         Return the indices that would sort the Categorical.
@@ -1879,7 +1864,7 @@ def __getitem__(self, key):
         if result.ndim > 1:
             deprecate_ndim_indexing(result)
             return result
-        return self._constructor(result, dtype=self.dtype, fastpath=True)
+        return self._from_backing_data(result)

     def __setitem__(self, key, value):
         """

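The precomputed ``fill_value`` keeps the existing NaN semantics of categorical comparisons: masked positions compare False, except for ``!=`` which compares True. A quick illustration from the public API:

    import pandas as pd

    cat = pd.Categorical(["a", "b", None], categories=["a", "b"])
    print(cat == "a")  # [ True False False]
    print(cat != "a")  # [False  True  True]
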
pandas/core/arrays/datetimelike.py

+5-33
@@ -545,15 +545,18 @@ def __getitem__(self, key):
             result = self._ndarray[key]
             if self.ndim == 1:
                 return self._box_func(result)
-            return self._simple_new(result, dtype=self.dtype)
+            return self._from_backing_data(result)

         key = self._validate_getitem_key(key)
         result = self._ndarray[key]
         if lib.is_scalar(result):
             return self._box_func(result)

+        result = self._from_backing_data(result)
+
         freq = self._get_getitem_freq(key)
-        return self._simple_new(result, dtype=self.dtype, freq=freq)
+        result._freq = freq
+        return result

     def _validate_getitem_key(self, key):
         if com.is_bool_indexer(key):
@@ -714,9 +717,6 @@ def _values_for_factorize(self):
     def _from_factorized(cls, values, original):
         return cls(values, dtype=original.dtype)

-    def _values_for_argsort(self):
-        return self._ndarray
-
     # ------------------------------------------------------------------
     # Validation Methods
     # TODO: try to de-duplicate these, ensure identical behavior
@@ -917,34 +917,6 @@ def _unbox(self, other, setitem: bool = False) -> Union[np.int64, np.ndarray]:
     # These are not part of the EA API, but we implement them because
     # pandas assumes they're there.

-    def searchsorted(self, value, side="left", sorter=None):
-        """
-        Find indices where elements should be inserted to maintain order.
-
-        Find the indices into a sorted array `self` such that, if the
-        corresponding elements in `value` were inserted before the indices,
-        the order of `self` would be preserved.
-
-        Parameters
-        ----------
-        value : array_like
-            Values to insert into `self`.
-        side : {'left', 'right'}, optional
-            If 'left', the index of the first suitable location found is given.
-            If 'right', return the last such index. If there is no suitable
-            index, return either 0 or N (where N is the length of `self`).
-        sorter : 1-D array_like, optional
-            Optional array of integer indices that sort `self` into ascending
-            order. They are typically the result of ``np.argsort``.
-
-        Returns
-        -------
-        indices : array of ints
-            Array of insertion points with the same shape as `value`.
-        """
-        value = self._validate_searchsorted_value(value)
-        return self._data.searchsorted(value, side=side, sorter=sorter)
-
     def value_counts(self, dropna=False):
         """
         Return a Series containing counts of unique values.
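``__getitem__`` now re-wraps the sliced ndarray with ``_from_backing_data`` and assigns ``freq`` afterwards instead of threading it through ``_simple_new``. The user-visible invariant it preserves, sketched against the public API (output format may vary by version):

    import pandas as pd

    dti = pd.date_range("2020-01-01", periods=5, freq="D")
    arr = dti.array          # the backing DatetimeArray
    print(arr[1:3].freq)     # slicing keeps the freq, e.g. <Day>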

pandas/core/arrays/numpy_.py

+9-8
@@ -260,15 +260,19 @@ def __getitem__(self, item):
         return result

     def __setitem__(self, key, value) -> None:
-        value = extract_array(value, extract_numpy=True)
+        key = self._validate_setitem_key(key)
+        value = self._validate_setitem_value(value)
+        self._ndarray[key] = value

-        key = check_array_indexer(self, key)
-        scalar_value = lib.is_scalar(value)
+    def _validate_setitem_value(self, value):
+        value = extract_array(value, extract_numpy=True)

-        if not scalar_value:
+        if not lib.is_scalar(value):
             value = np.asarray(value, dtype=self._ndarray.dtype)
+        return value

-        self._ndarray[key] = value
+    def _validate_setitem_key(self, key):
+        return check_array_indexer(self, key)

     def isna(self) -> np.ndarray:
         return isna(self._ndarray)
@@ -308,9 +312,6 @@ def _validate_fill_value(self, fill_value):
             fill_value = self.dtype.na_value
         return fill_value

-    def _values_for_argsort(self) -> np.ndarray:
-        return self._ndarray
-
     def _values_for_factorize(self) -> Tuple[np.ndarray, int]:
         return self._ndarray, -1

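``__setitem__`` is split into the same validate-then-assign hooks used elsewhere in this commit. A minimal sketch of the pattern with an illustrative class (not pandas' ``PandasArray`` itself):

    import numpy as np

    class NumpyBackedSketch:
        def __init__(self, values):
            self._ndarray = np.asarray(values)

        def _validate_setitem_key(self, key):
            return key  # pandas runs check_array_indexer here

        def _validate_setitem_value(self, value):
            if not np.isscalar(value):
                value = np.asarray(value, dtype=self._ndarray.dtype)
            return value

        def __setitem__(self, key, value):
            key = self._validate_setitem_key(key)
            value = self._validate_setitem_value(value)
            self._ndarray[key] = value

    a = NumpyBackedSketch([1, 2, 3])
    a[0] = 10
    print(a._ndarray)  # [10  2  3]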

pandas/core/construction.py

+1-1
@@ -472,7 +472,7 @@ def sanitize_array(

         # figure out the dtype from the value (upcast if necessary)
         if dtype is None:
-            dtype, value = infer_dtype_from_scalar(value)
+            dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
         else:
             # need to possibly convert the value here
             value = maybe_cast_to_datetime(value, dtype)

pandas/core/dtypes/cast.py

-1
@@ -709,7 +709,6 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj,
     elif pandas_dtype:
         if lib.is_period(val):
             dtype = PeriodDtype(freq=val.freq)
-            val = val.ordinal
         elif lib.is_interval(val):
             subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
             dtype = IntervalDtype(subtype=subtype)
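With ``pandas_dtype=True`` the helper now returns the ``Period`` itself instead of its integer ordinal. A quick check against the (private) helper, assuming the signature as of this commit:

    import pandas as pd
    from pandas.core.dtypes.cast import infer_dtype_from_scalar

    dtype, val = infer_dtype_from_scalar(pd.Period("2020-01", freq="M"), pandas_dtype=True)
    print(dtype)  # period[M]
    print(val)    # Period('2020-01', 'M'), no longer the ordinal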

pandas/core/frame.py

+22-3
@@ -7619,7 +7619,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
         )
         return op.get_result()

-    def applymap(self, func) -> DataFrame:
+    def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
         """
         Apply a function to a Dataframe elementwise.

@@ -7630,6 +7630,10 @@ def applymap(self, func) -> DataFrame:
         ----------
         func : callable
             Python function, returns a single value from a single value.
+        na_action : {None, 'ignore'}, default None
+            If ‘ignore’, propagate NaN values, without passing them to func.
+
+            .. versionadded:: 1.2

         Returns
         -------
@@ -7653,6 +7657,15 @@ def applymap(self, func) -> DataFrame:
         0  3  4
         1  5  5

+        Like Series.map, NA values can be ignored:
+
+        >>> df_copy = df.copy()
+        >>> df_copy.iloc[0, 0] = pd.NA
+        >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore')
+              0  1
+        0  <NA>  4
+        1     5  5
+
         Note that a vectorized version of `func` often exists, which will
         be much faster. You could square each number elementwise.

@@ -7668,11 +7681,17 @@ def applymap(self, func) -> DataFrame:
         0   1.000000   4.494400
         1  11.262736  20.857489
         """
+        if na_action not in {"ignore", None}:
+            raise ValueError(
+                f"na_action must be 'ignore' or None. Got {repr(na_action)}"
+            )
+        ignore_na = na_action == "ignore"
+
         # if we have a dtype == 'M8[ns]', provide boxed values
         def infer(x):
             if x.empty:
-                return lib.map_infer(x, func)
-            return lib.map_infer(x.astype(object)._values, func)
+                return lib.map_infer(x, func, ignore_na=ignore_na)
+            return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na)

         return self.apply(infer)
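A short usage check of the new ``na_action`` parameter and its validation, mirroring the docstring example above (behavior as of pandas 1.2):

    import pandas as pd

    df = pd.DataFrame([[pd.NA, 2.12], [3.356, 4.567]])
    print(df.applymap(lambda x: len(str(x)), na_action="ignore"))

    try:
        df.applymap(str, na_action="bad")
    except ValueError as err:
        print(err)  # na_action must be 'ignore' or None. Got 'bad'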

pandas/core/indexes/interval.py

+2
@@ -589,6 +589,8 @@ def _maybe_convert_i8(self, key):
         if scalar:
             # Timestamp/Timedelta
             key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True)
+            if lib.is_period(key):
+                key_i8 = key.ordinal
         else:
             # DatetimeIndex/TimedeltaIndex
             key_dtype, key_i8 = key.dtype, Index(key.asi8)
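The added guard converts a ``Period`` key to its integer ordinal by hand, since ``infer_dtype_from_scalar`` no longer does that conversion (see cast.py above). A minimal sketch of just that conversion step:

    import pandas as pd
    from pandas._libs import lib

    key = pd.Period("2020-02", freq="M")
    key_i8 = key.ordinal if lib.is_period(key) else key
    print(key_i8)  # the integer used for the IntervalTree lookup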

pandas/tests/dtypes/cast/test_infer_dtype.py

+1-3
@@ -84,13 +84,11 @@ def test_infer_dtype_from_period(freq, pandas_dtype):

     if pandas_dtype:
         exp_dtype = f"period[{freq}]"
-        exp_val = p.ordinal
     else:
         exp_dtype = np.object_
-        exp_val = p

     assert dtype == exp_dtype
-    assert val == exp_val
+    assert val == p


 @pytest.mark.parametrize(