
Commit 263b0f9

Merge commit with 2 parents: b1bb9da + b2dda5a

15 files changed, +134 −86 lines changed


doc/source/user_guide/groupby.rst

+3-5
@@ -87,11 +87,9 @@ The mapping can be specified many different ways:
 * A Python function, to be called on each of the axis labels.
 * A list or NumPy array of the same length as the selected axis.
 * A dict or ``Series``, providing a ``label -> group name`` mapping.
-* For ``DataFrame`` objects, a string indicating a column to be used to group.
-  Of course ``df.groupby('A')`` is just syntactic sugar for
-  ``df.groupby(df['A'])``, but it makes life simpler.
-* For ``DataFrame`` objects, a string indicating an index level to be used to
-  group.
+* For ``DataFrame`` objects, a string indicating either a column name or
+  an index level name to be used to group.
+* ``df.groupby('A')`` is just syntactic sugar for ``df.groupby(df['A'])``.
 * A list of any of the above things.

 Collectively we refer to the grouping objects as the **keys**. For example,
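The rewritten bullets fold the column-name and index-level-name cases into one. A minimal sketch of that documented behavior (the frame below is made up for illustration):

    import pandas as pd

    df = pd.DataFrame(
        {"A": ["foo", "bar", "foo", "bar"], "B": [1, 2, 3, 4]},
        index=pd.Index(["x", "x", "y", "y"], name="lvl"),
    )

    print(df.groupby("A").sum())      # 'A' resolves to the column
    print(df.groupby("lvl").sum())    # 'lvl' resolves to the index level name
    print(df.groupby(df["A"]).sum())  # the syntactic-sugar equivalent of df.groupby('A')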

doc/source/whatsnew/v1.2.0.rst

+2-2
@@ -100,8 +100,8 @@ For example:

 Other enhancements
 ^^^^^^^^^^^^^^^^^^
-
 - Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`)
+- :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`)
 - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
 - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`)
 -
@@ -334,7 +334,7 @@ Sparse
 ExtensionArray
 ^^^^^^^^^^^^^^

--
+- Fixed Bug where :class:`DataFrame` column set to scalar extension type via a dict instantion was considered an object type rather than the extension type (:issue:`35965`)
 -

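The :issue:`35965` entry is the user-visible side of the construction.py/cast.py changes further down. A small check of that behavior, assuming a pandas build that includes this commit:

    import pandas as pd

    # A scalar extension value passed through a dict used to come back as object dtype;
    # with this commit the Period keeps its period[M] extension dtype.
    df = pd.DataFrame({"A": pd.Period("2020-01", freq="M")}, index=[0, 1])
    print(df.dtypes)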

pandas/_libs/lib.pyx

+7-1
@@ -2377,14 +2377,17 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def map_infer(ndarray arr, object f, bint convert=True):
+def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False):
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.

     Parameters
     ----------
     arr : ndarray
     f : function
+    convert : bint
+    ignore_na : bint
+        If True, NA values will not have f applied

     Returns
     -------
@@ -2398,6 +2401,9 @@ def map_infer(ndarray arr, object f, bint convert=True):
     n = len(arr)
     result = np.empty(n, dtype=object)
     for i in range(n):
+        if ignore_na and checknull(arr[i]):
+            result[i] = arr[i]
+            continue
         val = f(arr[i])

         if cnp.PyArray_IsZeroDim(val):
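The new ``ignore_na`` flag short-circuits the loop for missing values. A pure-Python sketch of the same logic (the real function is Cython and uses ``checknull`` rather than ``pd.isna``):

    import numpy as np
    import pandas as pd

    def map_infer_sketch(arr, f, ignore_na=False):
        # mirror of the loop above: NA values are copied through untouched
        result = np.empty(len(arr), dtype=object)
        for i, val in enumerate(arr):
            if ignore_na and pd.isna(val):
                result[i] = val
                continue
            result[i] = f(val)
        return result

    print(map_infer_sketch(np.array([1.0, np.nan], dtype=object), str, ignore_na=True))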

pandas/core/arrays/_mixins.py

+9-2
@@ -6,7 +6,7 @@
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly, doc

-from pandas.core.algorithms import searchsorted, take, unique
+from pandas.core.algorithms import take, unique
 from pandas.core.array_algos.transforms import shift
 from pandas.core.arrays.base import ExtensionArray

@@ -102,6 +102,9 @@ def T(self: _T) -> _T:

     # ------------------------------------------------------------------------

+    def _values_for_argsort(self):
+        return self._ndarray
+
     def copy(self: _T) -> _T:
         new_data = self._ndarray.copy()
         return self._from_backing_data(new_data)
@@ -135,7 +138,11 @@ def _concat_same_type(cls, to_concat, axis: int = 0):

     @doc(ExtensionArray.searchsorted)
     def searchsorted(self, value, side="left", sorter=None):
-        return searchsorted(self._ndarray, value, side=side, sorter=sorter)
+        value = self._validate_searchsorted_value(value)
+        return self._ndarray.searchsorted(value, side=side, sorter=sorter)
+
+    def _validate_searchsorted_value(self, value):
+        return value

     @doc(ExtensionArray.shift)
     def shift(self, periods=1, fill_value=None, axis=0):
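``searchsorted`` now validates the value through a hook and defers to the backing ndarray. A self-contained sketch of that pattern (the class here is illustrative, not pandas' ``NDArrayBackedExtensionArray``):

    import numpy as np

    class BackedArraySketch:
        def __init__(self, values):
            self._ndarray = np.asarray(values)

        def _validate_searchsorted_value(self, value):
            # base class is a no-op; subclasses coerce value to the backing dtype
            return value

        def searchsorted(self, value, side="left", sorter=None):
            value = self._validate_searchsorted_value(value)
            return self._ndarray.searchsorted(value, side=side, sorter=sorter)

    print(BackedArraySketch([1, 3, 5, 7]).searchsorted(4))  # 2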

pandas/core/arrays/categorical.py

+11-26
@@ -12,7 +12,7 @@
 from pandas._libs import NaT, algos as libalgos, hashtable as htable, lib
 from pandas._typing import ArrayLike, Dtype, Ordered, Scalar
 from pandas.compat.numpy import function as nv
-from pandas.util._decorators import cache_readonly, deprecate_kwarg, doc
+from pandas.util._decorators import cache_readonly, deprecate_kwarg
 from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs

 from pandas.core.dtypes.cast import (
@@ -45,12 +45,7 @@
 import pandas.core.algorithms as algorithms
 from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
-from pandas.core.base import (
-    ExtensionArray,
-    NoNewAttributesMixin,
-    PandasObject,
-    _shared_docs,
-)
+from pandas.core.base import ExtensionArray, NoNewAttributesMixin, PandasObject
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array, sanitize_array
 from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing
@@ -63,6 +58,7 @@

 def _cat_compare_op(op):
     opname = f"__{op.__name__}__"
+    fill_value = True if op is operator.ne else False

     @unpack_zerodim_and_defer(opname)
     def func(self, other):
@@ -97,26 +93,23 @@ def func(self, other):
             else:
                 other_codes = other._codes

-            f = getattr(self._codes, opname)
-            ret = f(other_codes)
+            ret = op(self._codes, other_codes)
             mask = (self._codes == -1) | (other_codes == -1)
             if mask.any():
-                # In other series, the leads to False, so do that here too
-                if opname == "__ne__":
-                    ret[(self._codes == -1) & (other_codes == -1)] = True
-                else:
-                    ret[mask] = False
+                ret[mask] = fill_value
             return ret

         if is_scalar(other):
             if other in self.categories:
                 i = self.categories.get_loc(other)
-                ret = getattr(self._codes, opname)(i)
+                ret = op(self._codes, i)

                 if opname not in {"__eq__", "__ge__", "__gt__"}:
-                    # check for NaN needed if we are not equal or larger
+                    # GH#29820 performance trick; get_loc will always give i>=0,
+                    # so in the cases (__ne__, __le__, __lt__) the setting
+                    # here is a no-op, so can be skipped.
                     mask = self._codes == -1
-                    ret[mask] = False
+                    ret[mask] = fill_value
                 return ret
             else:
                 return ops.invalid_comparison(self, other, op)
@@ -1315,11 +1308,6 @@ def memory_usage(self, deep=False):
         """
         return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep)

-    @doc(_shared_docs["searchsorted"], klass="Categorical")
-    def searchsorted(self, value, side="left", sorter=None):
-        value = self._validate_searchsorted_value(value)
-        return self.codes.searchsorted(value, side=side, sorter=sorter)
-
     def isna(self):
         """
         Detect missing values
@@ -1428,9 +1416,6 @@ def check_for_ordered(self, op):
                 "Categorical to an ordered one\n"
             )

-    def _values_for_argsort(self):
-        return self._codes
-
     def argsort(self, ascending=True, kind="quicksort", **kwargs):
         """
         Return the indices that would sort the Categorical.
@@ -1879,7 +1864,7 @@ def __getitem__(self, key):
         if result.ndim > 1:
             deprecate_ndim_indexing(result)
             return result
-        return self._constructor(result, dtype=self.dtype, fastpath=True)
+        return self._from_backing_data(result)

     def __setitem__(self, key, value):
         """

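The precomputed ``fill_value`` keeps the existing NaN semantics of categorical comparisons: masked positions compare False, except for ``!=`` which compares True. A quick illustration from the public API:

    import pandas as pd

    cat = pd.Categorical(["a", "b", None], categories=["a", "b"])
    print(cat == "a")  # [ True False False]
    print(cat != "a")  # [False  True  True]
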
pandas/core/arrays/datetimelike.py

+5-33
@@ -545,15 +545,18 @@ def __getitem__(self, key):
             result = self._ndarray[key]
             if self.ndim == 1:
                 return self._box_func(result)
-            return self._simple_new(result, dtype=self.dtype)
+            return self._from_backing_data(result)

         key = self._validate_getitem_key(key)
         result = self._ndarray[key]
         if lib.is_scalar(result):
             return self._box_func(result)

+        result = self._from_backing_data(result)
+
         freq = self._get_getitem_freq(key)
-        return self._simple_new(result, dtype=self.dtype, freq=freq)
+        result._freq = freq
+        return result

     def _validate_getitem_key(self, key):
         if com.is_bool_indexer(key):
@@ -714,9 +717,6 @@ def _values_for_factorize(self):
     def _from_factorized(cls, values, original):
         return cls(values, dtype=original.dtype)

-    def _values_for_argsort(self):
-        return self._ndarray
-
     # ------------------------------------------------------------------
     # Validation Methods
     # TODO: try to de-duplicate these, ensure identical behavior
@@ -917,34 +917,6 @@ def _unbox(self, other, setitem: bool = False) -> Union[np.int64, np.ndarray]:
     # These are not part of the EA API, but we implement them because
     # pandas assumes they're there.

-    def searchsorted(self, value, side="left", sorter=None):
-        """
-        Find indices where elements should be inserted to maintain order.
-
-        Find the indices into a sorted array `self` such that, if the
-        corresponding elements in `value` were inserted before the indices,
-        the order of `self` would be preserved.
-
-        Parameters
-        ----------
-        value : array_like
-            Values to insert into `self`.
-        side : {'left', 'right'}, optional
-            If 'left', the index of the first suitable location found is given.
-            If 'right', return the last such index. If there is no suitable
-            index, return either 0 or N (where N is the length of `self`).
-        sorter : 1-D array_like, optional
-            Optional array of integer indices that sort `self` into ascending
-            order. They are typically the result of ``np.argsort``.
-
-        Returns
-        -------
-        indices : array of ints
-            Array of insertion points with the same shape as `value`.
-        """
-        value = self._validate_searchsorted_value(value)
-        return self._data.searchsorted(value, side=side, sorter=sorter)
-
     def value_counts(self, dropna=False):
         """
         Return a Series containing counts of unique values.
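``__getitem__`` now re-wraps the sliced ndarray with ``_from_backing_data`` and assigns ``freq`` afterwards instead of threading it through ``_simple_new``. The user-visible invariant it preserves, sketched against the public API (output format may vary by version):

    import pandas as pd

    dti = pd.date_range("2020-01-01", periods=5, freq="D")
    arr = dti.array          # the backing DatetimeArray
    print(arr[1:3].freq)     # slicing keeps the freq, e.g. <Day>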

pandas/core/arrays/numpy_.py

+9-8
@@ -260,15 +260,19 @@ def __getitem__(self, item):
         return result

     def __setitem__(self, key, value) -> None:
-        value = extract_array(value, extract_numpy=True)
+        key = self._validate_setitem_key(key)
+        value = self._validate_setitem_value(value)
+        self._ndarray[key] = value

-        key = check_array_indexer(self, key)
-        scalar_value = lib.is_scalar(value)
+    def _validate_setitem_value(self, value):
+        value = extract_array(value, extract_numpy=True)

-        if not scalar_value:
+        if not lib.is_scalar(value):
             value = np.asarray(value, dtype=self._ndarray.dtype)
+        return value

-        self._ndarray[key] = value
+    def _validate_setitem_key(self, key):
+        return check_array_indexer(self, key)

     def isna(self) -> np.ndarray:
         return isna(self._ndarray)
@@ -308,9 +312,6 @@ def _validate_fill_value(self, fill_value):
             fill_value = self.dtype.na_value
         return fill_value

-    def _values_for_argsort(self) -> np.ndarray:
-        return self._ndarray
-
     def _values_for_factorize(self) -> Tuple[np.ndarray, int]:
         return self._ndarray, -1

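``__setitem__`` is split into the same validate-then-assign hooks used elsewhere in this commit. A minimal sketch of the pattern with an illustrative class (not pandas' ``PandasArray`` itself):

    import numpy as np

    class NumpyBackedSketch:
        def __init__(self, values):
            self._ndarray = np.asarray(values)

        def _validate_setitem_key(self, key):
            return key  # pandas runs check_array_indexer here

        def _validate_setitem_value(self, value):
            if not np.isscalar(value):
                value = np.asarray(value, dtype=self._ndarray.dtype)
            return value

        def __setitem__(self, key, value):
            key = self._validate_setitem_key(key)
            value = self._validate_setitem_value(value)
            self._ndarray[key] = value

    a = NumpyBackedSketch([1, 2, 3])
    a[0] = 10
    print(a._ndarray)  # [10  2  3]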

pandas/core/construction.py

+1-1
@@ -472,7 +472,7 @@ def sanitize_array(

         # figure out the dtype from the value (upcast if necessary)
         if dtype is None:
-            dtype, value = infer_dtype_from_scalar(value)
+            dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
         else:
             # need to possibly convert the value here
             value = maybe_cast_to_datetime(value, dtype)

pandas/core/dtypes/cast.py

-1
@@ -709,7 +709,6 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj,
     elif pandas_dtype:
         if lib.is_period(val):
             dtype = PeriodDtype(freq=val.freq)
-            val = val.ordinal
         elif lib.is_interval(val):
             subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
             dtype = IntervalDtype(subtype=subtype)
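With ``pandas_dtype=True`` the helper now returns the ``Period`` itself instead of its integer ordinal. A quick check against the (private) helper, assuming the signature as of this commit:

    import pandas as pd
    from pandas.core.dtypes.cast import infer_dtype_from_scalar

    dtype, val = infer_dtype_from_scalar(pd.Period("2020-01", freq="M"), pandas_dtype=True)
    print(dtype)  # period[M]
    print(val)    # Period('2020-01', 'M'), no longer the ordinal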

pandas/core/frame.py

+22-3
@@ -7619,7 +7619,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
         )
         return op.get_result()

-    def applymap(self, func) -> DataFrame:
+    def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
         """
         Apply a function to a Dataframe elementwise.

@@ -7630,6 +7630,10 @@ def applymap(self, func) -> DataFrame:
         ----------
         func : callable
             Python function, returns a single value from a single value.
+        na_action : {None, 'ignore'}, default None
+            If ‘ignore’, propagate NaN values, without passing them to func.
+
+            .. versionadded:: 1.2

         Returns
         -------
@@ -7653,6 +7657,15 @@ def applymap(self, func) -> DataFrame:
         0  3  4
         1  5  5

+        Like Series.map, NA values can be ignored:
+
+        >>> df_copy = df.copy()
+        >>> df_copy.iloc[0, 0] = pd.NA
+        >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore')
+              0  1
+        0  <NA>  4
+        1     5  5
+
         Note that a vectorized version of `func` often exists, which will
         be much faster. You could square each number elementwise.

@@ -7668,11 +7681,17 @@ def applymap(self, func) -> DataFrame:
         0   1.000000   4.494400
         1  11.262736  20.857489
         """
+        if na_action not in {"ignore", None}:
+            raise ValueError(
+                f"na_action must be 'ignore' or None. Got {repr(na_action)}"
+            )
+        ignore_na = na_action == "ignore"
+
         # if we have a dtype == 'M8[ns]', provide boxed values
         def infer(x):
             if x.empty:
-                return lib.map_infer(x, func)
-            return lib.map_infer(x.astype(object)._values, func)
+                return lib.map_infer(x, func, ignore_na=ignore_na)
+            return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na)

         return self.apply(infer)
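A short usage check of the new ``na_action`` parameter and its validation, mirroring the docstring example above (behavior as of pandas 1.2):

    import pandas as pd

    df = pd.DataFrame([[pd.NA, 2.12], [3.356, 4.567]])
    print(df.applymap(lambda x: len(str(x)), na_action="ignore"))

    try:
        df.applymap(str, na_action="bad")
    except ValueError as err:
        print(err)  # na_action must be 'ignore' or None. Got 'bad'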

pandas/core/indexes/interval.py

+2
@@ -589,6 +589,8 @@ def _maybe_convert_i8(self, key):
         if scalar:
             # Timestamp/Timedelta
             key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True)
+            if lib.is_period(key):
+                key_i8 = key.ordinal
         else:
             # DatetimeIndex/TimedeltaIndex
             key_dtype, key_i8 = key.dtype, Index(key.asi8)
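The added guard converts a ``Period`` key to its integer ordinal by hand, since ``infer_dtype_from_scalar`` no longer does that conversion (see cast.py above). A minimal sketch of just that conversion step:

    import pandas as pd
    from pandas._libs import lib

    key = pd.Period("2020-02", freq="M")
    key_i8 = key.ordinal if lib.is_period(key) else key
    print(key_i8)  # the integer used for the IntervalTree lookup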

pandas/tests/dtypes/cast/test_infer_dtype.py

+1-3
@@ -84,13 +84,11 @@ def test_infer_dtype_from_period(freq, pandas_dtype):

     if pandas_dtype:
         exp_dtype = f"period[{freq}]"
-        exp_val = p.ordinal
     else:
         exp_dtype = np.object_
-        exp_val = p

     assert dtype == exp_dtype
-    assert val == exp_val
+    assert val == p


 @pytest.mark.parametrize(