Merge remote-tracking branch 'upstream/master' into disown-tz-only-rebased

TomAugspurger · TomAugspurger · commit b046791fd1ac · 2018-12-21T09:13:24.000-06:00
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -1145,7 +1145,8 @@ dtype in apply
 
 Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get
 a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a
-basic type) and applying along columns will also convert to object.
+basic type) and applying along columns will also convert to object. ``NaN`` values are unaffected.
+You can use ``fillna`` to handle missing values before applying a function.
 
 .. ipython:: python
 
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1013,7 +1013,8 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 
 **Other changes**
 
-- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`)
+- :meth:`~pandas.api.extensions.ExtensionArray.dropna` has been added (:issue:`21185`)
+- :meth:`~pandas.api.extensions.ExtensionArray.repeat` has been added (:issue:`24349`)
 - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore
   the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`)
 - An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`)
@@ -1310,6 +1311,7 @@ Categorical
 - Bug when resampling :meth:`DataFrame.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`)
 - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`)
 - Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`)
+- Bug in :meth:`Categorical.map` where ``NaN`` values could be handled unpredictably. They now remain unchanged (:issue:`24241`)
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -580,6 +580,35 @@ def factorize(self, na_sentinel=-1):
         uniques = self._from_factorized(uniques, self)
         return labels, uniques
 
+    def repeat(self, repeats, axis=None):
+        """
+        Repeat elements of an array.
+
+        Each element is repeated `repeats` times consecutively, so the
+        result has ``len(self) * repeats`` elements.
+
+        .. versionadded:: 0.24.0
+
+        Parameters
+        ----------
+        repeats : int
+            This should be a non-negative integer. Repeating 0 times
+            will return an empty array.
+        axis : None
+            Must be ``None``; any other value raises ``ValueError``.
+
+        Returns
+        -------
+        repeated_array : ExtensionArray
+            Same type as the input, with elements repeated `repeats` times.
+
+        Raises
+        ------
+        ValueError
+            If `axis` is not ``None`` or if `repeats` is negative.
+
+        See Also
+        --------
+        numpy.repeat : Similar method for :class:`numpy.ndarray`.
+        ExtensionArray.take : Take arbitrary positions.
+        """
+        if axis is not None:
+            raise ValueError("'axis' must be None.")
+        if repeats < 0:
+            raise ValueError("negative repeats are not allowed.")
+        # Repeat the positions 0..len(self)-1 and let ``take`` gather the
+        # elements found at those positions.
+        ind = np.arange(len(self)).repeat(repeats)
+        return self.take(ind)
+
     # ------------------------------------------------------------------------
     # Indexing methods
     # ------------------------------------------------------------------------
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1166,7 +1166,7 @@ def map(self, mapper):
         Maps the categories to new categories. If the mapping correspondence is
         one-to-one the result is a :class:`~pandas.Categorical` which has the
         same order property as the original, otherwise a :class:`~pandas.Index`
-        is returned.
+        is returned. NaN values are unaffected.
 
         If a `dict` or :class:`~pandas.Series` is used any unmapped category is
         mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
@@ -1234,6 +1234,11 @@ def map(self, mapper):
                                    categories=new_categories,
                                    ordered=self.ordered)
         except ValueError:
+            # NA values are represented in self._codes with -1
+            # np.take causes NA values to take final element in new_categories
+            if np.any(self._codes == -1):
+                new_categories = new_categories.insert(len(new_categories),
+                                                       np.nan)
             return np.take(new_categories, self._codes)
 
     __eq__ = _cat_compare_op('__eq__')
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -264,3 +264,34 @@ def test_where_series(self, data, na_value, as_frame):
         if as_frame:
             expected = expected.to_frame(name='a')
         self.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("as_series", [True, False])
+    @pytest.mark.parametrize("repeats", [0, 1, 2])
+    def test_repeat(self, data, repeats, as_series):
+        """
+        Check ``repeat`` for 0, 1 and 2 repeats, on an array and a Series.
+        """
+        # Only the first three elements of the fixture are used.
+        a, b, c = data[:3]
+        arr = type(data)._from_sequence([a, b, c], dtype=data.dtype)
+
+        if as_series:
+            arr = pd.Series(arr)
+
+        result = arr.repeat(repeats)
+
+        # Spell out the expected element order for each parametrized count.
+        if repeats == 0:
+            expected = []
+        elif repeats == 1:
+            expected = [a, b, c]
+        else:
+            expected = [a, a, b, b, c, c]
+        expected = type(data)._from_sequence(expected, dtype=data.dtype)
+        if as_series:
+            # Derive the expected index by repeating a plain integer Series
+            # of the same length and taking its index.
+            index = pd.Series(np.arange(len(arr))).repeat(repeats).index
+            expected = pd.Series(expected, index=index)
+        self.assert_equal(result, expected)
+
+    def test_repeat_raises(self, data):
+        """
+        Check that ``repeat`` raises ``ValueError`` for unsupported arguments.
+        """
+        # A non-None axis is rejected.
+        with pytest.raises(ValueError, match="'axis'"):
+            data.repeat(2, axis=1)
+
+        # A negative repeat count is rejected.
+        with pytest.raises(ValueError,
+                           match="negative"):
+            data.repeat(-1)
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
@@ -311,6 +311,29 @@ def test_map_with_categorical_series(self):
         exp = pd.Index(["odd", "even", "odd", np.nan])
         tm.assert_index_equal(a.map(c), exp)
 
+    # Each mapper kind (callable, dict, Series) is exercised against data
+    # with one and with two distinct non-NaN values.
+    @pytest.mark.parametrize(
+        (
+            'data',
+            'f'
+        ),
+        (
+            ([1, 1, np.nan], pd.isna),
+            ([1, 2, np.nan], pd.isna),
+            ([1, 1, np.nan], {1: False}),
+            ([1, 2, np.nan], {1: False, 2: False}),
+            ([1, 1, np.nan], pd.Series([False, False])),
+            ([1, 2, np.nan], pd.Series([False, False, False]))
+        ))
+    def test_map_with_nan(self, data, f):  # GH 24241
+        """
+        Check that ``Categorical.map`` leaves NaN entries as NaN.
+        """
+        values = pd.Categorical(data)
+        result = values.map(f)
+        if data[1] == 1:
+            # Data is [1, 1, nan]: a single category, so the mapping is
+            # one-to-one and a Categorical is expected.
+            expected = pd.Categorical([False, False, np.nan])
+            tm.assert_categorical_equal(result, expected)
+        else:
+            # Data is [1, 2, nan]: both categories map to False, so the
+            # mapping is not one-to-one and an Index is expected.
+            expected = pd.Index([False, False, np.nan])
+            tm.assert_index_equal(result, expected)
+
     @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series])
     def test_where(self, klass):
         i = self.create_index()