
Commit 1c7e63c

Merge branch 'categorical-fillna' of https://github.com/MarcoGorelli/pandas into categorical-fillna
2 parents fee3444 + 90bfabc commit 1c7e63c

19 files changed (+111, -111 lines)

ci/deps/azure-37-locale.yaml (+1, -1)

@@ -17,7 +17,7 @@ dependencies:
   - openpyxl
   - pytables
   - python-dateutil
-  - python=3.7.3
+  - python=3.7.*
   - pytz
   - s3fs
   - scipy

ci/deps/azure-37-numpydev.yaml (+1, -1)

@@ -2,7 +2,7 @@ name: pandas-dev
 channels:
   - defaults
 dependencies:
-  - python=3.7.3
+  - python=3.7.*
   - pytz
   - Cython>=0.28.2
   # universal

ci/deps/travis-37.yaml (+1, -1)

@@ -4,7 +4,7 @@ channels:
   - conda-forge
   - c3i_test
 dependencies:
-  - python=3.7.3
+  - python=3.7.*
   - botocore>=1.11
   - cython>=0.28.2
   - numpy

doc/source/whatsnew/v0.25.1.rst (+3, -1)

@@ -21,6 +21,8 @@ Other enhancements
 Bug fixes
 ~~~~~~~~~
 
+- Bug in :meth:`Categorical.fillna` that would replace all values, not just those that are NaN (:issue:`26215`)
+
 
 Categorical
 ^^^^^^^^^^^
@@ -83,7 +85,7 @@ Indexing
 ^^^^^^^^
 
 - Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`)
-- Break reference cycle involving :class:`Index` to allow garbage collection of :class:`Index` objects without running the GC. (:issue:`27585`)
+- Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`)
 - Fix regression in assigning values to a single column of a DataFrame with a ``MultiIndex`` columns (:issue:`27841`).
 -
 

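As a quick illustration of the whatsnew entry above (a user-level sketch, not part of the commit): after the fix, filling a categorical Series from another Series only touches the missing positions, and the fill values still have to be existing categories.

    import numpy as np
    import pandas as pd

    # Categorical data with one missing value at position 1; categories are {a, b}.
    s = pd.Series(["a", np.nan, "b"], dtype="category")

    # Fill from another Series (aligned on the index). Only the NaN slot should
    # change; positions 0 and 2 must keep their original values.
    filled = s.fillna(pd.Series(["b", "a", "b"]))
    print(filled.tolist())  # expected with the fix: ['a', 'a', 'b']

Before the fix, every position whose fill value was a valid category could be overwritten, which is the behaviour reported in :issue:`26215`.
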
doc/source/whatsnew/v1.0.0.rst (+6, -6)

@@ -21,27 +21,27 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_1000.enhancements.other:
-
 -
 -
 
+.. _whatsnew_1000.enhancements.other:
+
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
-.. _whatsnew_1000.api_breaking:
-
 -
 -
 
+.. _whatsnew_1000.api_breaking:
+
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. _whatsnew_1000.api.other:
-
 - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`).
 -
 
+.. _whatsnew_1000.api.other:
+
 Other API changes
 ^^^^^^^^^^^^^^^^^
 

environment.yml (+1, -1)

@@ -5,7 +5,7 @@ channels:
 dependencies:
   # required
   - numpy>=1.15
-  - python=3.7.3
+  - python=3
   - python-dateutil>=2.6.1
   - pytz
 

pandas/core/algorithms.py (+1, -11)

@@ -28,13 +28,11 @@
     is_complex_dtype,
     is_datetime64_any_dtype,
     is_datetime64_ns_dtype,
-    is_datetime64tz_dtype,
     is_datetimelike,
     is_extension_array_dtype,
     is_float_dtype,
     is_integer,
     is_integer_dtype,
-    is_interval_dtype,
     is_list_like,
     is_numeric_dtype,
     is_object_dtype,
@@ -183,8 +181,6 @@ def _reconstruct_data(values, dtype, original):
 
     if is_extension_array_dtype(dtype):
         values = dtype.construct_array_type()._from_sequence(values)
-    elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
-        values = Index(original)._shallow_copy(values, name=None)
     elif is_bool_dtype(dtype):
         values = values.astype(dtype)
 
@@ -1645,19 +1641,13 @@ def take_nd(
         May be the same type as the input, or cast to an ndarray.
     """
 
-    # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs
-    # dispatch to internal type takes
    if is_extension_array_dtype(arr):
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
-    elif is_datetime64tz_dtype(arr):
-        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
-    elif is_interval_dtype(arr):
-        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
 
     if is_sparse(arr):
         arr = arr.to_dense()
     elif isinstance(arr, (ABCIndexClass, ABCSeries)):
-        arr = arr.values
+        arr = arr._values
 
     arr = np.asarray(arr)
 

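The branches removed above relied on timezone-aware datetimes and intervals being special cases; both now register as extension-array dtypes, so the generic `is_extension_array_dtype` branch covers them. A quick check via the public API (illustrative only, not part of the commit):

    import pandas as pd
    from pandas.api.types import is_extension_array_dtype

    tz_ser = pd.Series(pd.date_range("2019-01-01", periods=3, tz="UTC"))
    iv_ser = pd.Series(pd.interval_range(0, 3))

    # Both dtypes report as extension-array dtypes, so the generic EA path in
    # take_nd / _reconstruct_data handles them without the removed elif branches.
    print(is_extension_array_dtype(tz_ser.dtype))  # True
    print(is_extension_array_dtype(iv_ser.dtype))  # True
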
pandas/core/arrays/categorical.py (+2, -4)

@@ -1824,7 +1824,6 @@ def fillna(self, value=None, method=None, limit=None):
 
         # pad / bfill
         if method is not None:
-
             values = self.to_dense().reshape(-1, len(self))
             values = interpolate_2d(values, method, 0, None, value).astype(
                 self.categories.dtype
@@ -1838,10 +1837,9 @@ def fillna(self, value=None, method=None, limit=None):
             if isinstance(value, ABCSeries):
                 if not value[~value.isin(self.categories)].isna().all():
                     raise ValueError("fill value must be in categories")
-
                 values_codes = _get_codes_for_values(value, self.categories)
-                indexer = np.where(values_codes != -1)
-                codes[indexer] = values_codes[values_codes != -1]
+                indexer = np.where(codes == -1)
+                codes[indexer] = values_codes[codes == -1]
 
         # If value is not a dict or Series it should be a scalar
         elif is_hashable(value):

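The heart of the fillna fix is which mask drives the assignment: missing entries in a Categorical are stored as code -1, so only those slots should receive the aligned fill codes. A standalone NumPy sketch of the before/after masking (variable names mirror the diff, but this is not pandas internals):

    import numpy as np

    codes = np.array([0, -1, 2, -1])         # -1 marks missing, like Categorical codes
    values_codes = np.array([1, 1, 1, 2])    # codes of the aligned fill values

    # Old (buggy) mask: built from the fill codes, so every position with a
    # valid fill value gets overwritten.
    buggy = codes.copy()
    buggy[values_codes != -1] = values_codes[values_codes != -1]

    # New mask: built from the existing codes, so only missing slots are filled.
    fixed = codes.copy()
    fixed[codes == -1] = values_codes[codes == -1]

    print(buggy.tolist())  # [1, 1, 1, 2]
    print(fixed.tolist())  # [0, 1, 2, 2]
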
pandas/core/frame.py (+5, -4)

@@ -775,7 +775,8 @@ def style(self):
         Iterates over the DataFrame columns, returning a tuple with
         the column name and the content as a Series.
 
-        %s
+        Yields
+        ------
         label : object
             The column names for the DataFrame being iterated over.
         content : Series
@@ -816,7 +817,7 @@ def style(self):
         Name: population, dtype: int64
         """
 
-    @Appender(_shared_docs["items"] % "Yields\n        ------")
+    @Appender(_shared_docs["items"])
     def items(self):
         if self.columns.is_unique and hasattr(self, "_item_cache"):
             for k in self.columns:
@@ -825,9 +826,9 @@ def items(self):
             for i, k in enumerate(self.columns):
                 yield k, self._ixs(i, axis=1)
 
-    @Appender(_shared_docs["items"] % "Returns\n        -------")
+    @Appender(_shared_docs["items"])
     def iteritems(self):
-        return self.items()
+        yield from self.items()
 
     def iterrows(self):
         """

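The `iteritems` change swaps `return self.items()` for `yield from self.items()`. Both produce the same (label, Series) pairs; the difference is that `yield from` makes the wrapper a generator function in its own right, so its body is not executed until iteration begins. A generic Python sketch of the distinction (not pandas code):

    import inspect

    def source():
        yield 1
        yield 2

    def returns_generator():
        # Ordinary function: runs immediately and hands back source()'s generator.
        return source()

    def delegates():
        # Generator function: calling it is lazy; items are forwarded one by one.
        yield from source()

    print(list(returns_generator()))  # [1, 2]
    print(list(delegates()))          # [1, 2]
    print(inspect.isgeneratorfunction(returns_generator))  # False
    print(inspect.isgeneratorfunction(delegates))          # True
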
pandas/core/indexes/base.py (+1, -12)

@@ -665,7 +665,7 @@ def _cleanup(self):
     def _engine(self):
         # property, for now, slow to look up
 
-        # to avoid a refernce cycle, bind `_ndarray_values` to a local variable, so
+        # to avoid a reference cycle, bind `_ndarray_values` to a local variable, so
         # `self` is not passed into the lambda.
         _ndarray_values = self._ndarray_values
         return self._engine_type(lambda: _ndarray_values, len(self))
@@ -5341,16 +5341,6 @@ def _maybe_update_attributes(self, attrs):
         """
         return attrs
 
-    def _validate_for_numeric_unaryop(self, op, opstr):
-        """
-        Validate if we can perform a numeric unary operation.
-        """
-        if not self._is_numeric_dtype:
-            raise TypeError(
-                "cannot evaluate a numeric op "
-                "{opstr} for type: {typ}".format(opstr=opstr, typ=type(self).__name__)
-            )
-
     @classmethod
     def _add_numeric_methods_binary(cls):
         """
@@ -5383,7 +5373,6 @@ def _add_numeric_methods_unary(cls):
         def _make_evaluate_unary(op, opstr):
             def _evaluate_numeric_unary(self):
 
-                self._validate_for_numeric_unaryop(op, opstr)
                 attrs = self._get_attributes_dict()
                 attrs = self._maybe_update_attributes(attrs)
                 return Index(op(self.values), **attrs)

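Both `_engine` rewrites in this commit follow the same pattern: a lambda that closes over `self` and is then cached on the instance forms a reference cycle, so the index can only be freed by the cyclic garbage collector; binding the needed attribute to a local first means the closure captures only that object. A minimal sketch of the effect, independent of pandas (class names are illustrative):

    import gc
    import weakref

    class Cyclic:
        def build_engine(self):
            # Closure captures `self`; storing it on the instance creates
            # self -> _engine (lambda) -> self.
            self._engine = lambda: self.data

    class Acyclic:
        def build_engine(self):
            # Bind to a local first, so the closure captures only `data`.
            data = self.data
            self._engine = lambda: data

    def freed_without_gc(cls):
        obj = cls()
        obj.data = [1, 2, 3]
        obj.build_engine()
        ref = weakref.ref(obj)
        del obj  # rely on reference counting only
        return ref() is None

    gc.disable()
    try:
        print(freed_without_gc(Cyclic))   # False: the cycle keeps it alive
        print(freed_without_gc(Acyclic))  # True: freed immediately
    finally:
        gc.enable()
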
pandas/core/indexes/category.py (+5, -3)

@@ -446,9 +446,11 @@ def argsort(self, *args, **kwargs):
 
     @cache_readonly
     def _engine(self):
-
-        # we are going to look things up with the codes themselves
-        return self._engine_type(lambda: self.codes, len(self))
+        # we are going to look things up with the codes themselves.
+        # To avoid a reference cycle, bind `codes` to a local variable, so
+        # `self` is not passed into the lambda.
+        codes = self.codes
+        return self._engine_type(lambda: codes, len(self))
 
     # introspection
     @cache_readonly

pandas/core/indexes/period.py (+4, -1)

@@ -1,5 +1,6 @@
 from datetime import datetime, timedelta
 import warnings
+import weakref
 
 import numpy as np
 
@@ -441,7 +442,9 @@ def _formatter_func(self):
 
     @cache_readonly
     def _engine(self):
-        return self._engine_type(lambda: self, len(self))
+        # To avoid a reference cycle, pass a weakref of self to _engine_type.
+        period = weakref.ref(self)
+        return self._engine_type(period, len(self))
 
     @Appender(_index_shared_docs["contains"])
     def __contains__(self, key):

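PeriodIndex takes a slightly different route: its engine needs the index object itself rather than a bare ndarray, so the commit hands it `weakref.ref(self)`, which the engine can dereference on demand without keeping the index alive. A small stand-in sketch of the pattern (FakeEngine and FakeIndex are made up for illustration, not the pandas engine API):

    import weakref

    class FakeEngine:
        """Stand-in for an _engine_type: stores a weakref, dereferences when used."""

        def __init__(self, index_ref, length):
            self._index_ref = index_ref  # weakref.ref, not a strong reference
            self._length = length

        def lookup(self, value):
            index = self._index_ref()
            if index is None:
                raise RuntimeError("index has been garbage collected")
            return index.data.index(value)

    class FakeIndex:
        def __init__(self, data):
            self.data = list(data)
            # engine -> index is a weak edge, so no reference cycle is formed
            self._engine = FakeEngine(weakref.ref(self), len(self.data))

    idx = FakeIndex(["2019Q1", "2019Q2", "2019Q3"])
    print(idx._engine.lookup("2019Q2"))  # 1
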
pandas/core/ops/__init__.py (+50, -53)

@@ -47,7 +47,7 @@
 
 import pandas as pd
 from pandas._typing import ArrayLike
-from pandas.core.construction import extract_array
+from pandas.core.construction import array, extract_array
 from pandas.core.ops import missing
 from pandas.core.ops.docstrings import (
     _arith_doc_FRAME,
@@ -460,6 +460,33 @@ def masked_arith_op(x, y, op):
 # Dispatch logic
 
 
+def should_extension_dispatch(left: ABCSeries, right: Any) -> bool:
+    """
+    Identify cases where Series operation should use dispatch_to_extension_op.
+
+    Parameters
+    ----------
+    left : Series
+    right : object
+
+    Returns
+    -------
+    bool
+    """
+    if (
+        is_extension_array_dtype(left.dtype)
+        or is_datetime64_dtype(left.dtype)
+        or is_timedelta64_dtype(left.dtype)
+    ):
+        return True
+
+    if is_extension_array_dtype(right) and not is_scalar(right):
+        # GH#22378 disallow scalar to exclude e.g. "category", "Int64"
+        return True
+
+    return False
+
+
 def should_series_dispatch(left, right, op):
     """
     Identify cases where a DataFrame operation should dispatch to its
@@ -564,19 +591,18 @@ def dispatch_to_extension_op(op, left, right):
     apply the operator defined by op.
     """
 
+    if left.dtype.kind in "mM":
+        # We need to cast datetime64 and timedelta64 ndarrays to
+        # DatetimeArray/TimedeltaArray. But we avoid wrapping others in
+        # PandasArray as that behaves poorly with e.g. IntegerArray.
+        left = array(left)
+
     # The op calls will raise TypeError if the op is not defined
     # on the ExtensionArray
 
     # unbox Series and Index to arrays
-    if isinstance(left, (ABCSeries, ABCIndexClass)):
-        new_left = left._values
-    else:
-        new_left = left
-
-    if isinstance(right, (ABCSeries, ABCIndexClass)):
-        new_right = right._values
-    else:
-        new_right = right
+    new_left = extract_array(left, extract_numpy=True)
+    new_right = extract_array(right, extract_numpy=True)
 
     try:
         res_values = op(new_left, new_right)
@@ -684,56 +710,27 @@ def wrapper(left, right):
         res_name = get_op_result_name(left, right)
         right = maybe_upcast_for_op(right, left.shape)
 
-        if is_categorical_dtype(left):
-            raise TypeError(
-                "{typ} cannot perform the operation "
-                "{op}".format(typ=type(left).__name__, op=str_rep)
-            )
-
-        elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left):
-            from pandas.core.arrays import DatetimeArray
-
-            result = dispatch_to_extension_op(op, DatetimeArray(left), right)
-            return construct_result(left, result, index=left.index, name=res_name)
-
-        elif is_extension_array_dtype(left) or (
-            is_extension_array_dtype(right) and not is_scalar(right)
-        ):
-            # GH#22378 disallow scalar to exclude e.g. "category", "Int64"
+        if should_extension_dispatch(left, right):
             result = dispatch_to_extension_op(op, left, right)
-            return construct_result(left, result, index=left.index, name=res_name)
 
-        elif is_timedelta64_dtype(left):
-            from pandas.core.arrays import TimedeltaArray
-
-            result = dispatch_to_extension_op(op, TimedeltaArray(left), right)
-            return construct_result(left, result, index=left.index, name=res_name)
-
-        elif is_timedelta64_dtype(right):
-            # We should only get here with non-scalar values for right
-            # upcast by maybe_upcast_for_op
+        elif is_timedelta64_dtype(right) or isinstance(
+            right, (ABCDatetimeArray, ABCDatetimeIndex)
+        ):
+            # We should only get here with td64 right with non-scalar values
+            # for right upcast by maybe_upcast_for_op
             assert not isinstance(right, (np.timedelta64, np.ndarray))
-
             result = op(left._values, right)
 
-            # We do not pass dtype to ensure that the Series constructor
-            # does inference in the case where `result` has object-dtype.
-            return construct_result(left, result, index=left.index, name=res_name)
-
-        elif isinstance(right, (ABCDatetimeArray, ABCDatetimeIndex)):
-            result = op(left._values, right)
-            return construct_result(left, result, index=left.index, name=res_name)
+        else:
+            lvalues = extract_array(left, extract_numpy=True)
+            rvalues = extract_array(right, extract_numpy=True)
 
-        lvalues = left.values
-        rvalues = right
-        if isinstance(rvalues, (ABCSeries, ABCIndexClass)):
-            rvalues = rvalues._values
+            with np.errstate(all="ignore"):
+                result = na_op(lvalues, rvalues)
 
-        with np.errstate(all="ignore"):
-            result = na_op(lvalues, rvalues)
-        return construct_result(
-            left, result, index=left.index, name=res_name, dtype=None
-        )
+        # We do not pass dtype to ensure that the Series constructor
+        # does inference in the case where `result` has object-dtype.
+        return construct_result(left, result, index=left.index, name=res_name)
 
     wrapper.__name__ = op_name
     return wrapper

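The refactor concentrates the arithmetic dispatch decision in `should_extension_dispatch`: extension, datetime64 and timedelta64 Series go down the extension-array path, a non-scalar extension-dtype right operand also forces it, and everything else falls back to the plain `na_op` on extracted NumPy values. The dtype predicates it uses have public counterparts, so the rule can be mimicked outside pandas internals; `should_dispatch` below is a hypothetical stand-in written against `pandas.api.types`, not the private function from the diff:

    import pandas as pd
    from pandas.api.types import (
        is_datetime64_dtype,
        is_extension_array_dtype,
        is_scalar,
        is_timedelta64_dtype,
    )

    def should_dispatch(left: pd.Series, right) -> bool:
        """Dispatch when `left` is extension/datetime64/timedelta64-dtyped,
        or when `right` is a non-scalar extension-dtype object."""
        if (
            is_extension_array_dtype(left.dtype)
            or is_datetime64_dtype(left.dtype)
            or is_timedelta64_dtype(left.dtype)
        ):
            return True
        if is_extension_array_dtype(right) and not is_scalar(right):
            return True
        return False

    print(should_dispatch(pd.Series(pd.date_range("2019", periods=2)), 1))      # True
    print(should_dispatch(pd.Series([1, 2]), pd.array([1, 2], dtype="Int64")))  # True
    print(should_dispatch(pd.Series([1.0, 2.0]), 3))                            # False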