Merge remote-tracking branch 'upstream/master' into pd-todatetime-unit_s-float-vs-int

arw2019 · arw2019 · commit 6f9caeb0fcfb · 2020-09-15T15:33:33.000Z
diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml
@@ -0,0 +1,21 @@
+name: "Stale PRs"
+on:
+  schedule:
+  # * is a special character in YAML so you have to quote this string
+  - cron: "0 */6 * * *"
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/stale@v3
+      with:
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+        stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity."
+        skip-stale-pr-message: false
+        stale-pr-label: "Stale"
+        exempt-pr-labels: "Needs Review,Blocked"
+        days-before-stale: 30
+        days-before-close: -1
+        remove-stale-when-updated: true
+        debug-only: true
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
@@ -9,6 +9,7 @@
 from pandas.core.algorithms import take, unique
 from pandas.core.array_algos.transforms import shift
 from pandas.core.arrays.base import ExtensionArray
+from pandas.core.indexers import check_array_indexer
 
 _T = TypeVar("_T", bound="NDArrayBackedExtensionArray")
 
@@ -156,3 +157,14 @@ def _validate_shift_value(self, fill_value):
         # TODO: after deprecation in datetimelikearraymixin is enforced,
         #  we can remove this and ust validate_fill_value directly
         return self._validate_fill_value(fill_value)
+
+    def __setitem__(self, key, value):
+        key = self._validate_setitem_key(key)
+        value = self._validate_setitem_value(value)
+        self._ndarray[key] = value
+
+    def _validate_setitem_key(self, key):
+        return check_array_indexer(self, key)
+
+    def _validate_setitem_value(self, value):
+        return value
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -93,7 +93,7 @@ def func(self, other):
 
         if is_scalar(other):
             if other in self.categories:
-                i = self.categories.get_loc(other)
+                i = self._unbox_scalar(other)
                 ret = op(self._codes, i)
 
                 if opname not in {"__eq__", "__ge__", "__gt__"}:
@@ -1184,8 +1184,7 @@ def _validate_searchsorted_value(self, value):
         # searchsorted is very performance sensitive. By converting codes
         # to same dtype as self.codes, we get much faster performance.
         if is_scalar(value):
-            codes = self.categories.get_loc(value)
-            codes = self.codes.dtype.type(codes)
+            codes = self._unbox_scalar(value)
         else:
             locs = [self.categories.get_loc(x) for x in value]
             codes = np.array(locs, dtype=self.codes.dtype)
@@ -1212,7 +1211,7 @@ def _validate_fill_value(self, fill_value):
         if isna(fill_value):
             fill_value = -1
         elif fill_value in self.categories:
-            fill_value = self.categories.get_loc(fill_value)
+            fill_value = self._unbox_scalar(fill_value)
         else:
             raise ValueError(
                 f"'fill_value={fill_value}' is not present "
@@ -1680,7 +1679,7 @@ def fillna(self, value=None, method=None, limit=None):
                     if isna(value):
                         codes[mask] = -1
                     else:
-                        codes[mask] = self.categories.get_loc(value)
+                        codes[mask] = self._unbox_scalar(value)
 
             else:
                 raise TypeError(
@@ -1734,6 +1733,17 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
 
         return codes
 
+    def _unbox_scalar(self, key) -> int:
+        # Cast the located code to the dtype of self._codes; searchsorted in
+        # particular is performance sensitive and is much faster with matching dtypes.
+        code = self.categories.get_loc(key)
+        code = self._codes.dtype.type(code)
+        return code
+
+    def _unbox_listlike(self, value):
+        unboxed = self.categories.get_indexer(value)
+        return unboxed.astype(self._ndarray.dtype, copy=False)
+
     # ------------------------------------------------------------------
 
     def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
@@ -1884,20 +1894,6 @@ def __getitem__(self, key):
             return result
         return self._from_backing_data(result)
 
-    def __setitem__(self, key, value):
-        """
-        Item assignment.
-
-        Raises
-        ------
-        ValueError
-            If (one or more) Value is not in categories or if a assigned
-            `Categorical` does not have the same categories
-        """
-        key = self._validate_setitem_key(key)
-        value = self._validate_setitem_value(value)
-        self._ndarray[key] = value
-
     def _validate_setitem_value(self, value):
         value = extract_array(value, extract_numpy=True)
 
@@ -1925,11 +1921,7 @@ def _validate_setitem_value(self, value):
                 "category, set the categories first"
             )
 
-        lindexer = self.categories.get_indexer(rvalue)
-        if isinstance(lindexer, np.ndarray) and lindexer.dtype.kind == "i":
-            lindexer = lindexer.astype(self._ndarray.dtype)
-
-        return lindexer
+        return self._unbox_listlike(rvalue)
 
     def _validate_setitem_key(self, key):
         if lib.is_integer(key):
@@ -2155,8 +2147,7 @@ def unique(self):
         return cat.set_categories(cat.categories.take(take_codes))
 
     def _values_for_factorize(self):
-        codes = self.codes.astype("int64")
-        return codes, -1
+        return self._ndarray, -1
 
     @classmethod
     def _from_factorized(cls, uniques, original):
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -609,9 +609,7 @@ def __setitem__(
         if no_op:
             return
 
-        value = self._validate_setitem_value(value)
-        key = check_array_indexer(self, key)
-        self._ndarray[key] = value
+        super().__setitem__(key, value)
         self._maybe_clear_freq()
 
     def _maybe_clear_freq(self):
@@ -697,7 +695,7 @@ def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT:
         return new_obj
 
     def _values_for_factorize(self):
-        return self.asi8, iNaT
+        return self._ndarray, iNaT
 
     @classmethod
     def _from_factorized(cls, values, original):
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
@@ -259,21 +259,13 @@ def __getitem__(self, item):
             result = type(self)(result)
         return result
 
-    def __setitem__(self, key, value) -> None:
-        key = self._validate_setitem_key(key)
-        value = self._validate_setitem_value(value)
-        self._ndarray[key] = value
-
     def _validate_setitem_value(self, value):
         value = extract_array(value, extract_numpy=True)
 
         if not lib.is_scalar(value):
             value = np.asarray(value, dtype=self._ndarray.dtype)
         return value
 
-    def _validate_setitem_key(self, key):
-        return check_array_indexer(self, key)
-
     def isna(self) -> np.ndarray:
         return isna(self._ndarray)
 
diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py
@@ -10,6 +10,8 @@
 
 import numpy as np
 
+from pandas.compat import PY39
+
 import pandas.core.common as com
 from pandas.core.computation.ops import (
     ARITH_OPS_SYMS,
@@ -186,7 +188,6 @@ def _filter_nodes(superclass, all_nodes=_all_nodes):
 _stmt_nodes = _filter_nodes(ast.stmt)
 _expr_nodes = _filter_nodes(ast.expr)
 _expr_context_nodes = _filter_nodes(ast.expr_context)
-_slice_nodes = _filter_nodes(ast.slice)
 _boolop_nodes = _filter_nodes(ast.boolop)
 _operator_nodes = _filter_nodes(ast.operator)
 _unary_op_nodes = _filter_nodes(ast.unaryop)
@@ -197,6 +198,9 @@ def _filter_nodes(superclass, all_nodes=_all_nodes):
 _keyword_nodes = _filter_nodes(ast.keyword)
 _alias_nodes = _filter_nodes(ast.alias)
 
+if not PY39:
+    _slice_nodes = _filter_nodes(ast.slice)
+
 
 # nodes that we don't support directly but are needed for parsing
 _hacked_nodes = frozenset(["Assign", "Module", "Expr"])
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -143,7 +143,6 @@
 )
 from pandas.core.reshape.melt import melt
 from pandas.core.series import Series
-from pandas.core.sorting import ensure_key_mapped
 
 from pandas.io.common import get_filepath_or_buffer
 from pandas.io.formats import console, format as fmt
@@ -5448,62 +5447,17 @@ def sort_index(
         C  3
         d  4
         """
-        # TODO: this can be combined with Series.sort_index impl as
-        # almost identical
-
-        inplace = validate_bool_kwarg(inplace, "inplace")
-
-        axis = self._get_axis_number(axis)
-        labels = self._get_axis(axis)
-        labels = ensure_key_mapped(labels, key, levels=level)
-
-        # make sure that the axis is lexsorted to start
-        # if not we need to reconstruct to get the correct indexer
-        labels = labels._sort_levels_monotonic()
-        if level is not None:
-            new_axis, indexer = labels.sortlevel(
-                level, ascending=ascending, sort_remaining=sort_remaining
-            )
-
-        elif isinstance(labels, MultiIndex):
-            from pandas.core.sorting import lexsort_indexer
-
-            indexer = lexsort_indexer(
-                labels._get_codes_for_sorting(),
-                orders=ascending,
-                na_position=na_position,
-            )
-        else:
-            from pandas.core.sorting import nargsort
-
-            # Check monotonic-ness before sort an index
-            # GH11080
-            if (ascending and labels.is_monotonic_increasing) or (
-                not ascending and labels.is_monotonic_decreasing
-            ):
-                if inplace:
-                    return
-                else:
-                    return self.copy()
-
-            indexer = nargsort(
-                labels, kind=kind, ascending=ascending, na_position=na_position
-            )
-
-        baxis = self._get_block_manager_axis(axis)
-        new_data = self._mgr.take(indexer, axis=baxis, verify=False)
-
-        # reconstruct axis if needed
-        new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
-
-        if ignore_index:
-            new_data.axes[1] = ibase.default_index(len(indexer))
-
-        result = self._constructor(new_data)
-        if inplace:
-            return self._update_inplace(result)
-        else:
-            return result.__finalize__(self, method="sort_index")
+        return super().sort_index(
+            axis,
+            level,
+            ascending,
+            inplace,
+            kind,
+            na_position,
+            sort_remaining,
+            ignore_index,
+            key,
+        )
 
     def value_counts(
         self,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -40,6 +40,7 @@
     CompressionOptions,
     FilePathOrBuffer,
     FrameOrSeries,
+    IndexKeyFunc,
     IndexLabel,
     JSONSerializable,
     Label,
@@ -92,6 +93,7 @@
 import pandas.core.common as com
 from pandas.core.construction import create_series_with_explicit_dtype
 from pandas.core.flags import Flags
+from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index
 from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.core.indexes.period import Period, PeriodIndex
@@ -100,6 +102,7 @@
 from pandas.core.missing import find_valid_index
 from pandas.core.ops import align_method_FRAME
 from pandas.core.shared_docs import _shared_docs
+from pandas.core.sorting import get_indexer_indexer
 from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window
 
 from pandas.io.formats import format as fmt
@@ -4409,6 +4412,50 @@ def sort_values(
         """
         raise AbstractMethodError(self)
 
+    def sort_index(
+        self,
+        axis=0,
+        level=None,
+        ascending: bool_t = True,
+        inplace: bool_t = False,
+        kind: str = "quicksort",
+        na_position: str = "last",
+        sort_remaining: bool_t = True,
+        ignore_index: bool_t = False,
+        key: IndexKeyFunc = None,
+    ):
+
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        axis = self._get_axis_number(axis)
+        target = self._get_axis(axis)
+
+        indexer = get_indexer_indexer(
+            target, level, ascending, kind, na_position, sort_remaining, key
+        )
+
+        if indexer is None:
+            if inplace:
+                return
+            else:
+                return self.copy()
+
+        baxis = self._get_block_manager_axis(axis)
+        new_data = self._mgr.take(indexer, axis=baxis, verify=False)
+
+        # reconstruct axis if needed
+        new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
+
+        if ignore_index:
+            axis = 1 if isinstance(self, ABCDataFrame) else 0
+            new_data.axes[axis] = ibase.default_index(len(indexer))
+
+        result = self._constructor(new_data)
+
+        if inplace:
+            return self._update_inplace(result)
+        else:
+            return result.__finalize__(self, method="sort_index")
+
     @doc(
         klass=_shared_doc_kwargs["klass"],
         axes=_shared_doc_kwargs["axes"],
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -2250,7 +2250,7 @@ def fillna(self, value=None, downcast=None):
         DataFrame.fillna : Fill NaN values of a DataFrame.
         Series.fillna : Fill NaN Values of a Series.
         """
-        self._assert_can_do_op(value)
+        value = self._validate_scalar(value)
         if self.hasnans:
             result = self.putmask(self._isnan, value)
             if downcast is None:
@@ -4053,12 +4053,14 @@ def _validate_fill_value(self, value):
         """
         return value
 
-    def _assert_can_do_op(self, value):
+    def _validate_scalar(self, value):
         """
-        Check value is valid for scalar op.
+        Check that this is a scalar value that we can use for setitem-like
+        operations without changing dtype.
         """
         if not is_scalar(value):
             raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}")
+        return value
 
     @property
     def _has_complex_internals(self) -> bool:
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
diff --git a/pandas/core/series.py b/pandas/core/series.py
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py