Merge remote-tracking branch 'upstream/master' into apply-regr-31505

jorisvandenbossche · jorisvandenbossche · commit a2d4fd50da20 · 2020-02-04T22:36:47.000+01:00
diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
@@ -1973,7 +1973,7 @@ Pandas has two ways to store strings.
 1. ``object`` dtype, which can hold any Python object, including strings.
 2. :class:`StringDtype`, which is dedicated to strings.
 
-Generally, we recommend using :class:`StringDtype`. See :ref:`text.types` fore more.
+Generally, we recommend using :class:`StringDtype`. See :ref:`text.types` for more.
 
 Finally, arbitrary objects may be stored using the ``object`` dtype, but should
 be avoided to the extent possible (for performance and interoperability with
diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst
@@ -22,10 +22,12 @@ Fixed regressions
 - Fixed regression in :meth:`GroupBy.apply` if called with a function which returned a non-pandas non-scalar object (e.g. a list or numpy array) (:issue:`31441`)
 - Fixed regression in :meth:`to_datetime` when parsing non-nanosecond resolution datetimes (:issue:`31491`)
 - Fixed regression in :meth:`~DataFrame.to_csv` where specifying an ``na_rep`` might truncate the values written (:issue:`31447`)
+- Fixed regression in :class:`Categorical` construction with ``numpy.str_`` categories (:issue:`31499`)
 - Fixed regression where setting :attr:`pd.options.display.max_colwidth` was not accepting negative integer. In addition, this behavior has been deprecated in favor of using ``None`` (:issue:`31532`)
 - Fixed regression in objTOJSON.c fix return-type warning (:issue:`31463`)
 - Fixed regression in :meth:`qcut` when passed a nullable integer. (:issue:`31389`)
 - Fixed regression in assigning to a :class:`Series` using a nullable integer dtype (:issue:`31446`)
+- Fixed performance regression when indexing a ``DataFrame`` or ``Series`` with a :class:`MultiIndex` for the index using a list of labels (:issue:`31648`)
 
 .. ---------------------------------------------------------------------------
 
@@ -56,6 +58,9 @@ Bug fixes
 
 - Plotting tz-aware timeseries no longer gives UserWarning (:issue:`31205`)
 
+**Interval**
+
+- Bug in :meth:`Series.shift` with ``interval`` dtype raising a ``TypeError`` when shifting an interval array of integers or datetimes (:issue:`31495`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -670,7 +670,9 @@ cdef class StringHashTable(HashTable):
             val = values[i]
 
             if isinstance(val, str):
-                v = get_c_string(val)
+                # GH#31499 if we have a np.str_, get_c_string won't recognize
+                #  it as a str, even though isinstance does.
+                v = get_c_string(<str>val)
             else:
                 v = get_c_string(self.na_string_sentinel)
             vecs[i] = v
@@ -703,7 +705,9 @@ cdef class StringHashTable(HashTable):
             val = values[i]
 
             if isinstance(val, str):
-                v = get_c_string(val)
+                # GH#31499 if we have a np.str_, get_c_string won't recognize
+                #  it as a str, even though isinstance does.
+                v = get_c_string(<str>val)
             else:
                 v = get_c_string(self.na_string_sentinel)
             vecs[i] = v
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -27,6 +27,7 @@
 from pandas.core.dtypes.dtypes import IntervalDtype
 from pandas.core.dtypes.generic import (
     ABCDatetimeIndex,
+    ABCExtensionArray,
     ABCIndexClass,
     ABCInterval,
     ABCIntervalIndex,
@@ -789,6 +790,33 @@ def size(self) -> int:
         # Avoid materializing self.values
         return self.left.size
 
+    def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray:
+        if not len(self) or periods == 0:
+            return self.copy()
+
+        if isna(fill_value):
+            fill_value = self.dtype.na_value
+
+        # ExtensionArray.shift doesn't work for two reasons
+        # 1. IntervalArray.dtype.na_value may not be correct for the dtype.
+        # 2. IntervalArray._from_sequence only accepts NaN for missing values,
+        #    not other values like NaT
+
+        empty_len = min(abs(periods), len(self))
+        if isna(fill_value):
+            fill_value = self.left._na_value
+            empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1))
+        else:
+            empty = self._from_sequence([fill_value] * empty_len)
+
+        if periods > 0:
+            a = empty
+            b = self[:-periods]
+        else:
+            a = self[abs(periods) :]
+            b = empty
+        return self._concat_same_type([a, b])
+
     def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs):
         """
         Take elements from the IntervalArray.
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -677,8 +677,11 @@ def __len__(self) -> int:
     # --------------------------------------------------------------------
     # Levels Methods
 
-    @property
+    @cache_readonly
     def levels(self):
+        # Use cache_readonly to ensure that self.get_locs doesn't repeatedly
+        # create a new IndexEngine
+        # https://github.com/pandas-dev/pandas/issues/31648
         result = [
             x._shallow_copy(name=name) for x, name in zip(self._levels, self._names)
         ]
@@ -1302,6 +1305,9 @@ def _set_names(self, names, level=None, validate=True):
                     )
             self._names[lev] = name
 
+        # If .levels has been accessed, the names in our cache will be stale.
+        self._reset_cache()
+
     names = property(
         fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n"""
     )
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -408,6 +408,11 @@ def test_constructor_str_unknown(self):
         with pytest.raises(ValueError, match="Unknown dtype"):
             Categorical([1, 2], dtype="foo")
 
+    def test_constructor_np_strs(self):
+        # GH#31499 HashTable.map_locations needs to work on np.str_ objects
+        cat = pd.Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
+        assert all(isinstance(x, np.str_) for x in cat.categories)
+
     def test_constructor_from_categorical_with_dtype(self):
         dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
         values = Categorical(["a", "b", "d"])
diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py
@@ -81,6 +81,24 @@ def test_where_raises(self, other):
         with pytest.raises(ValueError, match=match):
             ser.where([True, False, True], other=other)
 
+    def test_shift(self):
+        # https://github.com/pandas-dev/pandas/issues/31495
+        a = IntervalArray.from_breaks([1, 2, 3])
+        result = a.shift()
+        # int -> float
+        expected = IntervalArray.from_tuples([(np.nan, np.nan), (1.0, 2.0)])
+        tm.assert_interval_array_equal(result, expected)
+
+    def test_shift_datetime(self):
+        a = IntervalArray.from_breaks(pd.date_range("2000", periods=4))
+        result = a.shift(2)
+        expected = a.take([-1, -1, 0], allow_fill=True)
+        tm.assert_interval_array_equal(result, expected)
+
+        result = a.shift(-1)
+        expected = a.take([1, 2, -1], allow_fill=True)
+        tm.assert_interval_array_equal(result, expected)
+
 
 class TestSetitem:
     def test_set_na(self, left_right_dtypes):
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -280,6 +280,13 @@ def test_shift_empty_array(self, data, periods):
         expected = empty
         self.assert_extension_array_equal(result, expected)
 
+    def test_shift_zero_copies(self, data):
+        result = data.shift(0)
+        assert result is not data
+
+        result = data[:0].shift(2)
+        assert result is not data
+
     def test_shift_fill_value(self, data):
         arr = data[:4]
         fill_value = data[0]
diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py
@@ -159,7 +159,7 @@ def test_set_levels_codes_directly(idx):
     minor_codes = [(x + 1) % 1 for x in minor_codes]
     new_codes = [major_codes, minor_codes]
 
-    msg = "can't set attribute"
+    msg = "[Cc]an't set attribute"
     with pytest.raises(AttributeError, match=msg):
         idx.levels = new_levels
     with pytest.raises(AttributeError, match=msg):