ENH: partial string indexing on non-monotonic PeriodIndex (#31096)

jbrockmendel · WillAyd · commit 4050e4c807e3 · 2020-01-21T08:27:24.000-08:00
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
@@ -1951,6 +1951,10 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th
 PeriodIndex partial string indexing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+PeriodIndex now supports partial string slicing with non-monotonic indexes.
+
+.. versionadded:: 1.1.0
+
 You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodIndex``, in the same manner as ``DatetimeIndex``. For details, refer to :ref:`DatetimeIndex Partial String Indexing <timeseries.partialindexing>`.
 
 .. ipython:: python
@@ -1981,6 +1985,7 @@ As with ``DatetimeIndex``, the endpoints will be included in the result. The exa
 
    dfp['2013-01-01 10H':'2013-01-01 11H']
 
+
 Frequency conversion and resampling with PeriodIndex
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The frequency of ``Period`` and ``PeriodIndex`` can be converted via the ``asfreq``
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -13,6 +13,28 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+.. _whatsnew_110.period_index_partial_string_slicing:
+
+Nonmonotonic PeriodIndex Partial String Slicing
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+:class:`PeriodIndex` now supports partial string slicing for non-monotonic indexes, mirroring :class:`DatetimeIndex` behavior (:issue:`31096`)
+
+For example:
+
+.. ipython:: python
+
+   dti = pd.date_range("2014-01-01", periods=30, freq="30D")
+   pi = dti.to_period("D")
+   ser_monotonic = pd.Series(np.arange(30), index=pi)
+   shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2))
+   ser = ser_monotonic[shuffler]
+   ser
+
+.. ipython:: python
+
+   ser["2014"]
+   ser.loc["May 2015"]
+
 .. _whatsnew_110.enhancements.other:
 
 Other enhancements
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -567,6 +567,11 @@ def get_loc(self, key, method=None, tolerance=None):
         """
 
         if isinstance(key, str):
+            try:
+                return self._get_string_slice(key)
+            except (TypeError, KeyError, ValueError, OverflowError):
+                pass
+
             try:
                 asdt, reso = parse_time_string(key, self.freq)
                 key = asdt
@@ -648,10 +653,6 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime):
 
     def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True):
         # TODO: Check for non-True use_lhs/use_rhs
-        raw = key
-        if not self.is_monotonic:
-            raise ValueError("Partial indexing only valid for ordered time series")
-
         parsed, reso = parse_time_string(key, self.freq)
         grp = resolution.Resolution.get_freq_group(reso)
         freqn = resolution.get_freq_group(self.freq)
@@ -660,18 +661,35 @@ def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True
             # TODO: we used to also check for
             #  reso in ["day", "hour", "minute", "second"]
             #  why is that check not needed?
-            raise TypeError(key)
+            raise ValueError(key)
 
         t1, t2 = self._parsed_string_to_bounds(reso, parsed)
-        if len(self):
-            if t2 < self.min() or t1 > self.max():
-                raise KeyError(raw)
-
-        # Use asi8 searchsorted to avoid overhead of re-validating inputs
-        return slice(
-            self.asi8.searchsorted(t1.ordinal, side="left"),
-            self.asi8.searchsorted(t2.ordinal, side="right"),
-        )
+        i8vals = self.asi8
+
+        if self.is_monotonic:
+
+            # we are out of range
+            if len(self) and (
+                (use_lhs and t1 < self[0] and t2 < self[0])
+                or ((use_rhs and t1 > self[-1] and t2 > self[-1]))
+            ):
+                raise KeyError(key)
+
+            # TODO: does this depend on being monotonic _increasing_?
+            #  If so, DTI will also be affected.
+
+            # a monotonic (sorted) series can be sliced
+            # Use asi8.searchsorted to avoid re-validating Periods
+            left = i8vals.searchsorted(t1.ordinal, side="left") if use_lhs else None
+            right = i8vals.searchsorted(t2.ordinal, side="right") if use_rhs else None
+            return slice(left, right)
+
+        else:
+            lhs_mask = (i8vals >= t1.ordinal) if use_lhs else True
+            rhs_mask = (i8vals <= t2.ordinal) if use_rhs else True
+
+            # try to find the dates
+            return (lhs_mask & rhs_mask).nonzero()[0]
 
     def _convert_tolerance(self, tolerance, target):
         tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance, target)
diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py
@@ -7,9 +7,6 @@
 
 
 class TestPeriodIndex:
-    def setup_method(self, method):
-        pass
-
     def test_slice_with_negative_step(self):
         ts = Series(np.arange(20), period_range("2014-01", periods=20, freq="M"))
         SLC = pd.IndexSlice
@@ -133,3 +130,53 @@ def test_range_slice_outofbounds(self):
             tm.assert_frame_equal(df["2013/10/15":"2013/10/17"], empty)
             tm.assert_frame_equal(df["2013-06":"2013-09"], empty)
             tm.assert_frame_equal(df["2013-11":"2013-12"], empty)
+
+    def test_partial_slice_doesnt_require_monotonicity(self):
+        # See also: DatetimeIndex test of the same name
+        dti = pd.date_range("2014-01-01", periods=30, freq="30D")
+        pi = dti.to_period("D")
+        ser_monotonic = pd.Series(np.arange(30), index=pi)
+        shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2))
+        ser = ser_monotonic[shuffler]
+        nidx = ser.index
+
+        # Manually identified locations of year==2014
+        indexer_2014 = np.array(
+            [0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19, 20], dtype=np.intp
+        )
+        assert (nidx[indexer_2014].year == 2014).all()
+        # "~" on an intp indexer gives negative positions, so use a boolean complement
+        outside_2014 = ~np.isin(np.arange(len(nidx)), indexer_2014)
+        assert not (nidx[outside_2014].year == 2014).any()
+
+        result = nidx.get_loc("2014")
+        tm.assert_numpy_array_equal(result, indexer_2014)
+
+        expected = ser[indexer_2014]
+
+        result = nidx.get_value(ser, "2014")
+        tm.assert_series_equal(result, expected)
+
+        result = ser.loc["2014"]
+        tm.assert_series_equal(result, expected)
+
+        result = ser["2014"]
+        tm.assert_series_equal(result, expected)
+
+        # Manually identified locations where ser.index is within May 2015
+        indexer_may2015 = np.array([23], dtype=np.intp)
+        assert nidx[23].year == 2015 and nidx[23].month == 5
+
+        result = nidx.get_loc("May 2015")
+        tm.assert_numpy_array_equal(result, indexer_may2015)
+
+        expected = ser[indexer_may2015]
+
+        result = nidx.get_value(ser, "May 2015")
+        tm.assert_series_equal(result, expected)
+
+        result = ser.loc["May 2015"]
+        tm.assert_series_equal(result, expected)
+
+        result = ser["May 2015"]
+        tm.assert_series_equal(result, expected)