diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 9253124f7e8b2..6d56a28170645 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -457,22 +457,6 @@ We are stopping on the included end-point as it is part of the index dft['2013-1-15':'2013-1-15 12:30:00'] -.. warning:: - - The following selection will raise a ``KeyError``; otherwise this selection methodology - would be inconsistent with other selection methods in pandas (as this is not a *slice*, nor does it - resolve to one) - - .. code-block:: python - - dft['2013-1-15 12:30:00'] - - To select a single row, use ``.loc`` - - .. ipython:: python - - dft.loc['2013-1-15 12:30:00'] - .. versionadded:: 0.18.0 DatetimeIndex Partial String Indexing also works on DataFrames with a ``MultiIndex``. For example: @@ -491,10 +475,79 @@ DatetimeIndex Partial String Indexing also works on DataFrames with a ``MultiInd dft2 = dft2.swaplevel(0, 1).sort_index() dft2.loc[idx[:, '2013-01-05'], :] +.. _timeseries.slice_vs_exact_match: + +Slice vs. exact match +^^^^^^^^^^^^^^^^^^^^^ + +The same string used as an indexing parameter can be treated either as a slice or as an exact match depending on the resolution of the index. If the string is less accurate than the index, it will be treated as a slice, otherwise as an exact match. + +For example, let us consider a ``Series`` object whose index has minute resolution. + +.. ipython:: python + + series_minute = pd.Series([1, 2, 3], + pd.DatetimeIndex(['2011-12-31 23:59:00', + '2012-01-01 00:00:00', + '2012-01-01 00:02:00'])) + series_minute.index.resolution + +A timestamp string less accurate than a minute gives a ``Series`` object. + +.. ipython:: python + + series_minute['2011-12-31 23'] + +A timestamp string with minute resolution (or more accurate) gives a scalar instead, i.e. it is not cast to a slice. + +.. 
ipython:: python + + series_minute['2011-12-31 23:59'] + series_minute['2011-12-31 23:59:00'] + +If the index resolution is second, then the minute-accurate timestamp gives a ``Series``. + +.. ipython:: python + + series_second = pd.Series([1, 2, 3], + pd.DatetimeIndex(['2011-12-31 23:59:59', + '2012-01-01 00:00:00', + '2012-01-01 00:00:01'])) + series_second.index.resolution + series_second['2011-12-31 23:59'] + +If the timestamp string is treated as a slice, it can be used to index a ``DataFrame`` with ``[]`` as well. + +.. ipython:: python + + dft_minute = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, + index=series_minute.index) + dft_minute['2011-12-31 23'] + +However, if the string is treated as an exact match, the selection in ``DataFrame``'s ``[]`` will be column-wise and not row-wise, see :ref:`Indexing Basics <indexing.basics>`. For example, ``dft_minute['2011-12-31 23:59']`` will raise ``KeyError`` as ``'2011-12-31 23:59'`` has the same resolution as the index and there is no column with such a name. + +To select a single row, use ``.loc``. + +.. ipython:: python + + dft_minute.loc['2011-12-31 23:59'] + +Note also that ``DatetimeIndex`` resolution cannot be less precise than day. + +.. ipython:: python + + series_monthly = pd.Series([1, 2, 3], + pd.DatetimeIndex(['2011-12', + '2012-01', + '2012-02'])) + series_monthly.index.resolution + series_monthly['2011-12'] # returns Series + + Datetime Indexing ~~~~~~~~~~~~~~~~~ -Indexing a ``DateTimeIndex`` with a partial string depends on the "accuracy" of the period, in other words how specific the interval is in relation to the frequency of the index. In contrast, indexing with datetime objects is exact, because the objects have exact meaning. These also follow the semantics of *including both endpoints*. +As discussed in the previous section, indexing a ``DatetimeIndex`` with a partial string depends on the "accuracy" of the period, in other words how specific the interval is in relation to the resolution of the index. 
In contrast, indexing with datetime objects is exact, because the objects have exact meaning. These also follow the semantics of *including both endpoints*. These ``datetime`` objects are specific ``hours, minutes,`` and ``seconds`` even though they were not explicitly specified (they are ``0``). diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 83a70aa34fccf..3b09c5dd51f40 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -193,14 +193,42 @@ in prior versions of pandas) (:issue:`11915`). .. _whatsnew_0200.api: +Other API Changes +^^^^^^^^^^^^^^^^^ + - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`) - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) +- :ref:`DatetimeIndex Partial String Indexing <timeseries.partialindexing>` now works as an exact match, provided that the string resolution coincides with the index resolution, including the case when both are seconds (:issue:`14826`). See :ref:`Slice vs. Exact Match <timeseries.slice_vs_exact_match>` for details. + .. ipython:: python + df = DataFrame({'a': [1, 2, 3]}, DatetimeIndex(['2011-12-31 23:59:59', + '2012-01-01 00:00:00', + '2012-01-01 00:00:01'])) + Previous Behavior: + .. code-block:: ipython -Other API Changes -^^^^^^^^^^^^^^^^^ + In [4]: df['2011-12-31 23:59:59'] + Out[4]: + a + 2011-12-31 23:59:59 1 + + In [5]: df['a']['2011-12-31 23:59:59'] + Out[5]: + 2011-12-31 23:59:59 1 + Name: a, dtype: int64 + + + New Behavior: + + .. code-block:: ipython + + In [4]: df['2011-12-31 23:59:59'] + KeyError: '2011-12-31 23:59:59' + + In [5]: df['a']['2011-12-31 23:59:59'] + Out[5]: 1 .. 
_whatsnew_0200.deprecations: diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 3edf75fbb82ae..aca962c8178d3 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1293,14 +1293,12 @@ def _parsed_string_to_bounds(self, reso, parsed): def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): is_monotonic = self.is_monotonic - if ((reso in ['day', 'hour', 'minute'] and - not (self._resolution < Resolution.get_reso(reso) or - not is_monotonic)) or - (reso == 'second' and - not (self._resolution <= Resolution.RESO_SEC or - not is_monotonic))): + if (is_monotonic and reso in ['day', 'hour', 'minute', 'second'] and + self._resolution >= Resolution.get_reso(reso)): # These resolution/monotonicity validations came from GH3931, # GH3452 and GH2369. + + # See also GH14826 raise KeyError if reso == 'microsecond': diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index cd22ac561c6f7..3f4a10619f7f5 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -266,16 +266,15 @@ def test_indexing(self): expected = ts['2013'] assert_series_equal(expected, ts) - # GH 3925, indexing with a seconds resolution string / datetime object + # GH14826, indexing with a seconds resolution string / datetime object df = DataFrame(randn(5, 5), columns=['open', 'high', 'low', 'close', 'volume'], index=date_range('2012-01-02 18:01:00', periods=5, tz='US/Central', freq='s')) expected = df.loc[[df.index[2]]] - result = df['2012-01-02 18:01:02'] - assert_frame_equal(result, expected) # this is a single date, so will raise + self.assertRaises(KeyError, df.__getitem__, '2012-01-02 18:01:02', ) self.assertRaises(KeyError, df.__getitem__, df.index[2], ) def test_recreate_from_data(self): @@ -4953,6 +4952,73 @@ def test_partial_slice_second_precision(self): self.assertRaisesRegexp(KeyError, '2005-1-1 00:00:00', lambda: s['2005-1-1 00:00:00']) + def 
test_partial_slicing_dataframe(self): + # GH14856 + # Test various combinations of string slicing resolution vs. + # index resolution + # - If string resolution is less precise than index resolution, + # string is considered a slice + # - If string resolution is equal to or more precise than index + # resolution, string is considered an exact match + formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H', + '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S'] + resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second'] + for rnum, resolution in enumerate(resolutions[2:], 2): + # we check only 'day', 'hour', 'minute' and 'second' + unit = Timedelta("1 " + resolution) + middate = datetime(2012, 1, 1, 0, 0, 0) + index = DatetimeIndex([middate - unit, + middate, middate + unit]) + values = [1, 2, 3] + df = DataFrame({'a': values}, index, dtype=np.int64) + self.assertEqual(df.index.resolution, resolution) + + # Timestamp with the same resolution as index + # Should be exact match for Series (return scalar) + # and raise KeyError for Frame + for timestamp, expected in zip(index, values): + ts_string = timestamp.strftime(formats[rnum]) + # make ts_string as precise as index + result = df['a'][ts_string] + self.assertIsInstance(result, np.int64) + self.assertEqual(result, expected) + self.assertRaises(KeyError, df.__getitem__, ts_string) + + # Timestamp with resolution less precise than index + for fmt in formats[:rnum]: + for element, theslice in [[0, slice(None, 1)], + [1, slice(1, None)]]: + ts_string = index[element].strftime(fmt) + + # Series should return slice + result = df['a'][ts_string] + expected = df['a'][theslice] + assert_series_equal(result, expected) + + # Frame should return slice as well + result = df[ts_string] + expected = df[theslice] + assert_frame_equal(result, expected) + + # Timestamp with resolution more precise than index + # Compatible with existing key + # Should return scalar for Series + # and raise KeyError for Frame + for fmt in formats[rnum + 1:]: 
+ ts_string = index[1].strftime(fmt) + result = df['a'][ts_string] + self.assertIsInstance(result, np.int64) + self.assertEqual(result, 2) + self.assertRaises(KeyError, df.__getitem__, ts_string) + + # Not compatible with existing key + # Should raise KeyError + for fmt, res in list(zip(formats, resolutions))[rnum + 1:]: + ts = index[1] + Timedelta("1 " + res) + ts_string = ts.strftime(fmt) + self.assertRaises(KeyError, df['a'].__getitem__, ts_string) + self.assertRaises(KeyError, df.__getitem__, ts_string) + def test_partial_slicing_with_multiindex(self): # GH 4758