Skip to content

BUG: Implement PeriodEngine to fix PeriodIndex truncate bug #17755

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 4, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ Conversion
Indexing
^^^^^^^^

-
- Bug in :func:`PeriodIndex.truncate` which raises ``TypeError`` when ``PeriodIndex`` is monotonic (:issue:`17717`)
-
-

Expand Down
54 changes: 52 additions & 2 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ from tslib cimport _to_i8

from hashtable cimport HashTable

from pandas._libs import algos, hashtable as _hash
from pandas._libs import algos, period as periodlib, hashtable as _hash
from pandas._libs.tslib import Timestamp, Timedelta
from datetime import datetime, timedelta

Expand Down Expand Up @@ -270,13 +270,16 @@ cdef class IndexEngine:

values = self._get_index_values()
self.mapping = self._make_hash_table(len(values))
self.mapping.map_locations(values)
self._call_map_locations(values)

if len(self.mapping) == len(values):
self.unique = 1

self.need_unique_check = 0

cpdef _call_map_locations(self, values):
self.mapping.map_locations(values)

def clear_mapping(self):
self.mapping = None
self.need_monotonic_check = 1
Expand Down Expand Up @@ -490,6 +493,53 @@ cdef class TimedeltaEngine(DatetimeEngine):
cdef _get_box_dtype(self):
return 'm8[ns]'


cdef class PeriodEngine(Int64Engine):

cdef _get_index_values(self):
return super(PeriodEngine, self).vgetter()

cpdef _call_map_locations(self, values):
super(PeriodEngine, self)._call_map_locations(values.view('i8'))

def _call_monotonic(self, values):
return super(PeriodEngine, self)._call_monotonic(values.view('i8'))

def get_indexer(self, values):
cdef ndarray[int64_t, ndim=1] ordinals

super(PeriodEngine, self)._ensure_mapping_populated()

freq = super(PeriodEngine, self).vgetter().freq
ordinals = periodlib.extract_ordinals(values, freq)

return self.mapping.lookup(ordinals)

def get_pad_indexer(self, other, limit=None):
freq = super(PeriodEngine, self).vgetter().freq
ordinal = periodlib.extract_ordinals(other, freq)

return algos.pad_int64(self._get_index_values(),
np.asarray(ordinal), limit=limit)

def get_backfill_indexer(self, other, limit=None):
freq = super(PeriodEngine, self).vgetter().freq
ordinal = periodlib.extract_ordinals(other, freq)

return algos.backfill_int64(self._get_index_values(),
np.asarray(ordinal), limit=limit)

def get_indexer_non_unique(self, targets):
freq = super(PeriodEngine, self).vgetter().freq
ordinal = periodlib.extract_ordinals(targets, freq)
ordinal_array = np.asarray(ordinal)

return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array)

cdef _get_index_values_for_bool_indexer(self):
return self._get_index_values().view('i8')


cpdef convert_scalar(ndarray arr, object value):
# we don't turn integers
# into datetimes/timedeltas
Expand Down
5 changes: 4 additions & 1 deletion pandas/_libs/index_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ cdef class {{name}}Engine(IndexEngine):
raise KeyError(val)
{{endif}}

values = self._get_index_values()
values = self._get_index_values_for_bool_indexer()
n = len(values)

result = np.empty(n, dtype=bool)
Expand All @@ -86,6 +86,9 @@ cdef class {{name}}Engine(IndexEngine):
return last_true

return result

cdef _get_index_values_for_bool_indexer(self):
return self._get_index_values()
{{endif}}

{{endfor}}
8 changes: 7 additions & 1 deletion pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import pandas.tseries.offsets as offsets

from pandas._libs.lib import infer_dtype
from pandas._libs import tslib, period
from pandas._libs import tslib, period, index as libindex
from pandas._libs.period import (Period, IncompatibleFrequency,
get_period_field_arr, _validate_end_alias,
_quarter_to_myear)
Expand Down Expand Up @@ -192,6 +192,8 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index):

freq = None

_engine_type = libindex.PeriodEngine

__eq__ = _period_index_cmp('__eq__')
__ne__ = _period_index_cmp('__ne__', nat_result=True)
__lt__ = _period_index_cmp('__lt__')
Expand Down Expand Up @@ -275,6 +277,10 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
data = period.extract_ordinals(data, freq)
return cls._from_ordinals(data, name=name, freq=freq)

@cache_readonly
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this might already be implemented by a superclass

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I c. ok, we should make this much more generic (IOW just pass in self and have the engine extract what it needs), but that's another issue.

def _engine(self):
return self._engine_type(lambda: self, len(self))

@classmethod
def _generate_range(cls, start, end, periods, freq, fields):
if freq is not None:
Expand Down
196 changes: 195 additions & 1 deletion pandas/tests/indexes/period/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd
from pandas.util import testing as tm
from pandas.compat import lrange
from pandas._libs import tslib
from pandas._libs import tslib, tslibs
from pandas import (PeriodIndex, Series, DatetimeIndex,
period_range, Period)

Expand Down Expand Up @@ -310,3 +310,197 @@ def test_take_fill_value(self):

with pytest.raises(IndexError):
idx.take(np.array([1, -5]))

def test_get_loc(self):
# GH 17717
p0 = pd.Period('2017-09-01')
p1 = pd.Period('2017-09-02')
p2 = pd.Period('2017-09-03')

# get the location of p1/p2 from
# monotonic increasing PeriodIndex with non-duplicate
idx0 = pd.PeriodIndex([p0, p1, p2])
expected_idx1_p1 = 1
expected_idx1_p2 = 2

assert idx0.get_loc(p1) == expected_idx1_p1
assert idx0.get_loc(str(p1)) == expected_idx1_p1
assert idx0.get_loc(p2) == expected_idx1_p2
assert idx0.get_loc(str(p2)) == expected_idx1_p2

pytest.raises(tslibs.parsing.DateParseError, idx0.get_loc, 'foo')
pytest.raises(KeyError, idx0.get_loc, 1.1)
pytest.raises(TypeError, idx0.get_loc, idx0)

# get the location of p1/p2 from
# monotonic increasing PeriodIndex with duplicate
idx1 = pd.PeriodIndex([p1, p1, p2])
expected_idx1_p1 = slice(0, 2)
expected_idx1_p2 = 2

assert idx1.get_loc(p1) == expected_idx1_p1
assert idx1.get_loc(str(p1)) == expected_idx1_p1
assert idx1.get_loc(p2) == expected_idx1_p2
assert idx1.get_loc(str(p2)) == expected_idx1_p2

pytest.raises(tslibs.parsing.DateParseError, idx1.get_loc, 'foo')
pytest.raises(KeyError, idx1.get_loc, 1.1)
pytest.raises(TypeError, idx1.get_loc, idx1)

# get the location of p1/p2 from
# non-monotonic increasing/decreasing PeriodIndex with duplicate
idx2 = pd.PeriodIndex([p2, p1, p2])
expected_idx2_p1 = 1
expected_idx2_p2 = np.array([True, False, True])

assert idx2.get_loc(p1) == expected_idx2_p1
assert idx2.get_loc(str(p1)) == expected_idx2_p1
tm.assert_numpy_array_equal(idx2.get_loc(p2), expected_idx2_p2)
tm.assert_numpy_array_equal(idx2.get_loc(str(p2)), expected_idx2_p2)

def test_is_monotonic_increasing(self):
# GH 17717
p0 = pd.Period('2017-09-01')
p1 = pd.Period('2017-09-02')
p2 = pd.Period('2017-09-03')

idx_inc0 = pd.PeriodIndex([p0, p1, p2])
idx_inc1 = pd.PeriodIndex([p0, p1, p1])
idx_dec0 = pd.PeriodIndex([p2, p1, p0])
idx_dec1 = pd.PeriodIndex([p2, p1, p1])
idx = pd.PeriodIndex([p1, p2, p0])

assert idx_inc0.is_monotonic_increasing
assert idx_inc1.is_monotonic_increasing
assert not idx_dec0.is_monotonic_increasing
assert not idx_dec1.is_monotonic_increasing
assert not idx.is_monotonic_increasing

def test_is_monotonic_decreasing(self):
# GH 17717
p0 = pd.Period('2017-09-01')
p1 = pd.Period('2017-09-02')
p2 = pd.Period('2017-09-03')

idx_inc0 = pd.PeriodIndex([p0, p1, p2])
idx_inc1 = pd.PeriodIndex([p0, p1, p1])
idx_dec0 = pd.PeriodIndex([p2, p1, p0])
idx_dec1 = pd.PeriodIndex([p2, p1, p1])
idx = pd.PeriodIndex([p1, p2, p0])

assert not idx_inc0.is_monotonic_decreasing
assert not idx_inc1.is_monotonic_decreasing
assert idx_dec0.is_monotonic_decreasing
assert idx_dec1.is_monotonic_decreasing
assert not idx.is_monotonic_decreasing

def test_is_unique(self):
# GH 17717
p0 = pd.Period('2017-09-01')
p1 = pd.Period('2017-09-02')
p2 = pd.Period('2017-09-03')

idx0 = pd.PeriodIndex([p0, p1, p2])
assert idx0.is_unique

idx1 = pd.PeriodIndex([p1, p1, p2])
assert not idx1.is_unique

def test_contains(self):
# GH 17717
p0 = pd.Period('2017-09-01')
p1 = pd.Period('2017-09-02')
p2 = pd.Period('2017-09-03')
p3 = pd.Period('2017-09-04')

ps0 = [p0, p1, p2]
idx0 = pd.PeriodIndex(ps0)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @jorisvandenbossche @MaximilianR @shoyer these are similar to Interval discussions, though conceptually much simpler.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, and similar to that discussion, I don't like the current (and tested on the lines below) behaviour.

I don't think that:

In [97]: p0 = pd.Period('2017-09-01')
    ...: p1 = pd.Period('2017-09-02')

In [98]: idx = pd.PeriodIndex([p0, p1])

In [99]: idx.contains("2017-09-01 09:00")
Out[99]: True

should be the behaviour of contains. But let's leave that discussion out of this PR, as I don't think the PR is making any changes to the behaviour of __contains__ / contains ? (or does it?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jorisvandenbossche @jreback Yes. No changes in __contains__ / contains.

for p in ps0:
assert idx0.contains(p)
assert p in idx0

assert idx0.contains(str(p))
assert str(p) in idx0

assert idx0.contains('2017-09-01 00:00:01')
assert '2017-09-01 00:00:01' in idx0

assert idx0.contains('2017-09')
assert '2017-09' in idx0

assert not idx0.contains(p3)
assert p3 not in idx0

def test_get_value(self):
# GH 17717
p0 = pd.Period('2017-09-01')
p1 = pd.Period('2017-09-02')
p2 = pd.Period('2017-09-03')

idx0 = pd.PeriodIndex([p0, p1, p2])
input0 = np.array([1, 2, 3])
expected0 = 2

result0 = idx0.get_value(input0, p1)
assert result0 == expected0

idx1 = pd.PeriodIndex([p1, p1, p2])
input1 = np.array([1, 2, 3])
expected1 = np.array([1, 2])

result1 = idx1.get_value(input1, p1)
tm.assert_numpy_array_equal(result1, expected1)

idx2 = pd.PeriodIndex([p1, p2, p1])
input2 = np.array([1, 2, 3])
expected2 = np.array([1, 3])

result2 = idx2.get_value(input2, p1)
tm.assert_numpy_array_equal(result2, expected2)

def test_get_indexer(self):
# GH 17717
p1 = pd.Period('2017-09-01')
p2 = pd.Period('2017-09-04')
p3 = pd.Period('2017-09-07')

tp0 = pd.Period('2017-08-31')
tp1 = pd.Period('2017-09-02')
tp2 = pd.Period('2017-09-05')
tp3 = pd.Period('2017-09-09')

idx = pd.PeriodIndex([p1, p2, p3])

tm.assert_numpy_array_equal(idx.get_indexer(idx),
np.array([0, 1, 2], dtype=np.intp))

target = pd.PeriodIndex([tp0, tp1, tp2, tp3])
tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'),
np.array([-1, 0, 1, 2], dtype=np.intp))
tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'),
np.array([0, 1, 2, -1], dtype=np.intp))
tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'),
np.array([0, 0, 1, 2], dtype=np.intp))

res = idx.get_indexer(target, 'nearest',
tolerance=pd.Timedelta('1 day'))
tm.assert_numpy_array_equal(res,
np.array([0, 0, 1, -1], dtype=np.intp))

def test_get_indexer_non_unique(self):
# GH 17717
p1 = pd.Period('2017-09-02')
p2 = pd.Period('2017-09-03')
p3 = pd.Period('2017-09-04')
p4 = pd.Period('2017-09-05')

idx1 = pd.PeriodIndex([p1, p2, p1])
idx2 = pd.PeriodIndex([p2, p1, p3, p4])

result = idx1.get_indexer_non_unique(idx2)
expected_indexer = np.array([1, 0, 2, -1, -1], dtype=np.int64)
expected_missing = np.array([2, 3], dtype=np.int64)

tm.assert_numpy_array_equal(result[0], expected_indexer)
tm.assert_numpy_array_equal(result[1], expected_missing)
30 changes: 30 additions & 0 deletions pandas/tests/series/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,33 @@ def test_align_series(self):
msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)"
with tm.assert_raises_regex(period.IncompatibleFrequency, msg):
ts + ts.asfreq('D', how="end")

def test_truncate(self):
# GH 17717
idx1 = pd.PeriodIndex([
pd.Period('2017-09-02'),
pd.Period('2017-09-02'),
pd.Period('2017-09-03')
])
series1 = pd.Series([1, 2, 3], index=idx1)
result1 = series1.truncate(after='2017-09-02')

expected_idx1 = pd.PeriodIndex([
pd.Period('2017-09-02'),
pd.Period('2017-09-02')
])
tm.assert_series_equal(result1, pd.Series([1, 2], index=expected_idx1))

idx2 = pd.PeriodIndex([
pd.Period('2017-09-03'),
pd.Period('2017-09-02'),
pd.Period('2017-09-03')
])
series2 = pd.Series([1, 2, 3], index=idx2)
result2 = series2.truncate(after='2017-09-02')

expected_idx2 = pd.PeriodIndex([
pd.Period('2017-09-03'),
pd.Period('2017-09-02')
])
tm.assert_series_equal(result2, pd.Series([1, 2], index=expected_idx2))