Commit e1fb7f4

PERF: Improve Period hashing
1 parent 4c9ae94 commit e1fb7f4

5 files changed: +152, -29 lines

asv_bench/benchmarks/period.py (+25, -1)

@@ -1,4 +1,4 @@
-from pandas import PeriodIndex, date_range
+from pandas import Series, Period, PeriodIndex, date_range
 
 
 class create_period_index_from_date_range(object):
@@ -7,3 +7,27 @@ class create_period_index_from_date_range(object):
     def time_period_index(self):
         # Simulate irregular PeriodIndex
         PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D')
+
+
+class period_algorithm(object):
+    goal_time = 0.2
+
+    def setup(self):
+        data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
+                Period('2011-03', freq='M'), Period('2011-04', freq='M')]
+        self.s = Series(data * 1000)
+        self.i = PeriodIndex(data, freq='M')
+
+    def time_period_series_drop_duplicates(self):
+        self.s.drop_duplicates()
+
+    def time_period_index_drop_duplicates(self):
+        self.i.drop_duplicates()
+
+    def time_period_series_value_counts(self):
+        self.s.value_counts()
+
+    def time_period_index_value_counts(self):
+        self.i.value_counts()
+
+
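Note: asv discovers benchmarks by naming convention: it instantiates the class, runs setup() before timing, and times each time_* method (goal_time is the per-benchmark measurement budget). For readers without asv installed, a rough stand-alone equivalent of what the new class measures, using only timeit (the fixture mirrors setup() above; number=100 is an arbitrary choice):

    import timeit

    from pandas import Period, PeriodIndex, Series

    # Same fixture as period_algorithm.setup() in the diff above.
    data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
            Period('2011-03', freq='M'), Period('2011-04', freq='M')]
    s = Series(data * 1000)
    i = PeriodIndex(data, freq='M')

    # Roughly what asv measures for each of the four time_* methods.
    for label, func in [('series drop_duplicates', s.drop_duplicates),
                        ('index drop_duplicates', i.drop_duplicates),
                        ('series value_counts', s.value_counts),
                        ('index value_counts', i.value_counts)]:
        print(label, timeit.timeit(func, number=100) / 100)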

doc/source/whatsnew/v0.19.0.txt (+2)

@@ -567,6 +567,8 @@ Performance Improvements
 - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
 - Improved performance of ``Index.difference`` (:issue:`12044`)
 - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)
+- Improved performance of hashing ``Period`` (:issue:`12817`)
+
 
 .. _whatsnew_0190.bug_fixes:
 

pandas/src/period.pyx (+1, -1)

@@ -727,7 +727,7 @@ cdef class _Period(object):
                             (type(self).__name__, type(other).__name__))
 
     def __hash__(self):
-        return hash((self.ordinal, self.freq))
+        return hash((self.ordinal, self.freqstr))
 
     def _add_delta(self, other):
         if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)):
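This one-line change is the entire optimization. Hash-based operations (value_counts, drop_duplicates, dict/set membership) call __hash__ once per element, and hashing self.freq, a DateOffset object with a comparatively expensive __hash__, dominated that cost; self.freqstr is a plain string, which CPython hashes in fast C code. A rough micro-benchmark sketching the difference (it assumes only that Period.freq is a DateOffset and Period.freqstr its string alias; the iteration count is arbitrary):

    import timeit

    from pandas import Period

    p = Period('2011-01', freq='M')

    # Old key: hashing the DateOffset object is the slow part.
    old = timeit.timeit(lambda: hash((p.ordinal, p.freq)), number=100000)

    # New key: hashing a short string is cheap.
    new = timeit.timeit(lambda: hash((p.ordinal, p.freqstr)), number=100000)

    print('freq key: %.4fs  freqstr key: %.4fs' % (old, new))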

pandas/tseries/tests/test_base.py (+111, -27)

@@ -491,13 +491,15 @@ def test_value_counts_unique(self):
         for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']:
             idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10)
             # create repeated values, 'n'th element is repeated by n+1 times
-            idx = DatetimeIndex(
-                np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz)
+            idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)),
+                                tz=tz)
 
             exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10,
                                     tz=tz)
             expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64')
-            tm.assert_series_equal(idx.value_counts(), expected)
+
+            for obj in [idx, Series(idx)]:
+                tm.assert_series_equal(obj.value_counts(), expected)
 
             expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10,
                                      tz=tz)
@@ -507,15 +509,20 @@ def test_value_counts_unique(self):
                                '2013-01-01 09:00', '2013-01-01 08:00',
                                '2013-01-01 08:00', pd.NaT], tz=tz)
 
-            exp_idx = DatetimeIndex(
-                ['2013-01-01 09:00', '2013-01-01 08:00'], tz=tz)
+            exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'],
+                                    tz=tz)
             expected = Series([3, 2], index=exp_idx)
-            tm.assert_series_equal(idx.value_counts(), expected)
 
-            exp_idx = DatetimeIndex(
-                ['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], tz=tz)
+            for obj in [idx, Series(idx)]:
+                tm.assert_series_equal(obj.value_counts(), expected)
+
+            exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00',
+                                     pd.NaT], tz=tz)
             expected = Series([3, 2, 1], index=exp_idx)
-            tm.assert_series_equal(idx.value_counts(dropna=False), expected)
+
+            for obj in [idx, Series(idx)]:
+                tm.assert_series_equal(obj.value_counts(dropna=False),
+                                       expected)
 
             tm.assert_index_equal(idx.unique(), exp_idx)
 
@@ -654,6 +661,27 @@ def test_drop_duplicates_metadata(self):
         self.assert_index_equal(idx, result)
         self.assertIsNone(result.freq)
 
+    def test_drop_duplicates(self):
+        # to check Index/Series compat
+        base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
+        idx = base.append(base[:5])
+
+        res = idx.drop_duplicates()
+        tm.assert_index_equal(res, base)
+        res = Series(idx).drop_duplicates()
+        tm.assert_series_equal(res, Series(base))
+
+        res = idx.drop_duplicates(keep='last')
+        exp = base[5:].append(base[:5])
+        tm.assert_index_equal(res, exp)
+        res = Series(idx).drop_duplicates(keep='last')
+        tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
+
+        res = idx.drop_duplicates(keep=False)
+        tm.assert_index_equal(res, base[5:])
+        res = Series(idx).drop_duplicates(keep=False)
+        tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
+
     def test_take(self):
         # GH 10295
         idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
@@ -1303,23 +1331,29 @@ def test_value_counts_unique(self):
 
         exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10)
         expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64')
-        tm.assert_series_equal(idx.value_counts(), expected)
+
+        for obj in [idx, Series(idx)]:
+            tm.assert_series_equal(obj.value_counts(), expected)
 
         expected = timedelta_range('1 days 09:00:00', freq='H', periods=10)
         tm.assert_index_equal(idx.unique(), expected)
 
-        idx = TimedeltaIndex(
-            ['1 days 09:00:00', '1 days 09:00:00', '1 days 09:00:00',
-             '1 days 08:00:00', '1 days 08:00:00', pd.NaT])
+        idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00',
+                              '1 days 09:00:00', '1 days 08:00:00',
+                              '1 days 08:00:00', pd.NaT])
 
         exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00'])
         expected = Series([3, 2], index=exp_idx)
-        tm.assert_series_equal(idx.value_counts(), expected)
 
-        exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', pd.NaT
-                                  ])
+        for obj in [idx, Series(idx)]:
+            tm.assert_series_equal(obj.value_counts(), expected)
+
+        exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00',
+                                  pd.NaT])
         expected = Series([3, 2, 1], index=exp_idx)
-        tm.assert_series_equal(idx.value_counts(dropna=False), expected)
+
+        for obj in [idx, Series(idx)]:
+            tm.assert_series_equal(obj.value_counts(dropna=False), expected)
 
         tm.assert_index_equal(idx.unique(), exp_idx)
 
@@ -1454,6 +1488,27 @@ def test_drop_duplicates_metadata(self):
         self.assert_index_equal(idx, result)
         self.assertIsNone(result.freq)
 
+    def test_drop_duplicates(self):
+        # to check Index/Series compat
+        base = pd.timedelta_range('1 day', '31 day', freq='D', name='idx')
+        idx = base.append(base[:5])
+
+        res = idx.drop_duplicates()
+        tm.assert_index_equal(res, base)
+        res = Series(idx).drop_duplicates()
+        tm.assert_series_equal(res, Series(base))
+
+        res = idx.drop_duplicates(keep='last')
+        exp = base[5:].append(base[:5])
+        tm.assert_index_equal(res, exp)
+        res = Series(idx).drop_duplicates(keep='last')
+        tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
+
+        res = idx.drop_duplicates(keep=False)
+        tm.assert_index_equal(res, base[5:])
+        res = Series(idx).drop_duplicates(keep=False)
+        tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
+
     def test_take(self):
         # GH 10295
         idx1 = pd.timedelta_range('1 day', '31 day', freq='D', name='idx')
@@ -2121,8 +2176,8 @@ def test_value_counts_unique(self):
         # GH 7735
         idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10)
         # create repeated values, 'n'th element is repeated by n+1 times
-        idx = PeriodIndex(
-            np.repeat(idx.values, range(1, len(idx) + 1)), freq='H')
+        idx = PeriodIndex(np.repeat(idx.values, range(1, len(idx) + 1)),
+                          freq='H')
 
         exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00',
                                '2011-01-01 16:00', '2011-01-01 15:00',
@@ -2131,24 +2186,31 @@ def test_value_counts_unique(self):
                                '2011-01-01 10:00',
                                '2011-01-01 09:00'], freq='H')
         expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64')
-        tm.assert_series_equal(idx.value_counts(), expected)
 
-        expected = pd.period_range('2011-01-01 09:00', freq='H', periods=10)
+        for obj in [idx, Series(idx)]:
+            tm.assert_series_equal(obj.value_counts(), expected)
+
+        expected = pd.period_range('2011-01-01 09:00', freq='H',
+                                   periods=10)
         tm.assert_index_equal(idx.unique(), expected)
 
         idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00',
                            '2013-01-01 09:00', '2013-01-01 08:00',
                            '2013-01-01 08:00', pd.NaT], freq='H')
 
-        exp_idx = PeriodIndex(
-            ['2013-01-01 09:00', '2013-01-01 08:00'], freq='H')
+        exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'],
+                              freq='H')
         expected = Series([3, 2], index=exp_idx)
-        tm.assert_series_equal(idx.value_counts(), expected)
 
-        exp_idx = PeriodIndex(
-            ['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], freq='H')
+        for obj in [idx, Series(idx)]:
+            tm.assert_series_equal(obj.value_counts(), expected)
+
+        exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00',
+                               pd.NaT], freq='H')
         expected = Series([3, 2, 1], index=exp_idx)
-        tm.assert_series_equal(idx.value_counts(dropna=False), expected)
+
+        for obj in [idx, Series(idx)]:
+            tm.assert_series_equal(obj.value_counts(dropna=False), expected)
 
         tm.assert_index_equal(idx.unique(), exp_idx)
 
@@ -2164,6 +2226,28 @@ def test_drop_duplicates_metadata(self):
         self.assert_index_equal(idx, result)
         self.assertEqual(idx.freq, result.freq)
 
+    def test_drop_duplicates(self):
+        # to check Index/Series compat
+        base = pd.period_range('2011-01-01', '2011-01-31', freq='D',
+                               name='idx')
+        idx = base.append(base[:5])
+
+        res = idx.drop_duplicates()
+        tm.assert_index_equal(res, base)
+        res = Series(idx).drop_duplicates()
+        tm.assert_series_equal(res, Series(base))
+
+        res = idx.drop_duplicates(keep='last')
+        exp = base[5:].append(base[:5])
+        tm.assert_index_equal(res, exp)
+        res = Series(idx).drop_duplicates(keep='last')
+        tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
+
+        res = idx.drop_duplicates(keep=False)
+        tm.assert_index_equal(res, base[5:])
+        res = Series(idx).drop_duplicates(keep=False)
+        tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
+
     def test_order_compat(self):
         def _check_freq(index, expected_index):
             if isinstance(index, PeriodIndex):
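The value_counts changes above only re-run the existing assertions over both the Index and a Series wrapping it; the new test_drop_duplicates methods additionally pin down the keep semantics for all three index types. A compact illustration of what those assertions check, using the same PeriodIndex fixture (comments state the expected results):

    import pandas as pd

    base = pd.period_range('2011-01-01', '2011-01-31', freq='D', name='idx')
    idx = base.append(base[:5])       # first five days appear twice

    idx.drop_duplicates()             # keep='first' (default) -> equals base
    idx.drop_duplicates(keep='last')  # keeps last occurrences -> base[5:] then base[:5]
    idx.drop_duplicates(keep=False)   # drops every duplicated value -> base[5:]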

pandas/tseries/tests/test_period.py (+13)

@@ -462,6 +462,19 @@ def test_period_deprecated_freq(self):
             p = Period('2016-03-01 09:00', freq=exp)
             tm.assertIsInstance(p, Period)
 
+    def test_hash(self):
+        self.assertEqual(hash(Period('2011-01', freq='M')),
+                         hash(Period('2011-01', freq='M')))
+
+        self.assertNotEqual(hash(Period('2011-01-01', freq='D')),
+                            hash(Period('2011-01', freq='M')))
+
+        self.assertNotEqual(hash(Period('2011-01', freq='3M')),
+                            hash(Period('2011-01', freq='2M')))
+
+        self.assertNotEqual(hash(Period('2011-01', freq='M')),
+                            hash(Period('2011-02', freq='M')))
+
     def test_repr(self):
         p = Period('Jan-2000')
         self.assertIn('2000-01', repr(p))
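These cases encode the two properties the new hash key has to keep: equal Periods must hash equal, and the frequency component must still separate periods that agree on the ordinal. The '3M' vs '2M' pair is the subtle one; assuming the multiplier does not enter the ordinal, both reduce to the same monthly ordinal, so only the multiplier preserved in freqstr tells them apart. The same invariants as plain asserts:

    from pandas import Period

    # Equal Periods must hash equal for dict/set lookups to work.
    assert hash(Period('2011-01', freq='M')) == hash(Period('2011-01', freq='M'))

    # Assumption: '2M' and '3M' periods of the same month share an ordinal,
    # so the freqstr part of the key is what keeps their hashes distinct.
    assert hash(Period('2011-01', freq='3M')) != hash(Period('2011-01', freq='2M'))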
