diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 012030a71ac82..c1b89ae1db75b 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,4 +1,4 @@ -from pandas import PeriodIndex, date_range +from pandas import Series, Period, PeriodIndex, date_range class create_period_index_from_date_range(object): @@ -7,3 +7,27 @@ class create_period_index_from_date_range(object): def time_period_index(self): # Simulate irregular PeriodIndex PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D') + + +class period_algorithm(object): + goal_time = 0.2 + + def setup(self): + data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'), + Period('2011-03', freq='M'), Period('2011-04', freq='M')] + self.s = Series(data * 1000) + self.i = PeriodIndex(data, freq='M') + + def time_period_series_drop_duplicates(self): + self.s.drop_duplicates() + + def time_period_index_drop_duplicates(self): + self.i.drop_duplicates() + + def time_period_series_value_counts(self): + self.s.value_counts() + + def time_period_index_value_counts(self): + self.i.value_counts() + + diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index efa6e5575fa79..5750b991aa950 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -567,6 +567,8 @@ Performance Improvements - Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - Improved performance of ``Index.difference`` (:issue:`12044`) - Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) +- Improved performance of hashing ``Period`` (:issue:`12817`) + .. _whatsnew_0190.bug_fixes: diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 37f265ede07e7..45743d1cf70ff 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -727,7 +727,7 @@ cdef class _Period(object): (type(self).__name__, type(other).__name__)) def __hash__(self): - return hash((self.ordinal, self.freq)) + return hash((self.ordinal, self.freqstr)) def _add_delta(self, other): if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 4aa1e2f5d33dd..05f7d9d9ce7b8 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -491,13 +491,15 @@ def test_value_counts_unique(self): for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = DatetimeIndex( - np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), + tz=tz) exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, tz=tz) expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, tz=tz) @@ -507,15 +509,20 @@ def test_value_counts_unique(self): '2013-01-01 09:00', '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], tz=tz) - exp_idx = DatetimeIndex( - ['2013-01-01 09:00', '2013-01-01 08:00'], tz=tz) + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], + tz=tz) expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = DatetimeIndex( - ['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], tz=tz) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', + pd.NaT], tz=tz) expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), + expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -654,6 +661,27 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertIsNone(result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_take(self): # GH 10295 idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') @@ -1303,23 +1331,29 @@ def test_value_counts_unique(self): exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10) expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) expected = timedelta_range('1 days 09:00:00', freq='H', periods=10) tm.assert_index_equal(idx.unique(), expected) - idx = TimedeltaIndex( - ['1 days 09:00:00', '1 days 09:00:00', '1 days 09:00:00', - '1 days 08:00:00', '1 days 08:00:00', pd.NaT]) + idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00', + '1 days 09:00:00', '1 days 08:00:00', + '1 days 08:00:00', pd.NaT]) exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00']) expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', pd.NaT - ]) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', + pd.NaT]) expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -1454,6 +1488,27 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertIsNone(result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_take(self): # GH 10295 idx1 = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') @@ -2121,8 +2176,8 @@ def test_value_counts_unique(self): # GH 7735 idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = PeriodIndex( - np.repeat(idx.values, range(1, len(idx) + 1)), freq='H') + idx = PeriodIndex(np.repeat(idx.values, range(1, len(idx) + 1)), + freq='H') exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00', '2011-01-01 16:00', '2011-01-01 15:00', @@ -2131,24 +2186,31 @@ def test_value_counts_unique(self): '2011-01-01 10:00', '2011-01-01 09:00'], freq='H') expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) - expected = pd.period_range('2011-01-01 09:00', freq='H', periods=10) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + expected = pd.period_range('2011-01-01 09:00', freq='H', + periods=10) tm.assert_index_equal(idx.unique(), expected) idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], freq='H') - exp_idx = PeriodIndex( - ['2013-01-01 09:00', '2013-01-01 08:00'], freq='H') + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'], + freq='H') expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = PeriodIndex( - ['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], freq='H') + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00', + pd.NaT], freq='H') expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -2164,6 +2226,28 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertEqual(idx.freq, result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.period_range('2011-01-01', '2011-01-31', freq='D', + name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_order_compat(self): def _check_freq(index, expected_index): if isinstance(index, PeriodIndex): diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index c90cbbf80086a..e3a67289a587b 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -462,6 +462,19 @@ def test_period_deprecated_freq(self): p = Period('2016-03-01 09:00', freq=exp) tm.assertIsInstance(p, Period) + def test_hash(self): + self.assertEqual(hash(Period('2011-01', freq='M')), + hash(Period('2011-01', freq='M'))) + + self.assertNotEqual(hash(Period('2011-01-01', freq='D')), + hash(Period('2011-01', freq='M'))) + + self.assertNotEqual(hash(Period('2011-01', freq='3M')), + hash(Period('2011-01', freq='2M'))) + + self.assertNotEqual(hash(Period('2011-01', freq='M')), + hash(Period('2011-02', freq='M'))) + def test_repr(self): p = Period('Jan-2000') self.assertIn('2000-01', repr(p))