From 65b4f0fd0fcb7211e10bc30d909c383cfb4133a3 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Oct 2018 09:09:12 -0700 Subject: [PATCH 1/4] Avoid using object dtype for PeriodIndex methods --- pandas/core/indexes/base.py | 32 +++++++++++++++++++++++--------- pandas/core/indexes/multi.py | 10 +++++++++- pandas/core/indexes/period.py | 2 +- pandas/util/testing.py | 3 ++- 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 51c84d6e28cb4..6b9920c21bfce 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -519,7 +519,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, **kwargs): if values is None: - values = self.values + values = self._take_values attributes = self._get_attributes_dict() attributes.update(kwargs) if not len(values) and 'dtype' not in kwargs: @@ -763,6 +763,17 @@ def get_values(self): """ return self.values + @property + def _take_values(self): + """ + Values to use in `take` operation, suitable to being passed back to + _shallow_copy/_simple_new. 
+ """ + if is_period_dtype(self): + # Avoid casting to object-dtype + return self._ndarray_values + return self.values + @Appender(IndexOpsMixin.memory_usage.__doc__) def memory_usage(self, deep=False): result = super(Index, self).memory_usage(deep=deep) @@ -2158,7 +2169,7 @@ def take(self, indices, axis=0, allow_fill=True, if allow_fill and fill_value is not None: msg = 'Unable to fill values because {0} cannot contain NA' raise ValueError(msg.format(self.__class__.__name__)) - taken = self.values.take(indices) + taken = self._take_values.take(indices) return self._shallow_copy(taken) def _assert_take_fillable(self, values, indices, allow_fill=True, @@ -2929,7 +2940,8 @@ def difference(self, other): self._assert_can_do_setop(other) if self.equals(other): - return self._shallow_copy([]) + # pass an empty array with the appropriate dtype + return self._shallow_copy(self[:0]) other, result_name = self._convert_can_do_setop(other) @@ -2940,12 +2952,14 @@ def difference(self, other): label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - the_diff = this.values.take(label_diff) + the_diff = this._take_values.take(label_diff) try: the_diff = sorting.safe_sort(the_diff) except TypeError: pass + if is_period_dtype(this): + return this._shallow_copy(the_diff, name=result_name) return this._shallow_copy(the_diff, name=result_name, freq=None) def symmetric_difference(self, other, result_name=None): @@ -2994,11 +3008,11 @@ def symmetric_difference(self, other, result_name=None): common_indexer = indexer.take((indexer != -1).nonzero()[0]) left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, assume_unique=True) - left_diff = this.values.take(left_indexer) + left_diff = this._take_values.take(left_indexer) # {other} minus {this} right_indexer = (indexer == -1).nonzero()[0] - right_diff = other.values.take(right_indexer) + right_diff = other._take_values.take(right_indexer) the_diff = _concat._concat_compat([left_diff, right_diff]) try: @@ 
-3008,7 +3022,7 @@ def symmetric_difference(self, other, result_name=None): attribs = self._get_attributes_dict() attribs['name'] = result_name - if 'freq' in attribs: + if 'freq' in attribs and not is_period_dtype(self): attribs['freq'] = None return self._shallow_copy_with_infer(the_diff, **attribs) @@ -3028,7 +3042,7 @@ def _get_unique_index(self, dropna=False): if self.is_unique and not dropna: return self - values = self.values + values = self._take_values if not self.is_unique: values = self.unique() @@ -4678,7 +4692,7 @@ def dropna(self, how='any'): raise ValueError("invalid how option: {0}".format(how)) if self.hasnans: - return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy(self._take_values[~self._isnan]) return self._shallow_copy() def _evaluate_with_timedelta_like(self, other, op): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3cccb65503378..7b2bf8326d1f0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -17,6 +17,7 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, + is_period_dtype, is_categorical_dtype, is_object_dtype, is_hashable, @@ -1036,7 +1037,14 @@ def _get_level_values(self, level, unique=False): labels = self.labels[level] if unique: labels = algos.unique(labels) - filled = algos.take_1d(values._values, labels, + + if is_period_dtype(values): + take_vals = values._ndarray_values + else: + # TODO: Why is this _values where elsewhere it is values? 
+ # if it were values here we could use _take_values + take_vals = values._values + filled = algos.take_1d(take_vals, labels, fill_value=values._na_value) values = values._shallow_copy(filled) return values diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f151389b02463..7b04e9362e114 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -350,7 +350,7 @@ def __array_wrap__(self, result, context=None): return result # the result is object dtype array of Period # cannot pass _simple_new as it is - return self._shallow_copy(result, freq=self.freq, name=self.name) + return type(self)(result, freq=self.freq, name=self.name) @property def size(self): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 4e01e0feb004c..1b8b5547a3f6b 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -824,7 +824,8 @@ def _get_ilevel_values(index, level): # accept level number only unique = index.levels[level] labels = index.labels[level] - filled = take_1d(unique.values, labels, fill_value=unique._na_value) + filled = take_1d(unique._take_values, labels, + fill_value=unique._na_value) values = unique._shallow_copy(filled, name=index.names[level]) return values From ba80015c70ab9ec968c33749569febfeb44f8b84 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Oct 2018 10:36:34 -0700 Subject: [PATCH 2/4] Add asvs --- asv_bench/benchmarks/period.py | 11 +++++++++++ pandas/core/indexes/period.py | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 29b8c7efda40c..013ecf8cd8a7c 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -96,6 +96,8 @@ def time_value_counts(self, typ): class Indexing(object): goal_time = 0.2 + pi_with_nas = PeriodIndex(['1985Q1', 'NaT', '1985Q2'] * 1000, freq='Q') + pi_diff = PeriodIndex(['1985Q1', 'NaT', '1985Q3'] * 1000, freq='Q') def setup(self): self.index 
= PeriodIndex(start='1985', periods=1000, freq='D') @@ -122,3 +124,12 @@ def time_intersection(self): def time_unique(self): self.index.unique() + + def time_dropna(self): + self.pi_with_nas.dropna() + + def time_difference(self): + self.pi_with_nas.difference(self.pi_diff) + + def time_symmetric_difference(self): + self.pi_with_nas.symmetric_difference(self.pi_diff) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 7b04e9362e114..bdf84feca1fd7 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -327,9 +327,9 @@ def __array_wrap__(self, result, context=None): """ if isinstance(context, tuple) and len(context) > 0: func = context[0] - if (func is np.add): + if func is np.add: pass - elif (func is np.subtract): + elif func is np.subtract: name = self.name left = context[1][0] right = context[1][1] From f3d332142b3272bb5764168f862cafdfdfead110 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Oct 2018 10:37:34 -0700 Subject: [PATCH 3/4] Whatsnew --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 9b70dd4ba549f..043de09fcf42f 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -701,7 +701,7 @@ Performance Improvements (:issue:`21372`) - Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) - Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) -- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) +- Improved performance of :func:`PeriodIndex.unique`, :func:`PeriodIndex.difference`, :func:`PeriodIndex.symmetric_difference`, and :func:`PeriodIndex.dropna`, (:issue:`23083`) .. 
_whatsnew_0240.docs: From 62f0f1f63d64c92e6ce0fb5dbd778986fb0a5f08 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 11 Oct 2018 10:39:18 -0700 Subject: [PATCH 4/4] Update GH references --- asv_bench/benchmarks/period.py | 4 ++++ doc/source/whatsnew/v0.24.0.txt | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 013ecf8cd8a7c..36a57456e2a08 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -123,13 +123,17 @@ def time_intersection(self): self.index[:750].intersection(self.index[250:]) def time_unique(self): + # GH#23083 self.index.unique() def time_dropna(self): + # GH#23095 self.pi_with_nas.dropna() def time_difference(self): + # GH#23095 self.pi_with_nas.difference(self.pi_diff) def time_symmetric_difference(self): + # GH#23095 self.pi_with_nas.symmetric_difference(self.pi_diff) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 043de09fcf42f..aa12231b840ed 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -701,7 +701,7 @@ Performance Improvements (:issue:`21372`) - Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) - Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) -- Improved performance of :func:`PeriodIndex.unique`, :func:`PeriodIndex.difference`, :func:`PeriodIndex.symmetric_difference`, and :func:`PeriodIndex.dropna`, (:issue:`23083`) +- Improved performance of :func:`PeriodIndex.unique`, :func:`PeriodIndex.difference`, :func:`PeriodIndex.symmetric_difference`, and :func:`PeriodIndex.dropna` (:issue:`23083`, :issue:`23095`) .. _whatsnew_0240.docs: