Skip to content

Commit 6a3755b

Browse files
chris-b1 authored and jorisvandenbossche committed
PERF: Period factorization (pandas-dev#14419)
1 parent e70252b commit 6a3755b

File tree

3 files changed

+50
-19
lines changed

3 files changed

+50
-19
lines changed

asv_bench/benchmarks/groupby.py

+26
Original file line number | Diff line number | Diff line change
@@ -548,6 +548,32 @@ def time_groupby_sum(self):
548548
self.df.groupby(['a'])['b'].sum()
549549

550550

551+
class groupby_period(object):
552+
# GH 14338
553+
goal_time = 0.2
554+
555+
def make_grouper(self, N):
556+
return pd.period_range('1900-01-01', freq='D', periods=N)
557+
558+
def setup(self):
559+
N = 10000
560+
self.grouper = self.make_grouper(N)
561+
self.df = pd.DataFrame(np.random.randn(N, 2))
562+
563+
def time_groupby_sum(self):
564+
self.df.groupby(self.grouper).sum()
565+
566+
567+
class groupby_datetime(groupby_period):
568+
def make_grouper(self, N):
569+
return pd.date_range('1900-01-01', freq='D', periods=N)
570+
571+
572+
class groupby_datetimetz(groupby_period):
573+
def make_grouper(self, N):
574+
return pd.date_range('1900-01-01', freq='D', periods=N,
575+
tz='US/Central')
576+
551577
#----------------------------------------------------------------------
552578
# Series.value_counts
553579

doc/source/whatsnew/v0.19.1.txt

+1 -1
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ Highlights include:
2020
Performance Improvements
2121
~~~~~~~~~~~~~~~~~~~~~~~~
2222

23-
23+
- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
2424

2525

2626

pandas/core/algorithms.py

+23 -18
Original file line number | Diff line number | Diff line change
@@ -285,18 +285,27 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
285285
note: an array of Periods will ignore sort as it returns an always sorted
286286
PeriodIndex
287287
"""
288-
from pandas import Index, Series, DatetimeIndex
289-
290-
vals = np.asarray(values)
291-
292-
# localize to UTC
293-
is_datetimetz_type = is_datetimetz(values)
294-
if is_datetimetz_type:
295-
values = DatetimeIndex(values)
296-
vals = values.asi8
288+
from pandas import Index, Series, DatetimeIndex, PeriodIndex
289+
290+
# handling two possibilities here
291+
# - for a numpy datetimelike simply view as i8 then cast back
292+
# - for an extension datetimelike view as i8 then
293+
# reconstruct from boxed values to transfer metadata
294+
dtype = None
295+
if needs_i8_conversion(values):
296+
if is_period_dtype(values):
297+
values = PeriodIndex(values)
298+
vals = values.asi8
299+
elif is_datetimetz(values):
300+
values = DatetimeIndex(values)
301+
vals = values.asi8
302+
else:
303+
# numpy dtype
304+
dtype = values.dtype
305+
vals = values.view(np.int64)
306+
else:
307+
vals = np.asarray(values)
297308

298-
is_datetime = is_datetime64_dtype(vals)
299-
is_timedelta = is_timedelta64_dtype(vals)
300309
(hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
301310

302311
table = hash_klass(size_hint or len(vals))
@@ -311,13 +320,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
311320
uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
312321
assume_unique=True)
313322

314-
if is_datetimetz_type:
315-
# reset tz
316-
uniques = values._shallow_copy(uniques)
317-
elif is_datetime:
318-
uniques = uniques.astype('M8[ns]')
319-
elif is_timedelta:
320-
uniques = uniques.astype('m8[ns]')
323+
if dtype is not None:
324+
uniques = uniques.astype(dtype)
325+
321326
if isinstance(values, Index):
322327
uniques = values._shallow_copy(uniques, name=None)
323328
elif isinstance(values, Series):

0 commit comments

Comments
 (0)