PERF: Period factorization (pandas-dev#14419)

chris-b1 · jorisvandenbossche · commit 6a3755b0b77c · 2016-10-15T21:27:11.000+02:00
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -548,6 +548,32 @@ def time_groupby_sum(self):
         self.df.groupby(['a'])['b'].sum()
 
 
+class groupby_period(object):
+    # GH 14338
+    goal_time = 0.2
+
+    def make_grouper(self, N):
+        return pd.period_range('1900-01-01', freq='D', periods=N)
+
+    def setup(self):
+        N = 10000
+        self.grouper = self.make_grouper(N)
+        self.df = pd.DataFrame(np.random.randn(N, 2))
+
+    def time_groupby_sum(self):
+        self.df.groupby(self.grouper).sum()
+
+
+class groupby_datetime(groupby_period):
+    def make_grouper(self, N):
+        return pd.date_range('1900-01-01', freq='D', periods=N)
+
+
+class groupby_datetimetz(groupby_period):
+    def make_grouper(self, N):
+        return pd.date_range('1900-01-01', freq='D', periods=N,
+                             tz='US/Central')
+
 #----------------------------------------------------------------------
 # Series.value_counts
 
diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
@@ -20,7 +20,7 @@ Highlights include:
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-
+ - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
 
 
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -285,18 +285,27 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     note: an array of Periods will ignore sort as it returns an always sorted
     PeriodIndex
     """
-    from pandas import Index, Series, DatetimeIndex
-
-    vals = np.asarray(values)
-
-    # localize to UTC
-    is_datetimetz_type = is_datetimetz(values)
-    if is_datetimetz_type:
-        values = DatetimeIndex(values)
-        vals = values.asi8
+    from pandas import Index, Series, DatetimeIndex, PeriodIndex
+
+    # handling two possibilities here
+    # - for a numpy datetimelike simply view as i8 then cast back
+    # - for an extension datetimelike view as i8 then
+    #   reconstruct from boxed values to transfer metadata
+    dtype = None
+    if needs_i8_conversion(values):
+        if is_period_dtype(values):
+            values = PeriodIndex(values)
+            vals = values.asi8
+        elif is_datetimetz(values):
+            values = DatetimeIndex(values)
+            vals = values.asi8
+        else:
+            # numpy dtype
+            dtype = values.dtype
+            vals = values.view(np.int64)
+    else:
+        vals = np.asarray(values)
 
-    is_datetime = is_datetime64_dtype(vals)
-    is_timedelta = is_timedelta64_dtype(vals)
     (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
 
     table = hash_klass(size_hint or len(vals))
@@ -311,13 +320,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
         uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                     assume_unique=True)
 
-    if is_datetimetz_type:
-        # reset tz
-        uniques = values._shallow_copy(uniques)
-    elif is_datetime:
-        uniques = uniques.astype('M8[ns]')
-    elif is_timedelta:
-        uniques = uniques.astype('m8[ns]')
+    if dtype is not None:
+        uniques = uniques.astype(dtype)
+
     if isinstance(values, Index):
         uniques = values._shallow_copy(uniques, name=None)
     elif isinstance(values, Series):