ENH: groupby refactoring to use khash, add sort option, GH #595

wesm · wesm · commit f9f198ebc869 · 2012-01-11T15:39:47.000-05:00
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -383,21 +383,19 @@ def set_printoptions(precision=None, column_space=None, max_rows=None,
         out how big the terminal is and will not display more rows or/and
         columns that can fit on it.
     """
-    global GlobalPrintConfig
     if precision is not None:
-        GlobalPrintConfig.precision = precision
+        print_config.precision = precision
     if column_space is not None:
-        GlobalPrintConfig.column_space = column_space
+        print_config.column_space = column_space
     if max_rows is not None:
-        GlobalPrintConfig.max_rows = max_rows
+        print_config.max_rows = max_rows
     if max_columns is not None:
-        GlobalPrintConfig.max_columns = max_columns
+        print_config.max_columns = max_columns
     if colheader_justify is not None:
-        GlobalPrintConfig.colheader_justify = colheader_justify
+        print_config.colheader_justify = colheader_justify
 
 def reset_printoptions():
-    global GlobalPrintConfig
-    GlobalPrintConfig.reset()
+    print_config.reset()
 
 class EngFormatter(object):
     """
@@ -503,9 +501,8 @@ def set_eng_float_format(precision=None, accuracy=3, use_eng_prefix=False):
                       "being renamed to 'accuracy'" , FutureWarning)
         accuracy = precision
 
-    global GlobalPrintConfig
-    GlobalPrintConfig.float_format = EngFormatter(accuracy, use_eng_prefix)
-    GlobalPrintConfig.column_space = max(12, accuracy + 9)
+    print_config.float_format = EngFormatter(accuracy, use_eng_prefix)
+    print_config.column_space = max(12, accuracy + 9)
 
 #_float_format = None
 #_column_space = 12
@@ -526,7 +523,7 @@ def _float_format_default(v, width=None):
     to fit the width, reformat it to that width.
     """
 
-    fmt_str   = '%% .%dg' % GlobalPrintConfig.precision
+    fmt_str   = '%% .%dg' % print_config.precision
     formatted = fmt_str % v
 
     if width is None:
@@ -588,8 +585,8 @@ def _make_float_format(x):
 
         if float_format:
             formatted = float_format(x)
-        elif GlobalPrintConfig.float_format:
-            formatted = GlobalPrintConfig.float_format(x)
+        elif print_config.float_format:
+            formatted = print_config.float_format(x)
         else:
             formatted = _float_format_default(x, col_width)
 
@@ -621,7 +618,7 @@ def __init__(self):
     def reset(self):
         self.__init__()
 
-GlobalPrintConfig = _GlobalPrintConfig()
+print_config = _GlobalPrintConfig()
 
 #------------------------------------------------------------------------------
 # miscellaneous python tools
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -64,7 +64,7 @@ def __init__(self, frame, buf=None, columns=None, col_space=None,
         self.index = index
 
         if justify is None:
-            self.justify = com.GlobalPrintConfig.colheader_justify
+            self.justify = com.print_config.colheader_justify
         else:
             self.justify = justify
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -413,10 +413,12 @@ def __repr__(self):
         """
         Return a string representation for a particular DataFrame
         """
+        config = com.print_config
+
         terminal_width, terminal_height = get_terminal_size()
-        max_rows = (terminal_height if com.GlobalPrintConfig.max_rows == 0
-                    else com.GlobalPrintConfig.max_rows)
-        max_columns = com.GlobalPrintConfig.max_columns
+        max_rows = (terminal_height if config.max_rows == 0
+                    else config.max_rows)
+        max_columns = config.max_columns
 
         if max_columns > 0:
             buf = StringIO()
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -79,7 +79,7 @@ def get(self, key, default=None):
         except KeyError:
             return default
 
-    def groupby(self, by=None, axis=0, level=None, as_index=True):
+    def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True):
         """
         Group series using mapper (dict or key function, apply given function
         to group, return result as series) or by a series of columns
@@ -99,6 +99,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True):
             For aggregated output, return object with group labels as the
             index. Only relevant for DataFrame input. as_index=False is
             effectively "SQL-style" grouped output
+        sort : boolean, default True
+            Sort group keys. Pass False to skip the sort for better performance
 
         Examples
         --------
@@ -116,7 +118,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True):
         GroupBy object
         """
         from pandas.core.groupby import groupby
-        return groupby(self, by, axis=axis, level=level, as_index=as_index)
+        return groupby(self, by, axis=axis, level=level, as_index=as_index,
+                       sort=sort)
 
     def select(self, crit, axis=0):
         """
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -85,7 +85,8 @@ class GroupBy(object):
     """
 
     def __init__(self, obj, grouper=None, axis=0, level=None,
-                 groupings=None, exclusions=None, column=None, as_index=True):
+                 groupings=None, exclusions=None, column=None, as_index=True,
+                 sort=True):
         self._column = column
 
         if isinstance(obj, NDFrame):
@@ -105,10 +106,11 @@ def __init__(self, obj, grouper=None, axis=0, level=None,
 
         self.as_index = as_index
         self.grouper = grouper
+        self.sort = sort
 
         if groupings is None:
             groupings, exclusions = _get_groupings(obj, grouper, axis=axis,
-                                                   level=level)
+                                                   level=level, sort=sort)
 
         self.groupings = groupings
         self.exclusions = set(exclusions) if exclusions else set()
@@ -132,6 +134,7 @@ def indices(self):
         if len(self.groupings) == 1:
             return self.primary.indices
         else:
+            # TODO: building this Index of key tuples is massively inefficient
             to_groupby = zip(*(ping.grouper for ping in self.groupings))
             to_groupby = Index(to_groupby)
             return lib.groupby_indices(to_groupby)
@@ -149,7 +152,7 @@ def _obj_with_exclusions(self):
 
     @property
     def _group_shape(self):
-        return tuple(len(ping.counts) for ping in self.groupings)
+        return tuple(ping.ngroups for ping in self.groupings)
 
     def __getattr__(self, attr):
         if hasattr(self.obj, attr):
@@ -525,11 +528,13 @@ class Grouping(object):
       * group_index : unique groups
       * groups : dict of {group -> label_list}
     """
-    def __init__(self, index, grouper=None, name=None, level=None):
+    def __init__(self, index, grouper=None, name=None, level=None,
+                 sort=True):
         self.name = name
         self.level = level
         self.grouper = _convert_grouper(index, grouper)
         self.index = index
+        self.sort = sort
 
         # right place for this?
         if isinstance(grouper, Series) and name is None:
@@ -576,6 +581,10 @@ def __iter__(self):
     _counts = None
     _group_index = None
 
+    @property
+    def ngroups(self):
+        return len(self.group_index)
+
     @cache_readonly
     def indices(self):
         return _groupby_indices(self.grouper)
@@ -589,38 +598,58 @@ def labels(self):
     @property
     def ids(self):
         if self._ids is None:
-            if self._was_factor:
-                index = self._group_index
-                self._ids = dict(zip(range(len(index)), index))
-            else:
-                self._make_labels()
+            index = self.group_index
+            self._ids = dict(zip(range(len(index)), index))
         return self._ids
 
     @property
     def counts(self):
         if self._counts is None:
-            self._make_labels()
+            if self._was_factor:
+                self._counts = lib.group_count(self.labels, self.ngroups)
+            else:
+                self._make_labels()
         return self._counts
 
     @property
     def group_index(self):
         if self._group_index is None:
-            ids = self.ids
-            values = np.arange(len(self.ids), dtype='O')
-            self._group_index = Index(lib.lookup_values(values, ids),
-                                      name=self.name)
+            self._make_labels()
+
+            # ids = self.ids
+            # values = np.arange(len(self.ids), dtype='O')
+            # self._group_index = Index(lib.lookup_values(values, ids),
+            #                           name=self.name)
         return self._group_index
 
     def _make_labels(self):
         if self._was_factor:  # pragma: no cover
             raise Exception('Should not call this method grouping by level')
         else:
-            ids, labels, counts  = _group_labels(self.grouper)
-            sids, slabels, scounts = sort_group_labels(ids, labels, counts)
+            values = self.grouper
+            if values.dtype != np.object_:
+                values = values.astype('O')
+
+            # factorize the values with the khash-based Factorizer
+            rizer = lib.Factorizer(len(values))
+            labels, counts = rizer.factorize(values, sort=False)
+
+            uniques = Index(rizer.uniques, name=self.name)
+            if self.sort and len(counts) > 0:
+                sorter = uniques.argsort()
+                reverse_indexer = np.empty(len(sorter), dtype=np.int32)
+                reverse_indexer.put(sorter, np.arange(len(sorter)))
+
+                mask = labels < 0
+                labels = reverse_indexer.take(labels)
+                np.putmask(labels, mask, -1)
 
-        self._labels = slabels
-        self._ids = sids
-        self._counts = scounts
+                uniques = uniques.take(sorter)
+                counts = counts.take(sorter)
+
+            self._labels = labels
+            self._group_index = uniques
+            self._counts = counts
 
     _groups = None
     @property
@@ -629,7 +658,8 @@ def groups(self):
             self._groups = self.index.groupby(self.grouper)
         return self._groups
 
-def _get_groupings(obj, grouper=None, axis=0, level=None):
+
+def _get_groupings(obj, grouper=None, axis=0, level=None, sort=True):
     group_axis = obj._get_axis(axis)
 
     if level is not None and not isinstance(group_axis, MultiIndex):
@@ -655,7 +685,7 @@ def _get_groupings(obj, grouper=None, axis=0, level=None):
             exclusions.append(gpr)
             name = gpr
             gpr = obj[gpr]
-        ping = Grouping(group_axis, gpr, name=name, level=level)
+        ping = Grouping(group_axis, gpr, name=name, level=level, sort=sort)
         if ping.name is None:
             ping.name = 'key_%d' % i
         groupings.append(ping)
@@ -785,7 +815,7 @@ def _get_index():
                 index = MultiIndex.from_tuples(keys, names=key_names)
             else:
                 ping = self.groupings[0]
-                if len(keys) == len(ping.counts):
+                if len(keys) == ping.ngroups:
                     index = ping.group_index
                     index.name = key_names[0]
                 else:
@@ -1056,7 +1086,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                 key_index = MultiIndex.from_tuples(keys, names=key_names)
             else:
                 ping = self.groupings[0]
-                if len(keys) == len(ping.counts):
+                if len(keys) == ping.ngroups:
                     key_index = ping.group_index
                     key_index.name = key_names[0]
 
@@ -1235,6 +1265,9 @@ def slicer(data, slob):
             yield i, slicer(sorted_data, slice(start, end))
 
 def get_group_index(label_list, shape):
+    if len(label_list) == 1:
+        return label_list[0]
+
     n = len(label_list[0])
     group_index = np.zeros(n, dtype=int)
     mask = np.zeros(n, dtype=bool)
@@ -1353,11 +1386,6 @@ def _groupby_indices(values):
         values = values.astype('O')
     return lib.groupby_indices(values)
 
-def _group_labels(values):
-    if values.dtype != np.object_:
-        values = values.astype('O')
-    return lib.group_labels(values)
-
 def _ensure_platform_int(labels):
     if labels.dtype != np.int_:
         labels = labels.astype(np.int_)
@@ -1367,25 +1395,3 @@ def _ensure_int64(labels):
     if labels.dtype != np.int64:
         labels = labels.astype(np.int64)
     return labels
-
-def sort_group_labels(ids, labels, counts):
-    n = len(ids)
-
-    # corner all NA case
-    if n == 0:
-        return ids, labels, counts
-
-    rng = np.arange(n)
-    values = Series(ids, index=rng, dtype=object).values
-    indexer = values.argsort()
-
-    reverse_indexer = np.empty(n, dtype=np.int32)
-    reverse_indexer.put(indexer, np.arange(n))
-
-    new_labels = reverse_indexer.take(labels)
-    np.putmask(new_labels, labels == -1, -1)
-
-    new_ids = dict(izip(rng, values.take(indexer)))
-    new_counts = counts.take(indexer)
-
-    return new_ids, new_labels, new_counts
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -456,8 +456,8 @@ def __setslice__(self, i, j, value):
     def __repr__(self):
         """Clean string representation of a Series"""
         width, height = get_terminal_size()
-        max_rows = (height if com.GlobalPrintConfig.max_rows == 0
-                    else com.GlobalPrintConfig.max_rows)
+        max_rows = (height if com.print_config.max_rows == 0
+                    else com.print_config.max_rows)
         if len(self.index) > max_rows:
             result = self._tidy_repr(min(30, max_rows - 4))
         elif len(self.index) > 0:
@@ -518,7 +518,7 @@ def _get_repr(self, name=False, print_header=False, length=True,
         padSpace = min(maxlen, 60)
 
         if float_format is None:
-            float_format = com.GlobalPrintConfig.float_format
+            float_format = com.print_config.float_format
             if float_format is None:
                 float_format = com._float_format_default
 
diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx
diff --git a/pandas/src/hashtable.pyx b/pandas/src/hashtable.pyx
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py