Merge pull request #10931 from nickeubank/test_groupby_sort_preservation

jreback · jreback · commit 5dea811b2dde · 2015-09-05T18:20:24.000-04:00
Add tests to ensure sort preserved by groupby, add docs
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -160,6 +160,31 @@ only verifies that you've passed a valid mapping.
    GroupBy operations (though can't be guaranteed to be the most
    efficient). You can get quite creative with the label mapping functions.
 
+.. _groupby.sorting:
+
+GroupBy sorting
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default the group keys are sorted during the ``groupby`` operation. You may however pass ``sort=False`` for potential speedups:
+
+.. ipython:: python
+
+   df2 = pd.DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]})
+   df2.groupby(['X']).sum()
+   df2.groupby(['X'], sort=False).sum()
+
+
+Note that ``groupby`` will preserve the order in which *observations* are sorted *within* each group. For example, the rows of each group created by ``groupby()`` below are in the order they appeared in the original ``DataFrame``:
+
+.. ipython:: python
+
+   df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
+   df3.groupby(['X']).get_group('A')
+
+   df3.groupby(['X']).get_group('B')
+
+
+
 .. _groupby.attributes:
 
 GroupBy object attributes
@@ -183,14 +208,6 @@ the length of the ``groups`` dict, so it is largely just a convenience:
    grouped.groups
    len(grouped)
 
-By default the group keys are sorted during the groupby operation. You may
-however pass ``sort=False`` for potential speedups:
-
-.. ipython:: python
-
-   df2 = pd.DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]})
-   df2.groupby(['X'], sort=True).sum()
-   df2.groupby(['X'], sort=False).sum()
 
 .. _groupby.tabcompletion:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3252,11 +3252,13 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
             index. Only relevant for DataFrame input. as_index=False is
             effectively "SQL-style" grouped output
         sort : boolean, default True
-            Sort group keys. Get better performance by turning this off
+            Sort group keys. Get better performance by turning this off.
+            Note this does not influence the order of observations within each group. 
+            groupby preserves the order of rows within each group. 
         group_keys : boolean, default True
             When calling apply, add group keys to index to identify pieces
         squeeze : boolean, default False
-            reduce the dimensionaility of the return type if possible,
+            reduce the dimensionality of the return type if possible,
             otherwise return a consistent type
 
         Examples
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -5436,6 +5436,32 @@ def test_first_last_max_min_on_time_data(self):
         assert_frame_equal(grouped_ref.first(),grouped_test.first())
         assert_frame_equal(grouped_ref.last(),grouped_test.last())
 
+    def test_groupby_preserves_sort(self):
+        # Ensure groupby keeps each group's rows in the order they had in the
+        # (pre-sorted) original object. Issue #8588 and #9651
+        
+        df = DataFrame({'int_groups':[3,1,0,1,0,3,3,3], 
+                        'string_groups':['z','a','z','a','a','g','g','g'], 
+                        'ints':[8,7,4,5,2,9,1,1],
+                        'floats':[2.3,5.3,6.2,-2.4,2.2,1.1,1.1,5],
+                        'strings':['z','d','a','e','word','word2','42','47']})
+
+        # Try each sort key (single column or pair) with each grouping key(s)
+        for sort_column in ['ints', 'floats', 'strings', ['ints','floats'], 
+                  ['ints','strings']]:
+            for group_column in ['int_groups', 'string_groups', 
+                                 ['int_groups','string_groups']]:
+
+                df = df.sort_values(by=sort_column)
+
+                g = df.groupby(group_column)
+                
+                def test_sort(x):
+                    assert_frame_equal(x, x.sort_values(by=sort_column))
+    
+                g.apply(test_sort)  # each group must already be sorted
+
+
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()