pandas-dev · lukovnikov · Oct 6, 2015 · Oct 6, 2015 · Oct 7, 2015 · Oct 7, 2015
diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -6,7 +6,7 @@
 from pandas.core.algorithms import factorize, match, unique, value_counts
 from pandas.core.common import isnull, notnull
 from pandas.core.categorical import Categorical
-from pandas.core.groupby import Grouper
+from pandas.core.groupby import Grouper, RandomPartitioner, Partitioner
 from pandas.core.format import set_eng_float_format
 from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3,6 +3,8 @@
 import operator
 import weakref
 import gc
+from numbers import Real
+from math import floor
 
 import numpy as np
 import pandas.lib as lib
@@ -2034,6 +2036,37 @@ def tail(self, n=5):
             return self
         return self.iloc[-n:]
 
+    def split(self, weights=(50, 50), random=False, axis=None):
+        """
+        Split this object along an axis into parts of given relative sizes.
+
+        Parameters
+        ----------
+        weights : list or tuple or equivalent, optional
+            Relative sizes of the returned parts (at least two strictly
+            positive numbers). Default = (50, 50).
+        random : boolean or int or numpy.random.RandomState, optional
+            If False or None (the default is False), the parts are consecutive
+            runs from beginning to end. If True, entries are assigned to the
+            parts at random; an int (used as seed) or a numpy RandomState
+            object does the same with that random number generator.
+        axis : int or string, optional
+            Axis to split. Accepts axis number or name. Default is stat axis
+            for given data type (0 for Series and DataFrames, 1 for Panels).
+
+        Returns
+        -------
+        tuple of objects of the same type as the original object, one per
+        non-empty partition (normally one per weight).
+        """
+        axis = self._stat_axis_number if axis is None else self._get_axis_number(axis)
+        if random is False or random is None:
+            g = pd.Partitioner(weights, axis)
+        else:
+            g = pd.RandomPartitioner(weights, axis, None if random is True else random)
+        # group along the same axis the partitioner labels
+        return self.groupby(g, axis=axis).split()
+
 
     def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None):
         """

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -4,6 +4,7 @@
 import datetime
 import collections
 import warnings
+from numbers import Real
 
 from pandas.compat import(
     zip, builtins, range, long, lzip,
@@ -296,6 +297,62 @@ def groups(self):
         return self.grouper.groups
 
 
+class Partitioner(Grouper):
+    '''
+    Grouper that cuts an axis into consecutive chunks of relative sizes
+    ``proportions`` (at least two strictly positive real weights).
+    '''
+
+    def __init__(self, proportions=(1,1), axis=None):
+        if len(proportions) < 2:
+            raise ValueError("must split into more than 1 partition")
+        for w in proportions:
+            if not (com.is_float(w) or com.is_integer(w)) or w <=0:
+                raise ValueError("weights must be strictly positive real numbers")
+        # store the weights normalised to fractions that sum to 1
+        fractions = np.asarray(proportions, dtype="float64")
+        self._proportions = fractions / fractions.sum()
+        self._axis = axis
+        self.key = None
+        super(Partitioner, self).__init__()
+
+    def _get_grouper(self, obj):
+        '''Return ``(None, BaseGrouper, obj)`` labelling each position
+        along the axis with the number of the partition it falls in.'''
+        # resolve per call (axis=None -> stat axis) without caching on self,
+        # so one Partitioner can be reused on objects of different type
+        axis = obj._stat_axis_number if self._axis is None else self._axis
+        axis = obj._get_axis_number(axis)
+        axis_length = obj.shape[axis]
+        counts = np.rint(self._proportions * axis_length).astype("int32")
+        # np.repeat instead of the builtin ``reduce`` (absent on Python 3);
+        # trim when rounding overshoots, pad with the last label otherwise
+        labels = np.repeat(np.arange(len(counts)), counts)[:axis_length]
+        fill = labels[-1] if len(labels) else len(counts) - 1
+        labels = np.append(labels, np.repeat(fill, axis_length - len(labels)))
+        self._transform(labels)
+        # pass a plain array so labels are matched by position
+        # NOTE(review): a Series appears to be re-aligned on axis labels - confirm
+        grouping = Grouping(obj._get_axis(axis), grouper=labels, obj=obj, sort=True, in_axis=True)
+        return None, BaseGrouper(axis, [grouping]), obj
+
+    def _transform(self, newcol):
+        '''Hook: subclasses may reorder ``newcol`` in place (no-op here).'''
+
+class RandomPartitioner(Partitioner):
+    '''Partitioner that shuffles the partition labels along the axis.'''
+
+    def __init__(self, proportions=(1,1), axis=None, random=None):
+        # ``random`` is handed to com._random_state; the result drives the shuffle
+        self.rs = com._random_state(random)
+        super(RandomPartitioner, self).__init__(proportions=proportions, axis=axis)
+
+    def _transform(self, newcol):
+        '''Shuffle ``newcol`` in place using ``self.rs``.'''
+        shuffle = self.rs.shuffle
+        shuffle(newcol)
+
+
 class GroupByPlot(PandasObject):
     """
     Class implementing the .plot attribute for groupby objects
@@ -658,6 +715,10 @@ def __iter__(self):
         """
         return self.grouper.get_iterator(self.obj, axis=self.axis)
 
+    def split(self):
+        """Return the groups, in iteration order, as a tuple (keys dropped)."""
+        return tuple(piece for _, piece in self)
+
     def apply(self, func, *args, **kwargs):
         """
         Apply function and combine results together in an intelligent way. The

diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -354,6 +354,12 @@ def test_head_tail(self):
             self._compare(o.head(-3), o.head(7))
             self._compare(o.tail(-3), o.tail(7))
 
+    def test_split(self):
+        # an even random split along axis 0 must yield two parts of 5 entries
+        obj = self._construct(shape=10)
+        first, second = obj.split((1, 1), axis=0, random=True)
+        self.assertTrue(first.shape[0] == 5 and second.shape[0] == 5)
+
     def test_sample(self):
         # Fixes issue: 2419
 

diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -399,8 +399,13 @@ def test_grouper_multilevel_freq(self):
                              pd.Grouper(level=1, freq='W')]).sum()
         assert_frame_equal(result, expected)
 
-    def test_grouper_creation_bug(self):
+    def test_grouper_random(self):  # 1:2 random split of 6 rows -> 2 + 4 rows
+        df = DataFrame({"A": [0,1,2,3,4,5], "b": [10,11,12,13,14,15]})
+        a, b = df.groupby(pd.RandomPartitioner((1,2))).split()
+        self.assertEqual((len(a), len(b)), (2, 4))
+        assert_frame_equal(pd.concat([a, b]).sort_index(), df)
 
+    def test_grouper_creation_bug(self):
         # GH 8795
         df = DataFrame({'A':[0,0,1,1,2,2], 'B':[1,2,3,4,5,6]})
         g = df.groupby('A')