diff --git a/pandas/core/api.py b/pandas/core/api.py
index e2ac57e37cba6..e34895af9640c 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -6,7 +6,7 @@
 from pandas.core.algorithms import factorize, match, unique, value_counts
 from pandas.core.common import isnull, notnull
 from pandas.core.categorical import Categorical
-from pandas.core.groupby import Grouper
+from pandas.core.groupby import Grouper, RandomPartitioner, Partitioner
 from pandas.core.format import set_eng_float_format
 from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 2def8180a43e4..246ebcb6cc953 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3,6 +3,8 @@
 import operator
 import weakref
 import gc
+from numbers import Real
+from math import floor
 
 import numpy as np
 import pandas.lib as lib
@@ -2034,6 +2036,45 @@ def tail(self, n=5):
             return self
         return self.iloc[-n:]
 
+    def split(self, weights=(50, 50), random=False, axis=None):
+        """
+        Split this object along an axis into several objects.
+
+        Parameters
+        ----------
+        weights : list or tuple or equivalent, optional
+            Relative sizes of the returned objects. At least two strictly
+            positive numbers are required.
+            Default = (50, 50).
+        random : boolean or int or numpy.random.RandomState, optional
+            If False (the default) or None, makes consecutive splits from
+            beginning to end. If True, the elements are assigned to the
+            splits at random. An int is used as seed for the random number
+            generator; a numpy RandomState object is used directly.
+        axis : int or string, optional
+            Axis to split. Accepts axis number or name. Default is stat axis
+            for given data type (0 for Series and DataFrames, 1 for Panels).
+
+        Returns
+        -------
+        A tuple of objects of the same type as the original object, one per
+        weight. A split that would be empty (more weights than elements on
+        the axis) is left out.
+        """
+        # Resolve the axis once: the partitioner and the groupby must work
+        # along the same axis.
+        if axis is None:
+            axis = self._stat_axis_number
+        axis = self._get_axis_number(axis)
+
+        if random is False or random is None:
+            partitioner = pd.Partitioner(weights, axis)
+        else:
+            # True means "no explicit seed", which the partitioner spells None
+            seed = None if random is True else random
+            partitioner = pd.RandomPartitioner(weights, axis, seed)
+        return self.groupby(partitioner, axis=axis).split()
+
     def sample(self, n=None, frac=None, replace=False, weights=None,
                random_state=None, axis=None):
         """
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index add5080a69ee4..d8f5b33a1ad35 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -4,6 +4,7 @@
 import datetime
 import collections
 import warnings
+from numbers import Real
 
 from pandas.compat import(
     zip, builtins, range, long, lzip,
@@ -296,6 +297,92 @@ def groups(self):
         return self.grouper.groups
 
 
+class Partitioner(Grouper):
+    """
+    Grouper that cuts an axis into consecutive partitions whose sizes follow
+    the given relative proportions.
+
+    Parameters
+    ----------
+    proportions : list or tuple or equivalent, default (1, 1)
+        Relative sizes of the partitions. At least two strictly positive
+        numbers are required.
+    axis : int or string, optional
+        Axis to partition. Defaults to the stat axis of the grouped object.
+    """
+
+    def __init__(self, proportions=(1, 1), axis=None):
+        if len(proportions) < 2:
+            raise ValueError("must split into more than 1 partition")
+        for w in proportions:
+            # ``not w > 0`` also rejects NaN, which ``w <= 0`` lets through
+            if not (com.is_float(w) or com.is_integer(w)) or not w > 0:
+                raise ValueError("weights must be strictly positive real "
+                                 "numbers")
+
+        super(Partitioner, self).__init__()
+        self._axis = axis
+        # normalise the weights into fractions of the axis length
+        proportions = np.asarray(proportions, dtype="float64")
+        self._proportions = proportions / proportions.sum()
+
+    def _get_grouper(self, obj):
+        # Resolve the axis locally, so the partitioner stays reusable on
+        # objects that have a different default axis.
+        axis = self._axis
+        if axis is None:
+            axis = obj._stat_axis_number
+        axis = obj._get_axis_number(axis)
+        axis_length = obj.shape[axis]
+
+        # Round the cumulative boundaries instead of the individual sizes, so
+        # the partition sizes always add up to exactly the axis length.
+        bounds = np.rint(np.cumsum(self._proportions) * axis_length)
+        bounds = bounds.astype("int64")
+        bounds[-1] = axis_length
+        sizes = np.diff(np.concatenate(([0], bounds)))
+        labels = np.repeat(np.arange(len(sizes)), sizes)
+
+        self._transform(labels)
+
+        # Pass a plain array: the labels belong to positions on the axis,
+        # whereas a Series would be matched on its own index.
+        grouping = Grouping(obj._get_axis(axis), grouper=labels, obj=obj,
+                            sort=True, in_axis=True)
+
+        return None, BaseGrouper(axis, [grouping]), obj
+
+    def _transform(self, labels):
+        """
+        Hook for subclasses to rearrange the partition labels in place.
+        """
+        pass
+
+
+class RandomPartitioner(Partitioner):
+    """
+    Partitioner that hands out the partition labels at random, keeping the
+    partition sizes given by the proportions.
+
+    Parameters
+    ----------
+    proportions : list or tuple or equivalent, default (1, 1)
+        Relative sizes of the partitions.
+    axis : int or string, optional
+        Axis to partition. Defaults to the stat axis of the grouped object.
+    random : int or numpy.random.RandomState, optional
+        Seed or random state used to shuffle the labels.
+    """
+
+    def __init__(self, proportions=(1, 1), axis=None, random=None):
+        # Process random_state argument
+        self.rs = com._random_state(random)
+        super(RandomPartitioner, self).__init__(proportions, axis)
+
+    def _transform(self, labels):
+        self.rs.shuffle(labels)
+
+
 class GroupByPlot(PandasObject):
     """
     Class implementing the .plot attribute for groupby objects
@@ -658,6 +745,12 @@ def __iter__(self):
         """
         return self.grouper.get_iterator(self.obj, axis=self.axis)
 
+    def split(self):
+        """
+        Return the groups as a tuple, in iteration order.
+        """
+        return tuple(group for _, group in self)
+
     def apply(self, func, *args, **kwargs):
         """
         Apply function and combine results together in an intelligent way. The
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index 3a26be2ca1032..d8860b51d7c6e 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -354,6 +354,19 @@ def test_head_tail(self):
         self._compare(o.head(-3), o.head(7))
         self._compare(o.tail(-3), o.tail(7))
 
+    def test_split(self):
+        o = self._construct(shape=10)
+
+        # consecutive and random splits have the same sizes
+        for random in [False, True, 42]:
+            a, b = o.split((1, 1), axis=0, random=random)
+            self.assertEqual(a.shape[0], 5)
+            self.assertEqual(b.shape[0], 5)
+
+        # weights that do not divide the axis evenly still cover it
+        parts = o.split((1, 1, 1), axis=0)
+        self.assertEqual(sum(p.shape[0] for p in parts), 10)
+
     def test_sample(self):
         # Fixes issue: 2419
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 8eb641ce8f494..7a66da080ddf8 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -399,8 +399,18 @@ def test_grouper_multilevel_freq(self):
                              pd.Grouper(level=1, freq='W')]).sum()
         assert_frame_equal(result, expected)
 
+    def test_grouper_random(self):
+        df = DataFrame({"A": [0, 1, 2, 3, 4, 5],
+                        "b": [10, 11, 12, 13, 14, 15]})
+        g = df.groupby(pd.RandomPartitioner((1, 2)))
+        a, b = g.split()
+        self.assertEqual(len(a), 2)
+        self.assertEqual(len(b), 4)
+        # together the partitions hold every row exactly once
+        assert_frame_equal(pd.concat([a, b]).sort_index(), df)
+
     def test_grouper_creation_bug(self):
 
         # GH 8795
         df = DataFrame({'A':[0,0,1,1,2,2], 'B':[1,2,3,4,5,6]})
         g = df.groupby('A')