Skip to content

added random_split in generic.py, for DataFrames etc. #11253

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pandas.core.algorithms import factorize, match, unique, value_counts
from pandas.core.common import isnull, notnull
from pandas.core.categorical import Categorical
from pandas.core.groupby import Grouper
from pandas.core.groupby import Grouper, RandomGrouper, OrderedGrouper
from pandas.core.format import set_eng_float_format
from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex

Expand Down
33 changes: 33 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import operator
import weakref
import gc
from numbers import Real
from math import floor

import numpy as np
import pandas.lib as lib
Expand Down Expand Up @@ -2034,6 +2036,37 @@ def tail(self, n=5):
return self
return self.iloc[-n:]

def split(self, weights=(50, 50), random=False, axis=None):
    """
    Split this object along an axis into several objects of the same type.

    By default the pieces are consecutive chunks taken from beginning to
    end; pass ``random`` to distribute the entries randomly instead.

    Parameters
    ----------
    weights : list or tuple or equivalent, optional
        The passed collection of weights serves as relative sizes of the
        splits of the returned datasets.
        Default = (50, 50).
    random : boolean or int or numpy.random.RandomState, optional
        If False (=default value) or None, makes consecutive splits from
        beginning to end.
        If True, splits randomly with the default random behavior.
        Any other value (an int seed or a numpy RandomState object) is
        handed to the random grouper to drive the random split.
        Default = False.
    axis : int or string, optional
        Axis to split. Accepts axis number or name. Default is stat axis
        for given data type (0 for Series and DataFrames, 1 for Panels).

    Returns
    -------
    tuple
        Objects of the same type as the original object, as produced by
        grouping on the split labels. The intent is one object per
        weight provided as parameter.
    """
    if random is False or random is None:
        grouper = pd.OrderedGrouper(weights, axis)
    else:
        # Compare by identity: the integer seeds 0 and 1 must not be
        # mistaken for the booleans False and True.
        seed = None if random is True else random
        grouper = pd.RandomGrouper(weights, axis, seed)
    return self.groupby(grouper).split()


def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None):
"""
Expand Down
56 changes: 56 additions & 0 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import datetime
import collections
import warnings
from numbers import Real

from pandas.compat import(
zip, builtins, range, long, lzip,
Expand Down Expand Up @@ -296,6 +297,57 @@ def groups(self):
return self.grouper.groups


class OrderedGrouper(Grouper):
    """
    Grouper that partitions an axis into consecutive chunks.

    Every position along the axis gets an integer group label
    (0, 1, 2, ...). Labels are laid out in order, so group 0 covers the
    first positions, group 1 the next ones, and so on. Subclasses may
    rearrange the labels by overriding ``_processidxs``.

    Parameters
    ----------
    proportions : sequence of real numbers, default (1, 1)
        Relative sizes of the chunks. They are normalised so that they
        sum to 1. At least two strictly positive values are required.
    axis : int or string, optional
        Axis to partition. If None, the stat axis of the grouped object
        is used.

    Raises
    ------
    ValueError
        If fewer than two proportions are given, or if any of them is
        not a strictly positive real number.
    """

    def __init__(self, proportions=(1, 1), axis=None):
        # Materialise once so that any iterable (not only sized
        # sequences) can be validated and then normalised.
        proportions = list(proportions)
        if len(proportions) < 2:
            raise ValueError("must split into more than 1 partition")
        for w in proportions:
            if not isinstance(w, Real) or w <= 0:
                raise ValueError("weights must be strictly positive real numbers")

        # Store the proportions as fractions of the whole. The float
        # total keeps the division a true division on Python 2 as well.
        total = float(sum(proportions))
        self._proportions = [w / total for w in proportions]
        self._axis = axis
        self.key = None
        super(OrderedGrouper, self).__init__()

    def _get_grouper(self, obj):
        """
        Build the grouping of ``obj`` along the configured axis.

        Parameters
        ----------
        obj : the object being grouped

        Returns
        -------
        tuple
            ``(None, BaseGrouper, obj)``
        """
        if self._axis is None:
            self._axis = obj._stat_axis_number
        self._axis = obj._get_axis_number(self._axis)
        axis_length = obj.shape[self._axis]

        newcol = self._build_labels(axis_length)
        self._processidxs(newcol)

        # Index the labels by the object's own axis so that they are
        # matched to positions rather than to default 0..n-1 labels.
        axis_index = obj._get_axis(self._axis)
        grouping = Grouping(axis_index, grouper=Series(newcol, index=axis_index),
                            obj=obj, sort=True, in_axis=True)

        return None, BaseGrouper(self._axis, [grouping]), obj

    def _build_labels(self, axis_length):
        """Return a list of exactly ``axis_length`` ordered group labels."""
        numbers = [int(round(prop * axis_length)) for prop in self._proportions]

        newcol = []
        for label, count in enumerate(numbers):
            newcol.extend([label] * count)

        # Rounding can overshoot the axis length (e.g. two equal parts of
        # 3 entries round to 2 + 2): drop the surplus from the end.
        del newcol[axis_length:]

        # Rounding can also undershoot: pad with the last label used, or
        # with the last group when every rounded size came out as 0.
        missing = axis_length - len(newcol)
        if missing > 0:
            filler = newcol[-1] if newcol else len(numbers) - 1
            newcol.extend([filler] * missing)
        return newcol

    def _processidxs(self, newcol):
        """
        Hook for subclasses to rearrange the label list in place.

        The ordered grouper leaves the labels untouched.
        """
        pass

class RandomGrouper(OrderedGrouper):
    """
    OrderedGrouper variant that shuffles the group labels.

    Parameters
    ----------
    proportions : sequence of real numbers, default (1, 1)
        Relative sizes of the groups, forwarded to OrderedGrouper.
    axis : int or string, optional
        Axis to partition, forwarded to OrderedGrouper.
    random : optional
        Passed to ``com._random_state``; the result is kept as
        ``self.rs`` and must provide a ``shuffle`` method.
        NOTE(review): presumably None, an int seed or a numpy
        RandomState are accepted -- confirm against com._random_state.
    """

    def __init__(self, proportions=(1, 1), axis=None, random=None):
        # Resolve the random argument first, then let the parent class
        # validate and normalise the proportions.
        self.rs = com._random_state(random)
        super(RandomGrouper, self).__init__(proportions=proportions, axis=axis)

    def _processidxs(self, newcol):
        """Shuffle the label list in place using the stored random state."""
        self.rs.shuffle(newcol)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure what this is for

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

which part?
._processidxs() or the .shuffle()?



class GroupByPlot(PandasObject):
"""
Class implementing the .plot attribute for groupby objects
Expand Down Expand Up @@ -658,6 +710,10 @@ def __iter__(self):
"""
return self.grouper.get_iterator(self.obj, axis=self.axis)

def split(self):
    """
    Return the groups as a tuple, in iteration order.

    Iterating over this object yields ``(key, group)`` pairs; the keys
    are discarded and only the group objects are returned.
    """
    return tuple(group for _, group in self)

def apply(self, func, *args, **kwargs):
"""
Apply function and combine results together in an intelligent way. The
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,12 @@ def test_head_tail(self):
self._compare(o.head(-3), o.head(7))
self._compare(o.tail(-3), o.tail(7))

def test_split(self):
    # An even (1, 1) random split of 10 entries along axis 0 must give
    # two pieces of 5 entries each.
    obj = self._construct(shape=10)
    pieces = obj.split((1, 1), axis=0, random=True)
    first, second = pieces
    self.assertTrue(first.shape[0] == 5)
    self.assertTrue(second.shape[0] == 5)

def test_sample(self):
# Fixes issue: 2419

Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,8 +399,13 @@ def test_grouper_multilevel_freq(self):
pd.Grouper(level=1, freq='W')]).sum()
assert_frame_equal(result, expected)

def test_grouper_creation_bug(self):
def test_grouper_random(self):
    # A (1, 2) random split of 6 rows must give pieces of 2 and 4 rows
    # that together contain every original row exactly once.
    df = DataFrame({"A": [0, 1, 2, 3, 4, 5], "b": [10, 11, 12, 13, 14, 15]})
    g = df.groupby(pd.RandomGrouper((1, 2)))
    a, b = g.split()
    self.assertEqual(len(a), 2)
    self.assertEqual(len(b), 4)
    # The rows are shuffled between the pieces, so restore the original
    # order before comparing against the input frame.
    assert_frame_equal(pd.concat([a, b]).sort_index(), df)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

your test is not testing anything

obviously need lots more tests

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, I know, this was just a placeholder


def test_grouper_creation_bug(self):
# GH 8795
df = DataFrame({'A':[0,0,1,1,2,2], 'B':[1,2,3,4,5,6]})
g = df.groupby('A')
Expand Down