-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Initial pass at implementing DataFrame.asof, GH 2941 #10266
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,7 +34,7 @@ | |
from pandas.core.indexing import (maybe_droplevels, | ||
convert_to_index_sliceable, | ||
check_bool_indexer) | ||
from pandas.core.internals import (BlockManager, | ||
from pandas.core.internals import (BlockManager, make_block, | ||
create_block_manager_from_arrays, | ||
create_block_manager_from_blocks) | ||
from pandas.core.series import Series | ||
|
@@ -2737,6 +2737,69 @@ def _maybe_casted_values(index, labels=None): | |
#---------------------------------------------------------------------- | ||
# Reindex-based selection methods | ||
|
||
def asof(self, where, skipna='percolumn'): | ||
""" | ||
Return last good (non-null) value for each column of DataFrame for the | ||
requested dates. Definition of 'good' value controlled by skipna argument. | ||
If there is no good value, NaN is returned. | ||
Parameters | ||
---------- | ||
where : date or sequence of dates | ||
skipna : {'any', 'all', 'none', 'percolumn'}, default 'percolumn' | ||
* any: Ignore/skip rows where any of the columns are null. | ||
* all: Ignore/skip rows where all of the columns are null. | ||
* none: Don't ignore/skip any rows. | ||
* percolumn: Ignore/skip null rows for each column separately. | ||
Equivalent to df.apply(lambda s: s.asof(where)). | ||
Notes | ||
----- | ||
Dates are assumed to be sorted | ||
Returns | ||
------- | ||
Series if where is a date, DataFrame if where is a sequence of dates. | ||
""" | ||
if isinstance(where, compat.string_types): | ||
where = datetools.to_datetime(where) | ||
|
||
if skipna == 'percolumn': | ||
return self.apply(lambda s: s.asof(where)) | ||
elif skipna == 'none': | ||
row_mask = np.ones((self.shape[0],), dtype=np.bool) | ||
elif skipna == 'any': | ||
row_mask = ~(self.isnull().any(axis=1).values) | ||
elif skipna == 'all': | ||
row_mask = ~(self.isnull().all(axis=1).values) | ||
else: | ||
raise ValueError("skipna must be one of percolumn, none, any, all.") | ||
|
||
if not hasattr(where, '__iter__'): | ||
loc = self.index.asof_locs(Index([where]), row_mask)[0] | ||
if loc == -1: | ||
return Series(index=self.columns, data=np.nan) | ||
|
||
s = self.iloc[loc, :].copy() | ||
s.name = None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The name of this series should probably be the |
||
return s | ||
|
||
locs = self.index.asof_locs(where, row_mask) | ||
|
||
new_blocks = [] | ||
for block in self._data.blocks: | ||
new_values = com.take_2d_multi(block.values, [None, locs]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm presuming you're using this because it speeds things up significantly? In general I try to avoid using the block level interface unless necessary, but I can see how that would make a difference here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any block logic MUST be in internals, and not here. |
||
# can we use make_block_same_class? not sure how that interacts with | ||
# needing to cast an int to a float once you get missings | ||
#b = block.make_block_same_class(new_values, block.mgr_locs) | ||
new_block = make_block(new_values, block.mgr_locs) | ||
new_blocks.append(new_block) | ||
new_mgr = create_block_manager_from_blocks(new_blocks, | ||
[self._data.axes[0], where]) | ||
new_df = self._constructor(new_mgr) | ||
return new_df | ||
|
||
def dropna(self, axis=0, how='any', thresh=None, subset=None, | ||
inplace=False): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2270,6 +2270,83 @@ def test_get_axis(self): | |
assertRaisesRegexp(ValueError, 'No axis.*None', f._get_axis_name, None) | ||
assertRaisesRegexp(ValueError, 'No axis named', f._get_axis_number, None) | ||
|
||
def test_asof(self): | ||
dates = date_range('2014/01/02', periods=4, freq='3D') | ||
df = pd.DataFrame(data={'a': ["a", None, "b", "c"], | ||
'b': [1, None, 2, 3], | ||
'c': [1, None, None, 3], | ||
'd': [None, None, 2, 3]}, | ||
index=dates) | ||
|
||
test_dates = date_range('2014/01/01', periods=5, freq='3D') | ||
|
||
# test using skipna = none, the simplest case | ||
result_skipna_none = df.asof(test_dates, skipna='none') | ||
# make sure the index matches | ||
self.assertTrue((result_skipna_none.index == test_dates).all()) | ||
# compare with the expected frame | ||
expected_result = pd.DataFrame(data={'a': [None, "a", None, "b", "c"], | ||
'b': [None, 1, None, 2, 3], | ||
'c': [None, 1, None, None, 3], | ||
'd': [None, None, None, 2, 3]}, | ||
index=test_dates) | ||
assert_frame_equal(result_skipna_none, expected_result) | ||
|
||
# test using skipna=any | ||
result_skipna_any = df.asof(test_dates, skipna='any') | ||
# compare with the expected result | ||
expected_result = pd.DataFrame(data={'a': [None, None, None, None, "c"], | ||
'b': [None, None, None, None, 3], | ||
'c': [None, None, None, None, 3], | ||
'd': [None, None, None, None, 3]}, | ||
index=test_dates) | ||
assert_frame_equal(result_skipna_any, expected_result) | ||
|
||
result_skipna_all = df.asof(test_dates, skipna='all') | ||
# compare with expected result | ||
expected_result = pd.DataFrame(data={'a': [None, "a", "a", "b", "c"], | ||
'b': [None, 1, 1, 2, 3], | ||
'c': [None, 1, 1, None, 3], | ||
'd': [None, None, None, 2, 3]}, | ||
index=test_dates) | ||
assert_frame_equal(result_skipna_all, expected_result) | ||
|
||
# finally the most complicated case, skipna=percolumn | ||
result_skipna_percolumn = df.asof(test_dates, skipna='percolumn') | ||
# compare with expected result | ||
expected_result = pd.DataFrame(data={'a': [None, "a", "a", "b", "c"], | ||
'b': [None, 1, 1, 2, 3], | ||
'c': [None, 1, 1, 1, 3], | ||
'd': [None, None, None, 2, 3]}, | ||
index=test_dates) | ||
assert_frame_equal(result_skipna_percolumn, expected_result) | ||
|
||
# test calling with scalar values | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you add a consistency check to verify that using a scalar value is consistent with using a list of values and taking the first row? e.g., |
||
s1 = df.asof(test_dates[0], skipna='none') | ||
self.assertIsNone(s1.name) | ||
self.assertTrue(isnull(s1).all()) | ||
|
||
s2 = df.asof(test_dates[2], skipna='none') | ||
self.assertIsNone(s2.name) | ||
s2_expected = result_skipna_none.iloc[2, :] | ||
s2_expected.name = None | ||
assert_series_equal(s2_expected, s2) | ||
|
||
s3 = df.asof(test_dates[2], skipna='any') | ||
self.assertIsNone(s3.name) | ||
self.assertTrue(isnull(s3).all()) | ||
|
||
s4 = df.asof(test_dates[2], skipna='all') | ||
self.assertIsNone(s4.name) | ||
s4_expected = result_skipna_all.iloc[2, :] | ||
s4_expected.name = None | ||
assert_series_equal(s4_expected, s4) | ||
|
||
s5 = df.asof(test_dates[2], skipna='percolumn') | ||
self.assertIsNone(s5.name) | ||
s5_expected = df.apply(lambda s: s.asof(test_dates[2])) | ||
assert_series_equal(s5_expected, s5) | ||
|
||
def test_set_index(self): | ||
idx = Index(np.arange(len(self.mixed_frame))) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might be simpler just to call
self.asof([where]).iloc[0]
-- though you might need to disable the "setting item with copy warning" in that case.