Skip to content

ENH: Initial pass at implementing DataFrame.asof, GH 2941 #10266

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 64 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from pandas.core.indexing import (maybe_droplevels,
convert_to_index_sliceable,
check_bool_indexer)
from pandas.core.internals import (BlockManager,
from pandas.core.internals import (BlockManager, make_block,
create_block_manager_from_arrays,
create_block_manager_from_blocks)
from pandas.core.series import Series
Expand Down Expand Up @@ -2737,6 +2737,69 @@ def _maybe_casted_values(index, labels=None):
#----------------------------------------------------------------------
# Reindex-based selection methods

def asof(self, where, skipna='percolumn'):
    """
    Return the last good (non-null) value for each column of the DataFrame
    as of the requested dates.

    The definition of a 'good' value is controlled by the ``skipna``
    argument. If there is no good value, NaN is returned.

    Parameters
    ----------
    where : date or sequence of dates
        A string is first converted with ``datetools.to_datetime``.
        A sequence that is not already an ``Index`` is wrapped in one.
    skipna : {'any', 'all', 'none', 'percolumn'}, default 'percolumn'
        * any: Ignore/skip rows where any of the columns are null.
        * all: Ignore/skip rows where all of the columns are null.
        * none: Don't ignore/skip any rows.
        * percolumn: Ignore/skip null rows for each column separately.
          Equivalent to df.apply(lambda s: s.asof(where)).

    Returns
    -------
    Series if where is a date, DataFrame if where is a sequence of dates.

    Raises
    ------
    ValueError
        If ``skipna`` is not one of 'percolumn', 'none', 'any', 'all'.

    Notes
    -----
    Dates are assumed to be sorted
    """
    if isinstance(where, compat.string_types):
        where = datetools.to_datetime(where)

    # Build a boolean mask over the rows saying which rows are eligible
    # to be returned; 'percolumn' is delegated to Series.asof instead.
    if skipna == 'percolumn':
        return self.apply(lambda s: s.asof(where))
    elif skipna == 'none':
        # Builtin ``bool`` rather than the ``np.bool`` alias, which newer
        # NumPy releases deprecated and then removed.
        row_mask = np.ones((self.shape[0],), dtype=bool)
    elif skipna == 'any':
        row_mask = ~(self.isnull().any(axis=1).values)
    elif skipna == 'all':
        row_mask = ~(self.isnull().all(axis=1).values)
    else:
        raise ValueError("skipna must be one of percolumn, none, any, all.")

    if not hasattr(where, '__iter__'):
        # Scalar ``where``: look up a single position and return a Series.
        # NOTE(review): a reviewer suggested this branch might be simpler
        # as self.asof([where]).iloc[0], though that may require disabling
        # the "setting item with copy" warning.
        loc = self.index.asof_locs(Index([where]), row_mask)[0]
        if loc == -1:
            # -1 is treated as "no eligible row": return an all-NaN row.
            return Series(index=self.columns, data=np.nan)

        s = self.iloc[loc, :].copy()
        # NOTE(review): a reviewer suggested the name should probably be
        # the ``where`` argument, so that the scalar result is consistent
        # with the matching row of the sequence result. Left as None here.
        s.name = None
        return s

    # Sequence ``where``: make sure it is an Index, since it is handed to
    # asof_locs and also becomes an axis of the resulting block manager.
    if not isinstance(where, Index):
        where = Index(where)

    locs = self.index.asof_locs(where, row_mask)

    # NOTE(review): reviewers flagged the use of the block-level interface
    # here; one asked whether it is needed for speed, another stated that
    # any block logic must live in internals rather than in this method.
    new_blocks = []
    for block in self._data.blocks:
        new_values = com.take_2d_multi(block.values, [None, locs])
        # TODO: could block.make_block_same_class(new_values,
        # block.mgr_locs) be used instead? Not sure how that interacts
        # with needing to cast an int to a float once you get missings.
        new_block = make_block(new_values, block.mgr_locs)
        new_blocks.append(new_block)

    # Keep the first manager axis as is; ``where`` becomes the second one.
    new_mgr = create_block_manager_from_blocks(new_blocks,
                                               [self._data.axes[0], where])
    new_df = self._constructor(new_mgr)
    return new_df

def dropna(self, axis=0, how='any', thresh=None, subset=None,
inplace=False):
"""
Expand Down
77 changes: 77 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2270,6 +2270,83 @@ def test_get_axis(self):
assertRaisesRegexp(ValueError, 'No axis.*None', f._get_axis_name, None)
assertRaisesRegexp(ValueError, 'No axis named', f._get_axis_number, None)

def test_asof(self):
    # Checks DataFrame.asof for each skipna mode, first with a sequence of
    # dates (DataFrame result) and then with single dates (Series result).
    source_index = date_range('2014/01/02', periods=4, freq='3D')
    df = pd.DataFrame(data={'a': ["a", None, "b", "c"],
                            'b': [1, None, 2, 3],
                            'c': [1, None, None, 3],
                            'd': [None, None, 2, 3]},
                      index=source_index)

    test_dates = date_range('2014/01/01', periods=5, freq='3D')

    def expected_frame(a, b, c, d):
        # Expected frames all share the query dates as their index.
        return pd.DataFrame(data={'a': a, 'b': b, 'c': c, 'd': d},
                            index=test_dates)

    # skipna='none' is the simplest case
    result_skipna_none = df.asof(test_dates, skipna='none')
    # the result must be indexed by the requested dates
    index_matches = (result_skipna_none.index == test_dates).all()
    self.assertTrue(index_matches)
    assert_frame_equal(
        result_skipna_none,
        expected_frame(a=[None, "a", None, "b", "c"],
                       b=[None, 1, None, 2, 3],
                       c=[None, 1, None, None, 3],
                       d=[None, None, None, 2, 3]))

    # skipna='any'
    result_skipna_any = df.asof(test_dates, skipna='any')
    assert_frame_equal(
        result_skipna_any,
        expected_frame(a=[None] * 4 + ["c"],
                       b=[None] * 4 + [3],
                       c=[None] * 4 + [3],
                       d=[None] * 4 + [3]))

    # skipna='all'
    result_skipna_all = df.asof(test_dates, skipna='all')
    assert_frame_equal(
        result_skipna_all,
        expected_frame(a=[None, "a", "a", "b", "c"],
                       b=[None, 1, 1, 2, 3],
                       c=[None, 1, 1, None, 3],
                       d=[None, None, None, 2, 3]))

    # skipna='percolumn', the most complicated case
    result_skipna_percolumn = df.asof(test_dates, skipna='percolumn')
    assert_frame_equal(
        result_skipna_percolumn,
        expected_frame(a=[None, "a", "a", "b", "c"],
                       b=[None, 1, 1, 2, 3],
                       c=[None, 1, 1, 1, 3],
                       d=[None, None, None, 2, 3]))

    # Scalar ``where`` values.
    # NOTE(review): a reviewer asked for a consistency check that a scalar
    # lookup matches a one-element list lookup followed by taking the first
    # row, e.g. assert_series_equal(df.where(t), df.where([t]).iloc[0])
    # (written with df.where in the review; presumably df.asof was meant).
    first_date = test_dates[0]
    third_date = test_dates[2]

    s1 = df.asof(first_date, skipna='none')
    self.assertIsNone(s1.name)
    self.assertTrue(isnull(s1).all())

    s2 = df.asof(third_date, skipna='none')
    self.assertIsNone(s2.name)
    s2_expected = result_skipna_none.iloc[2, :]
    s2_expected.name = None
    assert_series_equal(s2_expected, s2)

    s3 = df.asof(third_date, skipna='any')
    self.assertIsNone(s3.name)
    self.assertTrue(isnull(s3).all())

    s4 = df.asof(third_date, skipna='all')
    self.assertIsNone(s4.name)
    s4_expected = result_skipna_all.iloc[2, :]
    s4_expected.name = None
    assert_series_equal(s4_expected, s4)

    s5 = df.asof(third_date, skipna='percolumn')
    self.assertIsNone(s5.name)
    s5_expected = df.apply(lambda s: s.asof(test_dates[2]))
    assert_series_equal(s5_expected, s5)

def test_set_index(self):
idx = Index(np.arange(len(self.mixed_frame)))

Expand Down