From 5c925454814a93e42d2c5b6faffed4756821df90 Mon Sep 17 00:00:00 2001 From: Bernard Willers Date: Tue, 2 Jun 2015 21:15:12 -0400 Subject: [PATCH] ENH: Initial pass at implementing DataFrame.asof, GH 2941 Implements DataFrame.asof with various possible logics for skipping missing elements. Default case is equivalent to df.apply(lambda s: s.asof(where)) --- pandas/core/frame.py | 65 +++++++++++++++++++++++++++++++- pandas/tests/test_frame.py | 77 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c2a705ea12608..f47f20b631a25 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -34,7 +34,7 @@ from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, check_bool_indexer) -from pandas.core.internals import (BlockManager, +from pandas.core.internals import (BlockManager, make_block, create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series @@ -2737,6 +2737,69 @@ def _maybe_casted_values(index, labels=None): #---------------------------------------------------------------------- # Reindex-based selection methods + def asof(self, where, skipna='percolumn'): + """ + Return last good (non-null) value for each column of DataFrame for the + requested dates. What counts as 'good' is set by the skipna argument. + + If there is no good value, NaN is returned. + + Parameters + ---------- + where : date or sequence of dates + skipna : {'any', 'all', 'none', 'percolumn'}, default 'percolumn' + * any: Ignore/skip rows where any of the columns are null. + * all: Ignore/skip rows where all of the columns are null. + * none: Don't ignore/skip any rows. + * percolumn: Ignore/skip null rows for each column separately. + Equivalent to df.apply(lambda s: s.asof(where)). 
+ + Notes + ----- + Dates are assumed to be sorted. + + Returns + ------- + Series if where is a date, DataFrame if where is a sequence of dates. + """ + if isinstance(where, compat.string_types): + where = datetools.to_datetime(where) + + if skipna == 'percolumn': + return self.apply(lambda s: s.asof(where)) + elif skipna == 'none': + row_mask = np.ones((self.shape[0],), dtype=np.bool) + elif skipna == 'any': + row_mask = ~(self.isnull().any(axis=1).values) + elif skipna == 'all': + row_mask = ~(self.isnull().all(axis=1).values) + else: + raise ValueError("skipna must be one of percolumn, none, any, all.") + + if not hasattr(where, '__iter__'): + loc = self.index.asof_locs(Index([where]), row_mask)[0] + if loc == -1: + return Series(index=self.columns, data=np.nan) + + s = self.iloc[loc, :].copy() + s.name = None + return s + + locs = self.index.asof_locs(where, row_mask) + + new_blocks = [] + for block in self._data.blocks: + new_values = com.take_2d_multi(block.values, [None, locs]) + # can we use make_block_same_class? 
not sure how that interacts with + # needing to cast an int to a float once you get missings + #b = block.make_block_same_class(new_values, block.mgr_locs) + new_block = make_block(new_values, block.mgr_locs) + new_blocks.append(new_block) + new_mgr = create_block_manager_from_blocks(new_blocks, + [self._data.axes[0], where]) + new_df = self._constructor(new_mgr) + return new_df + def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False): """ diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 4b1954a3be64e..928a2b93faa77 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2270,6 +2270,83 @@ def test_get_axis(self): assertRaisesRegexp(ValueError, 'No axis.*None', f._get_axis_name, None) assertRaisesRegexp(ValueError, 'No axis named', f._get_axis_number, None) + def test_asof(self): + dates = date_range('2014/01/02', periods=4, freq='3D') + df = pd.DataFrame(data={'a': ["a", None, "b", "c"], + 'b': [1, None, 2, 3], + 'c': [1, None, None, 3], + 'd': [None, None, 2, 3]}, + index=dates) + + test_dates = date_range('2014/01/01', periods=5, freq='3D') + + # test using skipna = none, the simplest case + result_skipna_none = df.asof(test_dates, skipna='none') + # make sure the index matches + self.assertTrue((result_skipna_none.index == test_dates).all()) + # compare with the expected frame + expected_result = pd.DataFrame(data={'a': [None, "a", None, "b", "c"], + 'b': [None, 1, None, 2, 3], + 'c': [None, 1, None, None, 3], + 'd': [None, None, None, 2, 3]}, + index=test_dates) + assert_frame_equal(result_skipna_none, expected_result) + + # test using skipna=any + result_skipna_any = df.asof(test_dates, skipna='any') + # compare with the expected result + expected_result = pd.DataFrame(data={'a': [None, None, None, None, "c"], + 'b': [None, None, None, None, 3], + 'c': [None, None, None, None, 3], + 'd': [None, None, None, None, 3]}, + index=test_dates) + assert_frame_equal(result_skipna_any, 
expected_result) + + result_skipna_all = df.asof(test_dates, skipna='all') + # compare with expected result + expected_result = pd.DataFrame(data={'a': [None, "a", "a", "b", "c"], + 'b': [None, 1, 1, 2, 3], + 'c': [None, 1, 1, None, 3], + 'd': [None, None, None, 2, 3]}, + index=test_dates) + assert_frame_equal(result_skipna_all, expected_result) + + # finally the most complicated case, skipna=percolumn + result_skipna_percolumn = df.asof(test_dates, skipna='percolumn') + # compare with expected result + expected_result = pd.DataFrame(data={'a': [None, "a", "a", "b", "c"], + 'b': [None, 1, 1, 2, 3], + 'c': [None, 1, 1, 1, 3], + 'd': [None, None, None, 2, 3]}, + index=test_dates) + assert_frame_equal(result_skipna_percolumn, expected_result) + + # test calling with scalar values + s1 = df.asof(test_dates[0], skipna='none') + self.assertIsNone(s1.name) + self.assertTrue(isnull(s1).all()) + + s2 = df.asof(test_dates[2], skipna='none') + self.assertIsNone(s2.name) + s2_expected = result_skipna_none.iloc[2, :] + s2_expected.name = None + assert_series_equal(s2_expected, s2) + + s3 = df.asof(test_dates[2], skipna='any') + self.assertIsNone(s3.name) + self.assertTrue(isnull(s3).all()) + + s4 = df.asof(test_dates[2], skipna='all') + self.assertIsNone(s4.name) + s4_expected = result_skipna_all.iloc[2, :] + s4_expected.name = None + assert_series_equal(s4_expected, s4) + + s5 = df.asof(test_dates[2], skipna='percolumn') + self.assertIsNone(s5.name) + s5_expected = df.apply(lambda s: s.asof(test_dates[2])) + assert_series_equal(s5_expected, s5) + def test_set_index(self): idx = Index(np.arange(len(self.mixed_frame)))