ENH: Initial pass at implementing DataFrame.asof, GH 2941

bwillers · bwillers · commit 5c925454814a · 2015-06-11T23:25:16.000-04:00
Implements DataFrame.asof with a skipna argument that selects how rows
containing missing values are skipped. The default ('percolumn') is equivalent to
	df.apply(lambda s: s.asof(where))
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -34,7 +34,7 @@
 from pandas.core.indexing import (maybe_droplevels,
                                   convert_to_index_sliceable,
                                   check_bool_indexer)
-from pandas.core.internals import (BlockManager,
+from pandas.core.internals import (BlockManager, make_block,
                                    create_block_manager_from_arrays,
                                    create_block_manager_from_blocks)
 from pandas.core.series import Series
@@ -2737,6 +2737,69 @@ def _maybe_casted_values(index, labels=None):
     #----------------------------------------------------------------------
     # Reindex-based selection methods
 
+    def asof(self, where, skipna='percolumn'):
+        """
+        Return the last good (non-null) value of each column as of the
+        requested dates. The meaning of 'good' is controlled by skipna.
+
+        If there is no good value, NaN is returned.
+
+        Parameters
+        ----------
+        where : date, date string, or sequence of dates
+        skipna :  {'any', 'all', 'none', 'percolumn'}, default 'percolumn'
+            * any: Ignore/skip rows where any of the columns are null.
+            * all: Ignore/skip rows where all of the columns are null.
+            * none: Don't ignore/skip any rows.
+            * percolumn:  Ignore/skip null rows for each column separately.
+                          Equivalent to df.apply(lambda s: s.asof(where)).
+
+        Notes
+        -----
+        Dates are assumed to be sorted. Any other skipna raises ValueError.
+
+        Returns
+        -------
+        Series if where is a date, DataFrame if where is a sequence of dates.
+        """
+        if isinstance(where, compat.string_types):
+            where = datetools.to_datetime(where)
+
+        if skipna == 'percolumn':
+            return self.apply(lambda s: s.asof(where))
+        elif skipna == 'none':
+            row_mask = np.ones((self.shape[0],), dtype=np.bool)
+        elif skipna == 'any':
+            row_mask = ~(self.isnull().any(axis=1).values)
+        elif skipna == 'all':
+            row_mask = ~(self.isnull().all(axis=1).values)
+        else:
+            raise ValueError("skipna must be one of percolumn, none, any, all.")
+
+        if not hasattr(where, '__iter__'):
+            loc = self.index.asof_locs(Index([where]), row_mask)[0]
+            if loc == -1:  # -1 is treated as "no usable row": answer is all-NaN
+                return Series(index=self.columns, data=np.nan)
+
+            s = self.iloc[loc, :].copy()
+            s.name = None  # the scalar result is returned unnamed
+            return s
+
+        locs = self.index.asof_locs(where, row_mask)
+
+        new_blocks = []
+        for block in self._data.blocks:
+            new_values = com.take_2d_multi(block.values, [None, locs])
+            # TODO: could make_block_same_class be used here? Unclear how that
+            # interacts with casting int to float once missing values appear:
+            #b = block.make_block_same_class(new_values, block.mgr_locs)
+            new_block = make_block(new_values, block.mgr_locs)
+            new_blocks.append(new_block)
+        new_mgr = create_block_manager_from_blocks(new_blocks,
+                                                   [self._data.axes[0], where])
+        new_df = self._constructor(new_mgr)
+        return new_df
+
     def dropna(self, axis=0, how='any', thresh=None, subset=None,
                inplace=False):
         """
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -2270,6 +2270,83 @@ def test_get_axis(self):
         assertRaisesRegexp(ValueError, 'No axis.*None', f._get_axis_name, None)
         assertRaisesRegexp(ValueError, 'No axis named', f._get_axis_number, None)
 
+    def test_asof(self):
+        dates = date_range('2014/01/02', periods=4, freq='3D')
+        df = pd.DataFrame(data={'a': ["a", None, "b", "c"],
+                                'b': [1, None, 2, 3],
+                                'c': [1, None, None, 3],
+                                'd': [None, None, 2, 3]},
+                          index=dates)
+
+        test_dates = date_range('2014/01/01', periods=5, freq='3D')
+
+        # skipna='none': no rows are skipped, the simplest case
+        result_skipna_none = df.asof(test_dates, skipna='none')
+        # the result must be indexed by the requested dates
+        self.assertTrue((result_skipna_none.index == test_dates).all())
+        # nulls in the selected source rows are carried through unchanged
+        expected_result = pd.DataFrame(data={'a': [None, "a", None, "b", "c"],
+                                             'b': [None, 1, None, 2, 3],
+                                             'c': [None, 1, None, None, 3],
+                                             'd': [None, None, None, 2, 3]},
+                                       index=test_dates)
+        assert_frame_equal(result_skipna_none, expected_result)
+
+        # skipna='any': rows containing at least one null are skipped
+        result_skipna_any = df.asof(test_dates, skipna='any')
+        # only the last row of df is fully non-null, so only it can be picked
+        expected_result = pd.DataFrame(data={'a': [None, None, None, None, "c"],
+                                             'b': [None, None, None, None, 3],
+                                             'c': [None, None, None, None, 3],
+                                             'd': [None, None, None, None, 3]},
+                                       index=test_dates)
+        assert_frame_equal(result_skipna_any, expected_result)
+
+        result_skipna_all = df.asof(test_dates, skipna='all')
+        # skipna='all': only the entirely-null second row of df is skipped
+        expected_result = pd.DataFrame(data={'a': [None, "a", "a", "b", "c"],
+                                             'b': [None, 1, 1, 2, 3],
+                                             'c': [None, 1, 1, None, 3],
+                                             'd': [None, None, None, 2, 3]},
+                                       index=test_dates)
+        assert_frame_equal(result_skipna_all, expected_result)
+
+        # finally the most complicated case, skipna='percolumn'
+        result_skipna_percolumn = df.asof(test_dates, skipna='percolumn')
+        # each column independently falls back to its own last non-null value
+        expected_result = pd.DataFrame(data={'a': [None, "a", "a", "b", "c"],
+                                             'b': [None, 1, 1, 2, 3],
+                                             'c': [None, 1, 1, 1, 3],
+                                             'd': [None, None, None, 2, 3]},
+                                       index=test_dates)
+        assert_frame_equal(result_skipna_percolumn, expected_result)
+
+        # a scalar `where` should give an unnamed Series for every skipna mode
+        s1 = df.asof(test_dates[0], skipna='none')
+        self.assertIsNone(s1.name)
+        self.assertTrue(isnull(s1).all())
+
+        s2 = df.asof(test_dates[2], skipna='none')
+        self.assertIsNone(s2.name)
+        s2_expected = result_skipna_none.iloc[2, :]
+        s2_expected.name = None
+        assert_series_equal(s2_expected, s2)
+
+        s3 = df.asof(test_dates[2], skipna='any')
+        self.assertIsNone(s3.name)
+        self.assertTrue(isnull(s3).all())
+
+        s4 = df.asof(test_dates[2], skipna='all')
+        self.assertIsNone(s4.name)
+        s4_expected = result_skipna_all.iloc[2, :]
+        s4_expected.name = None
+        assert_series_equal(s4_expected, s4)
+
+        s5 = df.asof(test_dates[2], skipna='percolumn')
+        self.assertIsNone(s5.name)
+        s5_expected = df.apply(lambda s: s.asof(test_dates[2]))
+        assert_series_equal(s5_expected, s5)
+
     def test_set_index(self):
         idx = Index(np.arange(len(self.mixed_frame)))