ENH: Initial pass at implementing DataFrame.asof, GH 2941

bwillers · bwillers · commit b9c96d1d6513 · 2015-06-03T22:14:25.000-04:00
This can almost certainly be made faster; I am still digging into the
internals to understand the various underlying indexers.
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2737,6 +2737,64 @@ def _maybe_casted_values(index, labels=None):
     #----------------------------------------------------------------------
     # Reindex-based selection methods
 
+    def asof(self, where, skipna='percolumn'):
+        """
+        Return last good (non-null) value for each column of DataFrame for the
+        requested dates. What counts as a 'good' value is controlled by skipna.
+        If there is no good value, NaN is returned.
+
+        Parameters
+        ----------
+        where : date or sequence of dates
+            A string is first parsed with datetools.to_datetime.
+        skipna : {'any', 'all', 'none', 'percolumn'}, default 'percolumn'
+            * any: Ignore/skip rows where any of the columns are null.
+            * all: Ignore/skip rows where all of the columns are null.
+            * none: Don't ignore/skip any rows.
+            * percolumn: Ignore/skip null rows for each column separately.
+                         Equivalent to df.apply(lambda s: s.asof(where)).
+            Any other value raises ValueError.
+
+        Notes
+        -----
+        Dates are assumed to be sorted
+
+        Returns
+        -------
+        Series if where is a date, DataFrame if where is a sequence of dates.
+        """
+        if isinstance(where, compat.string_types):
+            where = datetools.to_datetime(where)
+
+        if skipna == 'percolumn':
+            return self.apply(lambda s: s.asof(where))
+        elif skipna == 'none':
+            row_mask = np.ones((self.shape[0],), dtype=bool)
+        elif skipna == 'any':
+            row_mask = ~(self.isnull().any(axis=1).values)
+        elif skipna == 'all':
+            row_mask = ~(self.isnull().all(axis=1).values)
+        else:
+            raise ValueError("skipna must be one of percolumn, none, any, all.")
+
+        if not hasattr(where, '__iter__'):
+            loc = self.index.asof_locs(Index([where]), row_mask)[0]
+            if loc == -1:
+                return Series(index=self.columns, data=np.nan)
+            s = self.iloc[loc, :].copy()
+            s.name = None
+            return s
+
+        # Hand asof_locs an Index, as the scalar branch does, even for lists.
+        if not isinstance(where, Index):
+            where = Index(where)
+        locs = self.index.asof_locs(where, row_mask)
+        # take() wraps -1 to the last row; blank those rows (slow, OK for now).
+        new_df = self.take(locs).copy()
+        new_df.iloc[np.nonzero(locs == -1)[0], :] = None
+        new_df.index = where
+        return new_df
+
     def dropna(self, axis=0, how='any', thresh=None, subset=None,
                inplace=False):
         """
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -2266,6 +2266,78 @@ def test_get_axis(self):
         assertRaisesRegexp(ValueError, 'No axis.*None', f._get_axis_name, None)
         assertRaisesRegexp(ValueError, 'No axis named', f._get_axis_number, None)
 
+    def test_asof(self):
+        dates = date_range('2014/01/02', periods=4, freq='3D')
+        test_dates = date_range('2014/01/01', periods=5, freq='3D')
+        df = pd.DataFrame(data={'a': ["a", None, "b", "c"],
+                                'b': [1, None, 2, 3],
+                                'c': [1, None, None, 3],
+                                'd': [None, None, 2, 3]},
+                          index=dates)
+
+        # skipna='none' is the simplest case: no rows are skipped
+        result_skipna_none = df.asof(test_dates, skipna='none')
+        # the result must be indexed by the requested dates
+        self.assertTrue((result_skipna_none.index == test_dates).all())
+        expected_result = pd.DataFrame(data={'a': [None, "a", None, "b", "c"],
+                                             'b': [None, 1, None, 2, 3],
+                                             'c': [None, 1, None, None, 3],
+                                             'd': [None, None, None, 2, 3]},
+                                       index=test_dates)
+        assert_frame_equal(result_skipna_none, expected_result)
+
+        # skipna='any': rows with a null in any column are skipped
+        result_skipna_any = df.asof(test_dates, skipna='any')
+        expected_result = pd.DataFrame(data={'a': [None, None, None, None, "c"],
+                                             'b': [None, None, None, None, 3],
+                                             'c': [None, None, None, None, 3],
+                                             'd': [None, None, None, None, 3]},
+                                       index=test_dates)
+        assert_frame_equal(result_skipna_any, expected_result)
+
+        # skipna='all': only rows that are null in every column are skipped
+        result_skipna_all = df.asof(test_dates, skipna='all')
+        expected_result = pd.DataFrame(data={'a': [None, "a", "a", "b", "c"],
+                                             'b': [None, 1, 1, 2, 3],
+                                             'c': [None, 1, 1, None, 3],
+                                             'd': [None, None, None, 2, 3]},
+                                       index=test_dates)
+        assert_frame_equal(result_skipna_all, expected_result)
+
+        # finally the most complicated case, skipna=percolumn
+        result_skipna_percolumn = df.asof(test_dates, skipna='percolumn')
+        expected_result = pd.DataFrame(data={'a': [None, "a", "a", "b", "c"],
+                                             'b': [None, 1, 1, 2, 3],
+                                             'c': [None, 1, 1, 1, 3],
+                                             'd': [None, None, None, 2, 3]},
+                                       index=test_dates)
+        assert_frame_equal(result_skipna_percolumn, expected_result)
+
+        # scalar dates return an unnamed Series matching the frame result row
+        s1 = df.asof(test_dates[0], skipna='none')
+        self.assertIsNone(s1.name)
+        self.assertTrue(isnull(s1).all())
+
+        s2 = df.asof(test_dates[2], skipna='none')
+        self.assertIsNone(s2.name)
+        assert_series_equal(result_skipna_none.iloc[2,:], s2)
+
+        s3 = df.asof(test_dates[2], skipna='any')
+        self.assertIsNone(s3.name)
+        self.assertTrue(isnull(s3).all())
+
+        s4 = df.asof(test_dates[2], skipna='all')
+        self.assertIsNone(s4.name)
+        assert_series_equal(result_skipna_all.iloc[2,:], s4)
+
+        s5 = df.asof(test_dates[2], skipna='percolumn')
+        self.assertIsNone(s5.name)
+        assert_series_equal(result_skipna_percolumn.iloc[2,:], s5)
+
+        # an unrecognised skipna mode must be rejected
+        assertRaisesRegexp(ValueError, 'skipna must be one of',
+                           df.asof, test_dates, skipna='bogus')
+
     def test_set_index(self):
         idx = Index(np.arange(len(self.mixed_frame)))