diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9ea7b740bae8f..a6bafc811d321 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -115,6 +115,7 @@ Other enhancements - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) - ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`) +- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`) .. _whatsnew_0200.api_breaking: diff --git a/pandas/src/joins_func_helper.pxi.in b/pandas/src/joins_func_helper.pxi.in index 33926a23f7f41..3b4d43b3ae32d 100644 --- a/pandas/src/joins_func_helper.pxi.in +++ b/pandas/src/joins_func_helper.pxi.in @@ -28,7 +28,8 @@ from hashtable cimport * {{for on_dtype in on_dtypes}} -def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, +def asof_join_backward_{{on_dtype}}_by_{{by_dtype}}( + ndarray[{{on_dtype}}] left_values, ndarray[{{on_dtype}}] right_values, ndarray[{{by_dtype}}] left_by_values, ndarray[{{by_dtype}}] right_by_values, @@ -40,6 +41,7 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, ndarray[int64_t] left_indexer, right_indexer bint has_tolerance = 0 {{on_dtype}} tolerance_ + {{on_dtype}} diff {{table_type}} hash_table {{by_dtype}} by_value @@ -62,7 +64,7 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, if right_pos < 0: right_pos = 0 - # find last position in right whose value is less than left's value + # find last position in right whose value is less than left's if allow_exact_matches: while right_pos < right_size and\ right_values[right_pos] <= left_values[left_pos]: @@ -90,6 +92,119 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, return left_indexer, right_indexer + +def 
asof_join_forward_{{on_dtype}}_by_{{by_dtype}}( + ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + ndarray[{{by_dtype}}] left_by_values, + ndarray[{{by_dtype}}] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + {{on_dtype}} tolerance_ + {{on_dtype}} diff + {{table_type}} hash_table + {{by_dtype}} by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = {{table_type}}(right_size) + + right_pos = right_size - 1 + for left_pos in range(left_size - 1, -1, -1): + # restart right_pos if it went over in a previous iteration + if right_pos == right_size: + right_pos = right_size - 1 + + # find first position in right whose value is greater than left's + if allow_exact_matches: + while right_pos >= 0 and\ + right_values[right_pos] >= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos -= 1 + else: + while right_pos >= 0 and\ + right_values[right_pos] > left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos -= 1 + right_pos += 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = right_values[found_right_pos] - left_values[left_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def 
asof_join_nearest_{{on_dtype}}_by_{{by_dtype}}( + ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + ndarray[{{by_dtype}}] left_by_values, + ndarray[{{by_dtype}}] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_size, right_size, i + ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri + {{on_dtype}} bdiff, fdiff + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + # search both forward and backward + bli, bri =\ + asof_join_backward_{{on_dtype}}_by_{{by_dtype}}(left_values, + right_values, + left_by_values, + right_by_values, + allow_exact_matches, + tolerance) + fli, fri =\ + asof_join_forward_{{on_dtype}}_by_{{by_dtype}}(left_values, + right_values, + left_by_values, + right_by_values, + allow_exact_matches, + tolerance) + + for i in range(len(bri)): + # choose timestamp from right with smaller difference + if bri[i] != -1 and fri[i] != -1: + bdiff = left_values[bli[i]] - right_values[bri[i]] + fdiff = right_values[fri[i]] - left_values[fli[i]] + right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] + else: + right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] + left_indexer[i] = bli[i] + + return left_indexer, right_indexer + {{endfor}} {{endfor}} @@ -110,7 +225,8 @@ dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', {{for on_dtype in dtypes}} -def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, +def asof_join_backward_{{on_dtype}}( + ndarray[{{on_dtype}}] left_values, ndarray[{{on_dtype}}] right_values, bint allow_exact_matches=1, tolerance=None): @@ -120,6 +236,7 @@ def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, ndarray[int64_t] left_indexer, right_indexer bint has_tolerance = 0 {{on_dtype}} tolerance_ + {{on_dtype}} diff # if we are using tolerance, set our objects if tolerance is not None: @@ -138,7 +255,7 @@ def 
asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, if right_pos < 0: right_pos = 0 - # find last position in right whose value is less than left's value + # find last position in right whose value is less than left's if allow_exact_matches: while right_pos < right_size and\ right_values[right_pos] <= left_values[left_pos]: @@ -161,5 +278,96 @@ def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, return left_indexer, right_indexer + +def asof_join_forward_{{on_dtype}}( + ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + {{on_dtype}} tolerance_ + {{on_dtype}} diff + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = right_size - 1 + for left_pos in range(left_size - 1, -1, -1): + # restart right_pos if it went over in a previous iteration + if right_pos == right_size: + right_pos = right_size - 1 + + # find first position in right whose value is greater than left's + if allow_exact_matches: + while right_pos >= 0 and\ + right_values[right_pos] >= left_values[left_pos]: + right_pos -= 1 + else: + while right_pos >= 0 and\ + right_values[right_pos] > left_values[left_pos]: + right_pos -= 1 + right_pos += 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos\ + if right_pos != right_size else -1 + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != right_size: + diff = right_values[right_pos] - left_values[left_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, 
right_indexer + + +def asof_join_nearest_{{on_dtype}}( + ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_size, right_size, i + ndarray[int64_t] left_indexer, right_indexer, bli, bri, fli, fri + {{on_dtype}} bdiff, fdiff + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + # search both forward and backward + bli, bri = asof_join_backward_{{on_dtype}}(left_values, right_values, + allow_exact_matches, tolerance) + fli, fri = asof_join_forward_{{on_dtype}}(left_values, right_values, + allow_exact_matches, tolerance) + + for i in range(len(bri)): + # choose timestamp from right with smaller difference + if bri[i] != -1 and fri[i] != -1: + bdiff = left_values[bli[i]] - right_values[bri[i]] + fdiff = right_values[fri[i]] - left_values[fli[i]] + right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] + else: + right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] + left_indexer[i] = bli[i] + + return left_indexer, right_indexer + {{endfor}} diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 4012629aa3c90..e476a5874486c 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -266,16 +266,29 @@ def merge_asof(left, right, on=None, by=None, left_by=None, right_by=None, suffixes=('_x', '_y'), tolerance=None, - allow_exact_matches=True): + allow_exact_matches=True, + direction='backward'): """Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys. - For each row in the left DataFrame, we select the last row in the right - DataFrame whose 'on' key is less than or equal to the left's key. Both - DataFrames must be sorted by the key. + Both DataFrames must be sorted by the key. - Optionally match on equivalent keys with 'by' before searching for nearest - match with 'on'. 
+ For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + 'on' key is less than or equal to the left's key. + + - A "forward" search selects the first row in the right DataFrame whose + 'on' key is greater than or equal to the left's key. + + - A "nearest" search selects the row in the right DataFrame whose 'on' + key is closest in absolute distance to the left's key. + + The default is "backward", which is compatible with versions below 0.20.0. + The direction parameter was added in version 0.20.0 and introduces + "forward" and "nearest". + + Optionally match on equivalent keys with 'by' before searching with 'on'. .. versionadded:: 0.19.0 @@ -323,9 +336,14 @@ def merge_asof(left, right, on=None, allow_exact_matches : boolean, default True - If True, allow matching the same 'on' value - (i.e. less-than-or-equal-to) + (i.e. less-than-or-equal-to / greater-than-or-equal-to) - If False, don't match the same 'on' value - (i.e., stricly less-than) + (i.e., strictly less-than / strictly greater-than) + + direction : 'backward' (default), 'forward', or 'nearest' + Whether to search for prior, subsequent, or closest matches. + + .. versionadded:: 0.20.0 Returns ------- @@ -359,17 +377,17 @@ def merge_asof(left, right, on=None, 1 5 b 3.0 2 10 c 7.0 - For this example, we can achieve a similar result thru - ``pd.merge_ordered()``, though its not nearly as performant. - - >>> (pd.merge_ordered(left, right, on='a') - ... .ffill() - ... .drop_duplicates(['left_val']) - ... ) + >>> pd.merge_asof(left, right, on='a', direction='forward') a left_val right_val 0 1 a 1.0 - 3 5 b 3.0 - 6 10 c 7.0 + 1 5 b 6.0 + 2 10 c NaN + + >>> pd.merge_asof(left, right, on='a', direction='nearest') a left_val right_val 0 1 a 1 + 1 5 b 6 + 2 10 c 7 We can use indexed DataFrames as well.
@@ -467,7 +485,8 @@ def merge_asof(left, right, on=None, by=by, left_by=left_by, right_by=right_by, suffixes=suffixes, how='asof', tolerance=tolerance, - allow_exact_matches=allow_exact_matches) + allow_exact_matches=allow_exact_matches, + direction=direction) return op.get_result() @@ -999,12 +1018,13 @@ def get_result(self): return result -def _asof_function(on_type): - return getattr(_join, 'asof_join_%s' % on_type, None) +def _asof_function(direction, on_type): + return getattr(_join, 'asof_join_%s_%s' % (direction, on_type), None) -def _asof_by_function(on_type, by_type): - return getattr(_join, 'asof_join_%s_by_%s' % (on_type, by_type), None) +def _asof_by_function(direction, on_type, by_type): + return getattr(_join, 'asof_join_%s_%s_by_%s' % + (direction, on_type, by_type), None) _type_casters = { @@ -1056,13 +1076,15 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None, axis=1, suffixes=('_x', '_y'), copy=True, fill_method=None, how='asof', tolerance=None, - allow_exact_matches=True): + allow_exact_matches=True, + direction='backward'): self.by = by self.left_by = left_by self.right_by = right_by self.tolerance = tolerance self.allow_exact_matches = allow_exact_matches + self.direction = direction _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, right_on=right_on, left_index=left_index, @@ -1108,6 +1130,10 @@ def _validate_specification(self): self.left_on = self.left_by + list(self.left_on) self.right_on = self.right_by + list(self.right_on) + # check 'direction' is valid + if self.direction not in ['backward', 'forward', 'nearest']: + raise MergeError('direction invalid: ' + self.direction) + @property def _asof_key(self): """ This is our asof key, the 'on' """ @@ -1204,7 +1230,7 @@ def flip(xs): # choose appropriate function by type on_type = _get_cython_type(left_values.dtype) - func = _asof_by_function(on_type, by_type) + func = _asof_by_function(self.direction, on_type, by_type) return func(left_values, 
right_values, left_by_values, @@ -1214,7 +1240,7 @@ def flip(xs): else: # choose appropriate function by type on_type = _get_cython_type(left_values.dtype) - func = _asof_function(on_type) + func = _asof_function(self.direction, on_type) return func(left_values, right_values, self.allow_exact_matches, diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py index bbbf1a3bdfff9..6d0b608cbd4e2 100644 --- a/pandas/tools/tests/test_merge_asof.py +++ b/pandas/tools/tests/test_merge_asof.py @@ -42,7 +42,12 @@ def test_examples1(self): right = pd.DataFrame({'a': [1, 2, 3, 6, 7], 'right_val': [1, 2, 3, 6, 7]}) - pd.merge_asof(left, right, on='a') + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [1, 3, 7]}) + + result = pd.merge_asof(left, right, on='a') + assert_frame_equal(result, expected) def test_examples2(self): """ doc-string examples """ @@ -94,6 +99,38 @@ def test_examples2(self): tolerance=pd.Timedelta('10ms'), allow_exact_matches=False) + def test_examples3(self): + """ doc-string examples """ + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 6, 7], + 'right_val': [1, 2, 3, 6, 7]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [1, 6, np.nan]}) + + result = pd.merge_asof(left, right, on='a', direction='forward') + assert_frame_equal(result, expected) + + def test_examples4(self): + """ doc-string examples """ + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 6, 7], + 'right_val': [1, 2, 3, 6, 7]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [1, 6, 7]}) + + result = pd.merge_asof(left, right, on='a', direction='nearest') + assert_frame_equal(result, expected) + def test_basic(self): expected = self.asof @@ -495,6 +532,38 @@ def test_tolerance(self): 
expected = self.tolerance assert_frame_equal(result, expected) + def test_tolerance_forward(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 7, 11], + 'right_val': [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [1, np.nan, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='forward', + tolerance=1) + assert_frame_equal(result, expected) + + def test_tolerance_nearest(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 7, 11], + 'right_val': [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [1, np.nan, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='nearest', + tolerance=1) + assert_frame_equal(result, expected) + def test_tolerance_tz(self): # GH 14844 left = pd.DataFrame( @@ -527,6 +596,38 @@ def test_allow_exact_matches(self): expected = self.allow_exact_matches assert_frame_equal(result, expected) + def test_allow_exact_matches_forward(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 7, 11], + 'right_val': [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [2, 7, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='forward', + allow_exact_matches=False) + assert_frame_equal(result, expected) + + def test_allow_exact_matches_nearest(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 7, 11], + 'right_val': [1, 2, 3, 7, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [2, 3, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='nearest', + 
allow_exact_matches=False) + assert_frame_equal(result, expected) + def test_allow_exact_matches_and_tolerance(self): result = merge_asof(self.trades, self.quotes, @@ -589,6 +690,76 @@ def test_allow_exact_matches_and_tolerance3(self): 'version': [np.nan, np.nan]}) assert_frame_equal(result, expected) + def test_allow_exact_matches_and_tolerance_forward(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 3, 4, 6, 11], + 'right_val': [1, 3, 4, 6, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [np.nan, 6, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='forward', + allow_exact_matches=False, tolerance=1) + assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance_nearest(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 3, 4, 6, 11], + 'right_val': [1, 3, 4, 7, 11]}) + + expected = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c'], + 'right_val': [np.nan, 4, 11]}) + + result = pd.merge_asof(left, right, on='a', direction='nearest', + allow_exact_matches=False, tolerance=1) + assert_frame_equal(result, expected) + + def test_forward_by(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10, 12, 15], + 'b': ['X', 'X', 'Y', 'Z', 'Y'], + 'left_val': ['a', 'b', 'c', 'd', 'e']}) + right = pd.DataFrame({'a': [1, 6, 11, 15, 16], + 'b': ['X', 'Z', 'Y', 'Z', 'Y'], + 'right_val': [1, 6, 11, 15, 16]}) + + expected = pd.DataFrame({'a': [1, 5, 10, 12, 15], + 'b': ['X', 'X', 'Y', 'Z', 'Y'], + 'left_val': ['a', 'b', 'c', 'd', 'e'], + 'right_val': [1, np.nan, 11, 15, 16]}) + + result = pd.merge_asof(left, right, on='a', by='b', + direction='forward') + assert_frame_equal(result, expected) + + def test_nearest_by(self): + # GH14887 + + left = pd.DataFrame({'a': [1, 5, 10, 12, 15], + 'b': ['X', 'X', 'Z', 'Z', 'Y'], + 'left_val': 
['a', 'b', 'c', 'd', 'e']}) + right = pd.DataFrame({'a': [1, 6, 11, 15, 16], + 'b': ['X', 'Z', 'Z', 'Z', 'Y'], + 'right_val': [1, 6, 11, 15, 16]}) + + expected = pd.DataFrame({'a': [1, 5, 10, 12, 15], + 'b': ['X', 'X', 'Z', 'Z', 'Y'], + 'left_val': ['a', 'b', 'c', 'd', 'e'], + 'right_val': [1, 1, 11, 11, 16]}) + + result = pd.merge_asof(left, right, on='a', by='b', + direction='nearest') + assert_frame_equal(result, expected) + def test_by_int(self): # we specialize by type, so test that this is correct df1 = pd.DataFrame({