diff --git a/ci/lint.sh b/ci/lint.sh index 3adfa8d1e3d33..61d74ae28377e 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -20,7 +20,7 @@ if [ "$LINT" ]; then echo "Linting *.py DONE" echo "Linting *.pyx" - for path in 'window.pyx' + for path in 'window.pyx' "src/join.pyx" do echo "linting -> pandas/$path" flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126 diff --git a/pandas/algos.pyx b/pandas/algos.pyx index cccc5377d0dec..44288ab9621f1 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1340,10 +1340,7 @@ cdef inline float64_t _median_linear(float64_t* a, int n): return result -include "join.pyx" - # generated from template include "algos_common_helper.pxi" include "algos_groupby_helper.pxi" -include "algos_join_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 32bcb0bcc732f..de7780d25b1e5 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -5,6 +5,7 @@ import numpy as np import pandas.tslib as tslib import pandas.lib as lib +import pandas._join as _join import pandas.algos as _algos import pandas.index as _index from pandas.lib import Timestamp, Timedelta, is_datetime_array @@ -110,10 +111,10 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): # Cython methods _groupby = _algos.groupby_object _arrmap = _algos.arrmap_object - _left_indexer_unique = _algos.left_join_indexer_unique_object - _left_indexer = _algos.left_join_indexer_object - _inner_indexer = _algos.inner_join_indexer_object - _outer_indexer = _algos.outer_join_indexer_object + _left_indexer_unique = _join.left_join_indexer_unique_object + _left_indexer = _join.left_join_indexer_object + _inner_indexer = _join.inner_join_indexer_object + _outer_indexer = _join.outer_join_indexer_object _box_scalars = False _typ = 'index' diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 86d22e141f781..82a6ec0b28ac9 100644 --- a/pandas/indexes/numeric.py 
+++ b/pandas/indexes/numeric.py @@ -1,5 +1,6 @@ import numpy as np import pandas.lib as lib +import pandas._join as _join import pandas.algos as _algos import pandas.index as _index @@ -114,10 +115,10 @@ class Int64Index(NumericIndex): _typ = 'int64index' _groupby = _algos.groupby_int64 _arrmap = _algos.arrmap_int64 - _left_indexer_unique = _algos.left_join_indexer_unique_int64 - _left_indexer = _algos.left_join_indexer_int64 - _inner_indexer = _algos.inner_join_indexer_int64 - _outer_indexer = _algos.outer_join_indexer_int64 + _left_indexer_unique = _join.left_join_indexer_unique_int64 + _left_indexer = _join.left_join_indexer_int64 + _inner_indexer = _join.inner_join_indexer_int64 + _outer_indexer = _join.outer_join_indexer_int64 _can_hold_na = False @@ -211,10 +212,10 @@ class Float64Index(NumericIndex): _engine_type = _index.Float64Engine _groupby = _algos.groupby_float64 _arrmap = _algos.arrmap_float64 - _left_indexer_unique = _algos.left_join_indexer_unique_float64 - _left_indexer = _algos.left_join_indexer_float64 - _inner_indexer = _algos.inner_join_indexer_float64 - _outer_indexer = _algos.outer_join_indexer_float64 + _left_indexer_unique = _join.left_join_indexer_unique_float64 + _left_indexer = _join.left_join_indexer_float64 + _inner_indexer = _join.inner_join_indexer_float64 + _outer_indexer = _join.outer_join_indexer_float64 _default_dtype = np.float64 diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index fbbef8a31071f..f3c7577ef528a 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -1,3 +1,40 @@ +# cython: profile=False + +from numpy cimport * +cimport numpy as np +import numpy as np + +cimport cython + +import_array() + +cimport util + +from numpy cimport NPY_INT8 as NPY_int8 +from numpy cimport NPY_INT16 as NPY_int16 +from numpy cimport NPY_INT32 as NPY_int32 +from numpy cimport NPY_INT64 as NPY_int64 +from numpy cimport NPY_FLOAT16 as NPY_float16 +from numpy cimport NPY_FLOAT32 as NPY_float32 +from numpy cimport NPY_FLOAT64 as 
NPY_float64 + +from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, + uint32_t, uint64_t, float16_t, float32_t, float64_t) + +int8 = np.dtype(np.int8) +int16 = np.dtype(np.int16) +int32 = np.dtype(np.int32) +int64 = np.dtype(np.int64) +float16 = np.dtype(np.float16) +float32 = np.dtype(np.float32) +float64 = np.dtype(np.float64) + +cdef double NaN = np.NaN +cdef double nan = NaN + +from pandas.algos import groupsort_indexer + + def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups): cdef: @@ -48,6 +85,7 @@ def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, return (_get_result_indexer(left_sorter, left_indexer), _get_result_indexer(right_sorter, right_indexer)) + def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups, sort=True): cdef: @@ -117,14 +155,13 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, rev, _ = groupsort_indexer(left_indexer, len(left)) if rev.dtype != np.int_: - rev = rev.astype(np.int_) + rev = rev.astype(np.int_) right_indexer = right_indexer.take(rev) left_indexer = left_indexer.take(rev) return left_indexer, right_indexer - def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups, # ignored bint allow_exact_matches=1, @@ -140,7 +177,8 @@ def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, int64_t tolerance_ # if we are using tolerance, set our objects - if left_values is not None and right_values is not None and tolerance is not None: + if (left_values is not None and right_values is not None and + tolerance is not None): has_tolerance = 1 left_values_ = left_values right_values_ = right_values @@ -160,10 +198,12 @@ def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right, # find last position in right whose value is less than left's value if allow_exact_matches: - while right_pos < right_size and right[right_pos] <= left[left_pos]: + while (right_pos < 
right_size and + right[right_pos] <= left[left_pos]): right_pos += 1 else: - while right_pos < right_size and right[right_pos] < left[left_pos]: + while (right_pos < right_size and + right[right_pos] < left[left_pos]): right_pos += 1 right_pos -= 1 @@ -243,7 +283,6 @@ def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, _get_result_indexer(right_sorter, right_indexer)) - def _get_result_indexer(sorter, indexer): if indexer.dtype != np.int_: indexer = indexer.astype(np.int_) @@ -258,7 +297,6 @@ def _get_result_indexer(sorter, indexer): return res - def ffill_indexer(ndarray[int64_t] indexer): cdef: Py_ssize_t i, n = len(indexer) @@ -301,3 +339,6 @@ def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids, last_obs[gid] = val return result + + +include "join_helper.pxi" diff --git a/pandas/src/algos_join_helper.pxi b/pandas/src/join_helper.pxi similarity index 100% rename from pandas/src/algos_join_helper.pxi rename to pandas/src/join_helper.pxi diff --git a/pandas/src/algos_join_helper.pxi.in b/pandas/src/join_helper.pxi.in similarity index 100% rename from pandas/src/algos_join_helper.pxi.in rename to pandas/src/join_helper.pxi.in diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 490f4fe81ecbd..66fd1861f08f9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -12,6 +12,7 @@ import pandas.algos as _algos from pandas.compat import lrange import pandas.core.algorithms as algos +import pandas._join as _join import pandas.util.testing as tm import pandas.hashtable as hashtable from pandas.compat.numpy import np_array_datetime64_compat @@ -303,11 +304,11 @@ class TestIndexer(tm.TestCase): _multiprocess_can_split_ = True def test_outer_join_indexer(self): - typemap = [('int32', algos.algos.outer_join_indexer_int32), - ('int64', algos.algos.outer_join_indexer_int64), - ('float32', algos.algos.outer_join_indexer_float32), - ('float64', algos.algos.outer_join_indexer_float64), - ('object', 
algos.algos.outer_join_indexer_object)] + typemap = [('int32', _join.outer_join_indexer_int32), + ('int64', _join.outer_join_indexer_int64), + ('float32', _join.outer_join_indexer_float32), + ('float64', _join.outer_join_indexer_float64), + ('object', _join.outer_join_indexer_object)] for dtype, indexer in typemap: left = np.arange(3, dtype=dtype) @@ -1070,7 +1071,7 @@ def test_left_join_indexer_unique(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([2, 2, 3, 4, 4], dtype=np.int64) - result = _algos.left_join_indexer_unique_int64(b, a) + result = _join.left_join_indexer_unique_int64(b, a) expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) assert (np.array_equal(result, expected)) @@ -1086,7 +1087,7 @@ def test_left_outer_join_bug(): right = np.array([3, 1], dtype=np.int64) max_groups = 4 - lidx, ridx = _algos.left_outer_join(left, right, max_groups, sort=False) + lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False) exp_lidx = np.arange(len(left)) exp_ridx = -np.ones(len(left)) @@ -1101,7 +1102,7 @@ def test_inner_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = _algos.inner_join_indexer_int64(a, b) + index, ares, bres = _join.inner_join_indexer_int64(a, b) index_exp = np.array([3, 5], dtype=np.int64) assert_almost_equal(index, index_exp) @@ -1114,7 +1115,7 @@ def test_inner_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _algos.inner_join_indexer_int64(a, b) + index, ares, bres = _join.inner_join_indexer_int64(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -1124,7 +1125,7 @@ def test_outer_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = 
_algos.outer_join_indexer_int64(a, b) + index, ares, bres = _join.outer_join_indexer_int64(a, b) index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) assert_almost_equal(index, index_exp) @@ -1137,7 +1138,7 @@ def test_outer_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _algos.outer_join_indexer_int64(a, b) + index, ares, bres = _join.outer_join_indexer_int64(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -1147,7 +1148,7 @@ def test_left_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = _algos.left_join_indexer_int64(a, b) + index, ares, bres = _join.left_join_indexer_int64(a, b) assert_almost_equal(index, a) @@ -1159,7 +1160,7 @@ def test_left_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _algos.left_join_indexer_int64(a, b) + index, ares, bres = _join.left_join_indexer_int64(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -1169,7 +1170,7 @@ def test_left_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = _algos.left_join_indexer_int64(idx2.values, idx.values) + res, lidx, ridx = _join.left_join_indexer_int64(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) assert_almost_equal(res, exp_res) @@ -1185,7 +1186,7 @@ def test_outer_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = _algos.outer_join_indexer_int64(idx2.values, idx.values) + res, lidx, ridx = _join.outer_join_indexer_int64(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 
5, 7, 9], dtype=np.int64) assert_almost_equal(res, exp_res) @@ -1201,7 +1202,7 @@ def test_inner_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = _algos.inner_join_indexer_int64(idx2.values, idx.values) + res, lidx, ridx = _join.inner_join_indexer_int64(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5], dtype=np.int64) assert_almost_equal(res, exp_res) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index cc0972937b8a2..c037f02f20609 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -208,8 +208,9 @@ def test_float_panel(self): @slow def test_panel4d(self): - self.run_panel(tm.makePanel4D(), np.random.randn() + 0.5, - assert_func=assert_panel4d_equal, binary_comp=3) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.run_panel(tm.makePanel4D(), np.random.randn() + 0.5, + assert_func=assert_panel4d_equal, binary_comp=3) def test_mixed_arithmetic_frame(self): # TODO: FIGURE OUT HOW TO GET IT TO WORK... 
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 9f8e27c4d8176..571df70e05c6d 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -39,7 +39,7 @@ import pandas.core.common as com import pandas.types.concat as _concat -import pandas.algos as _algos +import pandas._join as _join import pandas.hashtable as _hash @@ -918,8 +918,8 @@ def get_result(self): rdata.items, rsuf) if self.fill_method == 'ffill': - left_join_indexer = _algos.ffill_indexer(left_indexer) - right_join_indexer = _algos.ffill_indexer(right_indexer) + left_join_indexer = _join.ffill_indexer(left_indexer) + right_join_indexer = _join.ffill_indexer(right_indexer) else: left_join_indexer = left_indexer right_join_indexer = right_indexer @@ -1094,13 +1094,13 @@ def _get_multiindex_indexer(join_keys, index, sort): # factorize keys to a dense i8 space lkey, rkey, count = fkeys(lkey, rkey) - return _algos.left_outer_join(lkey, rkey, count, sort=sort) + return _join.left_outer_join(lkey, rkey, count, sort=sort) def _get_single_indexer(join_key, index, sort=False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) - left_indexer, right_indexer = _algos.left_outer_join( + left_indexer, right_indexer = _join.left_outer_join( _ensure_int64(left_key), _ensure_int64(right_key), count, sort=sort) @@ -1135,15 +1135,15 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): def _right_outer_join(x, y, max_groups): - right_indexer, left_indexer = _algos.left_outer_join(y, x, max_groups) + right_indexer, left_indexer = _join.left_outer_join(y, x, max_groups) return left_indexer, right_indexer _join_functions = { - 'inner': _algos.inner_join, - 'left': _algos.left_outer_join, + 'inner': _join.inner_join, + 'left': _join.left_outer_join, 'right': _right_outer_join, - 'outer': _algos.full_outer_join, - 'asof': _algos.left_outer_asof_join, + 'outer': _join.full_outer_join, + 'asof': _join.left_outer_asof_join, } diff --git a/pandas/tools/tests/test_join.py 
b/pandas/tools/tests/test_join.py index cb84c1f06653b..f33d5f16cd439 100644 --- a/pandas/tools/tests/test_join.py +++ b/pandas/tools/tests/test_join.py @@ -12,7 +12,7 @@ from pandas.util.testing import assert_frame_equal from pandas import DataFrame, MultiIndex, Series -import pandas.algos as algos +import pandas._join as _join import pandas.util.testing as tm from pandas.tools.tests.test_merge import get_test_data, N, NGROUPS @@ -51,7 +51,7 @@ def test_cython_left_outer_join(self): right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 - ls, rs = algos.left_outer_join(left, right, max_group) + ls, rs = _join.left_outer_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') @@ -75,7 +75,7 @@ def test_cython_right_outer_join(self): right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 - rs, ls = algos.left_outer_join(right, left, max_group) + rs, ls = _join.left_outer_join(right, left, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') @@ -101,7 +101,7 @@ def test_cython_inner_join(self): right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) max_group = 5 - ls, rs = algos.inner_join(left, right, max_group) + ls, rs = _join.inner_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 4a7ba0286aab1..aa50fbe316b94 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -46,6 +46,7 @@ import pandas.lib as lib import pandas.tslib as tslib import pandas._period as period +import pandas._join as _join import pandas.algos as _algos import pandas.index as _index @@ -204,11 +205,11 @@ def _join_i8_wrapper(joinf, **kwargs): return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='M8[ns]', **kwargs) - _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) - _outer_indexer = 
_join_i8_wrapper(_algos.outer_join_indexer_int64) - _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) + _inner_indexer = _join_i8_wrapper(_join.inner_join_indexer_int64) + _outer_indexer = _join_i8_wrapper(_join.outer_join_indexer_int64) + _left_indexer = _join_i8_wrapper(_join.left_join_indexer_int64) _left_indexer_unique = _join_i8_wrapper( - _algos.left_join_indexer_unique_int64, with_indexers=False) + _join.left_join_indexer_unique_int64, with_indexers=False) _arrmap = None __eq__ = _dt_index_cmp('__eq__') diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 8aad5bdd35f65..921f60b23d187 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -32,7 +32,7 @@ import pandas.lib as lib import pandas.tslib as tslib -import pandas.algos as _algos +import pandas._join as _join import pandas.index as _index Timedelta = tslib.Timedelta @@ -122,11 +122,11 @@ def _join_i8_wrapper(joinf, **kwargs): return DatetimeIndexOpsMixin._join_i8_wrapper( joinf, dtype='m8[ns]', **kwargs) - _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) - _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) - _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) + _inner_indexer = _join_i8_wrapper(_join.inner_join_indexer_int64) + _outer_indexer = _join_i8_wrapper(_join.outer_join_indexer_int64) + _left_indexer = _join_i8_wrapper(_join.left_join_indexer_int64) _left_indexer_unique = _join_i8_wrapper( - _algos.left_join_indexer_unique_int64, with_indexers=False) + _join.left_join_indexer_unique_int64, with_indexers=False) _arrmap = None _datetimelike_ops = ['days', 'seconds', 'microseconds', 'nanoseconds', 'freq', 'components'] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c6573934bff57..e95808ddc8225 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1040,7 +1040,7 @@ def assert_numpy_array_equal(left, right, strict_nan=False, Specify object name being compared, internally used to 
show appropriate assertion message check_same : None|'copy'|'same', default None - Ensure "left" and "right refer/do not refer to the same memory area + Ensure left and right refer/do not refer to the same memory area """ # instance validation diff --git a/setup.py b/setup.py index c985445a08155..5bf188d829d26 100755 --- a/setup.py +++ b/setup.py @@ -107,7 +107,7 @@ def is_platform_mac(): _pxipath = pjoin('pandas', 'src') _pxifiles = ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in', - 'algos_join_helper.pxi.in', 'algos_take_helper.pxi.in', + 'join_helper.pxi.in', 'algos_take_helper.pxi.in', 'hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in', 'sparse_op_helper.pxi.in'] @@ -308,6 +308,7 @@ class CheckSDist(sdist_class): 'pandas/tslib.pyx', 'pandas/index.pyx', 'pandas/algos.pyx', + 'pandas/src/join.pyx', 'pandas/window.pyx', 'pandas/parser.pyx', 'pandas/src/period.pyx', @@ -464,8 +465,9 @@ def pxd(name): 'sources': ['pandas/src/datetime/np_datetime.c', 'pandas/src/datetime/np_datetime_strings.c']}, algos={'pyxfile': 'algos', - 'pxdfiles': ['src/util'], - 'depends': [srcpath('join', suffix='.pyx')]}, + 'pxdfiles': ['src/util']}, + _join={'pyxfile': 'src/join', + 'pxdfiles': ['src/util']}, _window={'pyxfile': 'window', 'pxdfiles': ['src/skiplist', 'src/util'], 'depends': ['pandas/src/skiplist.pyx',