diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index f20eed4575e91..aff9911961b25 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -1,12 +1,10 @@ import numpy as np import pytest -from pandas._libs import groupby, lib, reduction as libreduction - -from pandas.core.dtypes.common import ensure_int64 +from pandas._libs import lib, reduction as libreduction import pandas as pd -from pandas import Series, isna +from pandas import Series import pandas._testing as tm @@ -103,36 +101,5 @@ def test_generate_bins(binner, closed, expected): tm.assert_numpy_array_equal(result, expected) -def test_group_ohlc(): - def _check(dtype): - obj = np.array(np.random.randn(20), dtype=dtype) - - bins = np.array([6, 12, 20]) - out = np.zeros((3, 4), dtype) - counts = np.zeros(len(out), dtype=np.int64) - labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) - - func = getattr(groupby, f"group_ohlc_{dtype}") - func(out, counts, obj[:, None], labels) - - def _ohlc(group): - if isna(group).all(): - return np.repeat(np.nan, 4) - return [group[0], group.max(), group.min(), group[-1]] - - expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])]) - - tm.assert_almost_equal(out, expected) - tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64)) - - obj[:6] = np.nan - func(out, counts, obj[:, None], labels) - expected[0] = np.nan - tm.assert_almost_equal(out, expected) - - _check("float32") - _check("float64") - - class TestMoments: pass diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py new file mode 100644 index 0000000000000..28b740355f351 --- /dev/null +++ b/pandas/tests/groupby/test_libgroupby.py @@ -0,0 +1,237 @@ +import numpy as np + +from pandas._libs import groupby as libgroupby +from pandas._libs.groupby import ( + group_cumprod_float64, + group_cumsum, + group_var_float32, + group_var_float64, +) + +from pandas.core.dtypes.common import ensure_int64 + +from pandas import isna +import pandas._testing as tm + + +class GroupVarTestMixin: + def test_group_var_generic_1d(self): + prng = np.random.RandomState(1234) + + out = (np.nan * np.ones((5, 1))).astype(self.dtype) + counts = np.zeros(5, dtype="int64") + values = 10 * prng.rand(15, 1).astype(self.dtype) + labels = np.tile(np.arange(5), (3,)).astype("int64") + + expected_out = ( + np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2 + )[:, np.newaxis] + expected_counts = counts + 3 + + self.algo(out, counts, values, labels) + assert np.allclose(out, expected_out, self.rtol) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_generic_1d_flat_labels(self): + prng = np.random.RandomState(1234) + + out = (np.nan * np.ones((1, 1))).astype(self.dtype) + counts = np.zeros(1, dtype="int64") + values = 10 * prng.rand(5, 1).astype(self.dtype) + labels = np.zeros(5, dtype="int64") + + expected_out = np.array([[values.std(ddof=1) ** 2]]) + expected_counts = counts + 5 + + self.algo(out, counts, values, labels) + + assert np.allclose(out, expected_out, self.rtol) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_generic_2d_all_finite(self): + prng = np.random.RandomState(1234) + + out = (np.nan * np.ones((5, 2))).astype(self.dtype) + counts = np.zeros(5, dtype="int64") + values = 10 * prng.rand(10, 2).astype(self.dtype) + labels = np.tile(np.arange(5), (2,)).astype("int64") + + expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2 + expected_counts = counts + 2 + + self.algo(out, counts, values, labels) + assert np.allclose(out, expected_out, self.rtol) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_generic_2d_some_nan(self): + prng = np.random.RandomState(1234) + + out = (np.nan * np.ones((5, 2))).astype(self.dtype) + counts = np.zeros(5, dtype="int64") + values = 10 * prng.rand(10, 2).astype(self.dtype) + values[:, 1] = np.nan + labels = np.tile(np.arange(5), (2,)).astype("int64") + + expected_out = np.vstack( + [ + values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2, + np.nan * np.ones(5), + ] + ).T.astype(self.dtype) + expected_counts = counts + 2 + + self.algo(out, counts, values, labels) + tm.assert_almost_equal(out, expected_out, rtol=0.5e-06) + tm.assert_numpy_array_equal(counts, expected_counts) + + def test_group_var_constant(self): + # Regression test from GH 10448. + + out = np.array([[np.nan]], dtype=self.dtype) + counts = np.array([0], dtype="int64") + values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype) + labels = np.zeros(3, dtype="int64") + + self.algo(out, counts, values, labels) + + assert counts[0] == 3 + assert out[0, 0] >= 0 + tm.assert_almost_equal(out[0, 0], 0.0) + + +class TestGroupVarFloat64(GroupVarTestMixin): + __test__ = True + + algo = staticmethod(group_var_float64) + dtype = np.float64 + rtol = 1e-5 + + def test_group_var_large_inputs(self): + prng = np.random.RandomState(1234) + + out = np.array([[np.nan]], dtype=self.dtype) + counts = np.array([0], dtype="int64") + values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype) + values.shape = (10 ** 6, 1) + labels = np.zeros(10 ** 6, dtype="int64") + + self.algo(out, counts, values, labels) + + assert counts[0] == 10 ** 6 + tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3) + + +class TestGroupVarFloat32(GroupVarTestMixin): + __test__ = True + + algo = staticmethod(group_var_float32) + dtype = np.float32 + rtol = 1e-2 + + +def test_group_ohlc(): + def _check(dtype): + obj = np.array(np.random.randn(20), dtype=dtype) + + bins = np.array([6, 12, 20]) + out = np.zeros((3, 4), dtype) + counts = np.zeros(len(out), dtype=np.int64) + labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) + + func = getattr(libgroupby, f"group_ohlc_{dtype}") + func(out, counts, obj[:, None], labels) + + def _ohlc(group): + if isna(group).all(): + return np.repeat(np.nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])]) + + tm.assert_almost_equal(out, expected) + tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64)) + + obj[:6] = np.nan + func(out, counts, obj[:, None], labels) + expected[0] = np.nan + tm.assert_almost_equal(out, expected) + + _check("float32") + _check("float64") + + +def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): + """ + Check a group transform that executes a cumulative function. + + Parameters + ---------- + pd_op : callable + The pandas cumulative function. + np_op : callable + The analogous one in NumPy. + dtype : type + The specified dtype of the data. + """ + is_datetimelike = False + + data = np.array([[1], [2], [3], [4]], dtype=dtype) + ans = np.zeros_like(data) + + labels = np.array([0, 0, 0, 0], dtype=np.int64) + ngroups = 1 + pd_op(ans, data, labels, ngroups, is_datetimelike) + + tm.assert_numpy_array_equal(np_op(data), ans[:, 0], check_dtype=False) + + +def test_cython_group_transform_cumsum(any_real_dtype): + # see gh-4095 + dtype = np.dtype(any_real_dtype).type + pd_op, np_op = group_cumsum, np.cumsum + _check_cython_group_transform_cumulative(pd_op, np_op, dtype) + + +def test_cython_group_transform_cumprod(): + # see gh-4095 + dtype = np.float64 + pd_op, np_op = group_cumprod_float64, np.cumproduct + _check_cython_group_transform_cumulative(pd_op, np_op, dtype) + + +def test_cython_group_transform_algos(): + # see gh-4095 + is_datetimelike = False + + # with nans + labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) + ngroups = 1 + + data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") + actual = np.zeros_like(data) + actual.fill(np.nan) + group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) + expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") + tm.assert_numpy_array_equal(actual[:, 0], expected) + + actual = np.zeros_like(data) + actual.fill(np.nan) + group_cumsum(actual, data, labels, ngroups, is_datetimelike) + expected = np.array([1, 3, 6, np.nan, 10], dtype="float64") + tm.assert_numpy_array_equal(actual[:, 0], expected) + + # timedelta + is_datetimelike = True + data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None] + actual = np.zeros_like(data, dtype="int64") + group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike) + expected = np.array( + [ + np.timedelta64(1, "ns"), + np.timedelta64(2, "ns"), + np.timedelta64(3, "ns"), + np.timedelta64(4, "ns"), + np.timedelta64(5, "ns"), + ] + ) + tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 946e60d17e0bb..cd3c2771db8a4 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._libs.groupby import group_cumprod_float64, group_cumsum - from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype import pandas as pd @@ -515,83 +513,6 @@ def f(group): tm.assert_frame_equal(res, result.loc[key]) -def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): - """ - Check a group transform that executes a cumulative function. - - Parameters - ---------- - pd_op : callable - The pandas cumulative function. - np_op : callable - The analogous one in NumPy. - dtype : type - The specified dtype of the data. - """ - is_datetimelike = False - - data = np.array([[1], [2], [3], [4]], dtype=dtype) - ans = np.zeros_like(data) - - labels = np.array([0, 0, 0, 0], dtype=np.int64) - ngroups = 1 - pd_op(ans, data, labels, ngroups, is_datetimelike) - - tm.assert_numpy_array_equal(np_op(data), ans[:, 0], check_dtype=False) - - -def test_cython_group_transform_cumsum(any_real_dtype): - # see gh-4095 - dtype = np.dtype(any_real_dtype).type - pd_op, np_op = group_cumsum, np.cumsum - _check_cython_group_transform_cumulative(pd_op, np_op, dtype) - - -def test_cython_group_transform_cumprod(): - # see gh-4095 - dtype = np.float64 - pd_op, np_op = group_cumprod_float64, np.cumproduct - _check_cython_group_transform_cumulative(pd_op, np_op, dtype) - - -def test_cython_group_transform_algos(): - # see gh-4095 - is_datetimelike = False - - # with nans - labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) - ngroups = 1 - - data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") - actual = np.zeros_like(data) - actual.fill(np.nan) - group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) - expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") - tm.assert_numpy_array_equal(actual[:, 0], expected) - - actual = np.zeros_like(data) - actual.fill(np.nan) - group_cumsum(actual, data, labels, ngroups, is_datetimelike) - expected = np.array([1, 3, 6, np.nan, 10], dtype="float64") - tm.assert_numpy_array_equal(actual[:, 0], expected) - - # timedelta - is_datetimelike = True - data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None] - actual = np.zeros_like(data, dtype="int64") - group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike) - expected = np.array( - [ - np.timedelta64(1, "ns"), - np.timedelta64(2, "ns"), - np.timedelta64(3, "ns"), - np.timedelta64(4, "ns"), - np.timedelta64(5, "ns"), - ] - ) - tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected) - - @pytest.mark.parametrize( "op, args, targop", [ diff --git a/pandas/tests/libs/__init__.py b/pandas/tests/libs/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/test_join.py b/pandas/tests/libs/test_join.py similarity index 68% rename from pandas/tests/test_join.py rename to pandas/tests/libs/test_join.py index 03198ec3289dd..95d6dcbaf3baf 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/libs/test_join.py @@ -2,8 +2,8 @@ import pytest from pandas._libs import join as libjoin +from pandas._libs.join import inner_join, left_outer_join -from pandas import Categorical, DataFrame, Index, merge import pandas._testing as tm @@ -42,6 +42,98 @@ def test_outer_join_indexer(self, dtype): exp = np.array([-1, -1, -1], dtype=np.int64) tm.assert_numpy_array_equal(rindexer, exp) + def test_cython_left_outer_join(self): + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + ls, rs = left_outer_join(left, right, max_group) + + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") + + exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) + exp_ri = np.array( + [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1] + ) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_right_outer_join(self): + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + rs, ls = left_outer_join(right, left, max_group) + + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") + + # 0 1 1 1 + exp_li = np.array( + [ + 0, + 1, + 2, + 3, + 4, + 5, + 3, + 4, + 5, + 3, + 4, + 5, + # 2 2 4 + 6, + 7, + 8, + 6, + 7, + 8, + -1, + ] + ) + exp_ri = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_inner_join(self): + left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) + max_group = 5 + + ls, rs = inner_join(left, right, max_group) + + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") + + exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) + exp_ri = np.array([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + def test_left_join_indexer_unique(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) @@ -243,10 +335,10 @@ def test_left_join_indexer(): def test_left_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) + idx = np.array([1, 1, 2, 5], dtype=np.int64) + idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64) - res, lidx, ridx = libjoin.left_join_indexer(idx2.values, idx.values) + res, lidx, ridx = libjoin.left_join_indexer(idx2, idx) exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) tm.assert_almost_equal(res, exp_res) @@ -259,10 +351,10 @@ def test_left_join_indexer2(): def test_outer_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) + idx = np.array([1, 1, 2, 5], dtype=np.int64) + idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64) - res, lidx, ridx = libjoin.outer_join_indexer(idx2.values, idx.values) + res, lidx, ridx = libjoin.outer_join_indexer(idx2, idx) exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) tm.assert_almost_equal(res, exp_res) @@ -275,10 +367,10 @@ def test_outer_join_indexer2(): def test_inner_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) + idx = np.array([1, 1, 2, 5], dtype=np.int64) + idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64) - res, lidx, ridx = libjoin.inner_join_indexer(idx2.values, idx.values) + res, lidx, ridx = libjoin.inner_join_indexer(idx2, idx) exp_res = np.array([1, 1, 2, 5], dtype=np.int64) tm.assert_almost_equal(res, exp_res) @@ -288,59 +380,3 @@ def test_inner_join_indexer2(): exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) tm.assert_almost_equal(ridx, exp_ridx) - - -def test_merge_join_categorical_multiindex(): - # From issue 16627 - a = { - "Cat1": Categorical(["a", "b", "a", "c", "a", "b"], ["a", "b", "c"]), - "Int1": [0, 1, 0, 1, 0, 0], - } - a = DataFrame(a) - - b = { - "Cat": Categorical(["a", "b", "c", "a", "b", "c"], ["a", "b", "c"]), - "Int": [0, 0, 0, 1, 1, 1], - "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], - } - b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] - - expected = merge( - a, - b.reset_index(), - left_on=["Cat1", "Int1"], - right_on=["Cat", "Int"], - how="left", - ) - result = a.join(b, on=["Cat1", "Int1"]) - expected = expected.drop(["Cat", "Int"], axis=1) - tm.assert_frame_equal(expected, result) - - # Same test, but with ordered categorical - a = { - "Cat1": Categorical( - ["a", "b", "a", "c", "a", "b"], ["b", "a", "c"], ordered=True - ), - "Int1": [0, 1, 0, 1, 0, 0], - } - a = DataFrame(a) - - b = { - "Cat": Categorical( - ["a", "b", "c", "a", "b", "c"], ["b", "a", "c"], ordered=True - ), - "Int": [0, 0, 0, 1, 1, 1], - "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], - } - b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] - - expected = merge( - a, - b.reset_index(), - left_on=["Cat1", "Int1"], - right_on=["Cat", "Int"], - how="left", - ) - result = a.join(b, on=["Cat1", "Int1"]) - expected = expected.drop(["Cat", "Int"], axis=1) - tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/test_lib.py b/pandas/tests/libs/test_lib.py similarity index 100% rename from pandas/tests/test_lib.py rename to pandas/tests/libs/test_lib.py diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 8108cd14b872a..af1e95313f365 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -2,8 +2,6 @@ from numpy.random import randn import pytest -from pandas._libs.join import inner_join, left_outer_join - import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, merge import pandas._testing as tm @@ -43,96 +41,6 @@ def setup_method(self, method): {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"] ) - def test_cython_left_outer_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) - max_group = 5 - - ls, rs = left_outer_join(left, right, max_group) - - exp_ls = left.argsort(kind="mergesort") - exp_rs = right.argsort(kind="mergesort") - - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) - tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - - def test_cython_right_outer_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) - max_group = 5 - - rs, ls = left_outer_join(right, left, max_group) - - exp_ls = left.argsort(kind="mergesort") - exp_rs = right.argsort(kind="mergesort") - - # 0 1 1 1 - exp_li = a_( - [ - 0, - 1, - 2, - 3, - 4, - 5, - 3, - 4, - 5, - 3, - 4, - 5, - # 2 2 4 - 6, - 7, - 8, - 6, - 7, - 8, - -1, - ] - ) - exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) - tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - - def test_cython_inner_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) - max_group = 5 - - ls, rs = inner_join(left, right, max_group) - - exp_ls = left.argsort(kind="mergesort") - exp_rs = right.argsort(kind="mergesort") - - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) - tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) - def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on="key2") _check_join(self.df, self.df2, joined_key2, ["key2"], how="left") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7d701d26185f1..c4c9b0e516192 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2227,3 +2227,59 @@ def test_categorical_non_unique_monotonic(n_categories): index=left_index, ) tm.assert_frame_equal(expected, result) + + +def test_merge_join_categorical_multiindex(): + # From issue 16627 + a = { + "Cat1": Categorical(["a", "b", "a", "c", "a", "b"], ["a", "b", "c"]), + "Int1": [0, 1, 0, 1, 0, 0], + } + a = DataFrame(a) + + b = { + "Cat": Categorical(["a", "b", "c", "a", "b", "c"], ["a", "b", "c"]), + "Int": [0, 0, 0, 1, 1, 1], + "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], + } + b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] + + expected = merge( + a, + b.reset_index(), + left_on=["Cat1", "Int1"], + right_on=["Cat", "Int"], + how="left", + ) + expected = expected.drop(["Cat", "Int"], axis=1) + result = a.join(b, on=["Cat1", "Int1"]) + tm.assert_frame_equal(expected, result) + + # Same test, but with ordered categorical + a = { + "Cat1": Categorical( + ["a", "b", "a", "c", "a", "b"], ["b", "a", "c"], ordered=True + ), + "Int1": [0, 1, 0, 1, 0, 0], + } + a = DataFrame(a) + + b = { + "Cat": Categorical( + ["a", "b", "c", "a", "b", "c"], ["b", "a", "c"], ordered=True + ), + "Int": [0, 0, 0, 1, 1, 1], + "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], + } + b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] + + expected = merge( + a, + b.reset_index(), + left_on=["Cat1", "Int1"], + right_on=["Cat", "Int"], + how="left", + ) + expected = expected.drop(["Cat", "Int"], axis=1) + result = a.join(b, on=["Cat1", "Int1"]) + tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3a1279c481a1d..ee8e2385fe698 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -3,11 +3,9 @@ import struct import numpy as np -from numpy.random import RandomState import pytest from pandas._libs import algos as libalgos, hashtable as ht -from pandas._libs.groupby import group_var_float32, group_var_float64 from pandas.compat import IS64 from pandas.compat.numpy import np_array_datetime64_compat import pandas.util._test_decorators as td @@ -1409,122 +1407,6 @@ def test_unique_tuples(self, arr, unique): tm.assert_numpy_array_equal(result, expected) -class GroupVarTestMixin: - def test_group_var_generic_1d(self): - prng = RandomState(1234) - - out = (np.nan * np.ones((5, 1))).astype(self.dtype) - counts = np.zeros(5, dtype="int64") - values = 10 * prng.rand(15, 1).astype(self.dtype) - labels = np.tile(np.arange(5), (3,)).astype("int64") - - expected_out = ( - np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2 - )[:, np.newaxis] - expected_counts = counts + 3 - - self.algo(out, counts, values, labels) - assert np.allclose(out, expected_out, self.rtol) - tm.assert_numpy_array_equal(counts, expected_counts) - - def test_group_var_generic_1d_flat_labels(self): - prng = RandomState(1234) - - out = (np.nan * np.ones((1, 1))).astype(self.dtype) - counts = np.zeros(1, dtype="int64") - values = 10 * prng.rand(5, 1).astype(self.dtype) - labels = np.zeros(5, dtype="int64") - - expected_out = np.array([[values.std(ddof=1) ** 2]]) - expected_counts = counts + 5 - - self.algo(out, counts, values, labels) - - assert np.allclose(out, expected_out, self.rtol) - tm.assert_numpy_array_equal(counts, expected_counts) - - def test_group_var_generic_2d_all_finite(self): - prng = RandomState(1234) - - out = (np.nan * np.ones((5, 2))).astype(self.dtype) - counts = np.zeros(5, dtype="int64") - values = 10 * prng.rand(10, 2).astype(self.dtype) - labels = np.tile(np.arange(5), (2,)).astype("int64") - - expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2 - expected_counts = counts + 2 - - self.algo(out, counts, values, labels) - assert np.allclose(out, expected_out, self.rtol) - tm.assert_numpy_array_equal(counts, expected_counts) - - def test_group_var_generic_2d_some_nan(self): - prng = RandomState(1234) - - out = (np.nan * np.ones((5, 2))).astype(self.dtype) - counts = np.zeros(5, dtype="int64") - values = 10 * prng.rand(10, 2).astype(self.dtype) - values[:, 1] = np.nan - labels = np.tile(np.arange(5), (2,)).astype("int64") - - expected_out = np.vstack( - [ - values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2, - np.nan * np.ones(5), - ] - ).T.astype(self.dtype) - expected_counts = counts + 2 - - self.algo(out, counts, values, labels) - tm.assert_almost_equal(out, expected_out, rtol=0.5e-06) - tm.assert_numpy_array_equal(counts, expected_counts) - - def test_group_var_constant(self): - # Regression test from GH 10448. - - out = np.array([[np.nan]], dtype=self.dtype) - counts = np.array([0], dtype="int64") - values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype) - labels = np.zeros(3, dtype="int64") - - self.algo(out, counts, values, labels) - - assert counts[0] == 3 - assert out[0, 0] >= 0 - tm.assert_almost_equal(out[0, 0], 0.0) - - -class TestGroupVarFloat64(GroupVarTestMixin): - __test__ = True - - algo = staticmethod(group_var_float64) - dtype = np.float64 - rtol = 1e-5 - - def test_group_var_large_inputs(self): - - prng = RandomState(1234) - - out = np.array([[np.nan]], dtype=self.dtype) - counts = np.array([0], dtype="int64") - values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype) - values.shape = (10 ** 6, 1) - labels = np.zeros(10 ** 6, dtype="int64") - - self.algo(out, counts, values, labels) - - assert counts[0] == 10 ** 6 - tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3) - - -class TestGroupVarFloat32(GroupVarTestMixin): - __test__ = True - - algo = staticmethod(group_var_float32) - dtype = np.float32 - rtol = 1e-2 - - class TestHashTable: def test_string_hashtable_set_item_signature(self): # GH#30419 fix typing in StringHashTable.set_item to prevent segfault