From e419141e1101a5d06b996bf3fb4eab18d4246eb9 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Thu, 27 Feb 2020 22:21:31 +0100 Subject: [PATCH 1/4] refactored test_factorize --- pandas/tests/base/test_ops.py | 71 ++++++++--------------------------- 1 file changed, 15 insertions(+), 56 deletions(-) diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py index f85d823cb2fac..0b597600c29ab 100644 --- a/pandas/tests/base/test_ops.py +++ b/pandas/tests/base/test_ops.py @@ -557,66 +557,25 @@ def test_value_counts_datetime64(self, index_or_series): result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s) - def test_factorize(self): - for orig in self.objs: - o = orig.copy() - - if isinstance(o, Index) and o.is_boolean(): - exp_arr = np.array([0, 1] + [0] * 8, dtype=np.intp) - exp_uniques = o - exp_uniques = Index([False, True]) - else: - exp_arr = np.array(range(len(o)), dtype=np.intp) - exp_uniques = o - codes, uniques = o.factorize() - - tm.assert_numpy_array_equal(codes, exp_arr) - if isinstance(o, Series): - tm.assert_index_equal(uniques, Index(orig), check_names=False) - else: - # factorize explicitly resets name - tm.assert_index_equal(uniques, exp_uniques, check_names=False) - - def test_factorize_repeated(self): - for orig in self.objs: - o = orig.copy() - - # don't test boolean - if isinstance(o, Index) and o.is_boolean(): - continue - - # sort by value, and create duplicates - if isinstance(o, Series): - o = o.sort_values() - n = o.iloc[5:].append(o) - else: - indexer = o.argsort() - o = o.take(indexer) - n = o[5:].append(o) + @pytest.mark.parametrize("sort", [True, False]) + def test_factorize(self, index_or_series_obj, sort): + obj = index_or_series_obj + result_codes, result_uniques = obj.factorize(sort=sort) - exp_arr = np.array( - [5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp - ) - codes, uniques = n.factorize(sort=True) + constructor = pd.Index + if isinstance(obj, pd.MultiIndex): + constructor = pd.MultiIndex.from_tuples + expected_uniques = constructor(obj.unique()) - tm.assert_numpy_array_equal(codes, exp_arr) - if isinstance(o, Series): - tm.assert_index_equal( - uniques, Index(orig).sort_values(), check_names=False - ) - else: - tm.assert_index_equal(uniques, o, check_names=False) + if sort: + expected_uniques = expected_uniques.sort_values() - exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], np.intp) - codes, uniques = n.factorize(sort=False) - tm.assert_numpy_array_equal(codes, exp_arr) + expected_uniques_list = list(expected_uniques) + expected_codes = [expected_uniques_list.index(val) for val in obj] + expected_codes = np.asarray(expected_codes, dtype=np.int64) - if isinstance(o, Series): - expected = Index(o.iloc[5:10].append(o.iloc[:5])) - tm.assert_index_equal(uniques, expected, check_names=False) - else: - expected = o[5:10].append(o[:5]) - tm.assert_index_equal(uniques, expected, check_names=False) + tm.assert_numpy_array_equal(result_codes, expected_codes) + tm.assert_index_equal(result_uniques, expected_uniques) def test_duplicated_drop_duplicates_index(self): # GH 4060 From cdca771557dfff4f2d21be1ece04bbe7725654b3 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Thu, 27 Feb 2020 23:35:56 +0100 Subject: [PATCH 2/4] fixing ci --- pandas/tests/base/test_ops.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py index 0b597600c29ab..ca2c51731018d 100644 --- a/pandas/tests/base/test_ops.py +++ b/pandas/tests/base/test_ops.py @@ -572,7 +572,10 @@ def test_factorize(self, index_or_series_obj, sort): expected_uniques_list = list(expected_uniques) expected_codes = [expected_uniques_list.index(val) for val in obj] - expected_codes = np.asarray(expected_codes, dtype=np.int64) + + # CI: on linux 32bit the dtype is int32, otherwise int64 + assert result_codes.dtype in [np.int32, np.int64] + expected_codes = np.asarray(expected_codes, dtype=result_codes.dtype) tm.assert_numpy_array_equal(result_codes, expected_codes) tm.assert_index_equal(result_uniques, expected_uniques) From 1b4825b53575c61e7c86987d83f742d1e83c7418 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Thu, 27 Feb 2020 23:48:36 +0100 Subject: [PATCH 3/4] switched to using intp instead of workaround to make CI happy --- pandas/tests/base/test_ops.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py index ca2c51731018d..450c8380dc458 100644 --- a/pandas/tests/base/test_ops.py +++ b/pandas/tests/base/test_ops.py @@ -572,10 +572,7 @@ def test_factorize(self, index_or_series_obj, sort): expected_uniques_list = list(expected_uniques) expected_codes = [expected_uniques_list.index(val) for val in obj] - - # CI: on linux 32bit the dtype is int32, otherwise int64 - assert result_codes.dtype in [np.int32, np.int64] - expected_codes = np.asarray(expected_codes, dtype=result_codes.dtype) + expected_codes = np.asarray(expected_codes, dtype=np.intp) tm.assert_numpy_array_equal(result_codes, expected_codes) tm.assert_index_equal(result_uniques, expected_uniques) From 9c22a50f86e3b88531ade03644310d3ea6412f5d Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Wed, 4 Mar 2020 18:02:51 +0100 Subject: [PATCH 4/4] review comments --- pandas/tests/base/test_ops.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py index 450c8380dc458..2dfd1d14965e2 100644 --- a/pandas/tests/base/test_ops.py +++ b/pandas/tests/base/test_ops.py @@ -570,6 +570,8 @@ def test_factorize(self, index_or_series_obj, sort): if sort: expected_uniques = expected_uniques.sort_values() + # construct an integer ndarray so that + # `expected_uniques.take(expected_codes)` is equal to `obj` expected_uniques_list = list(expected_uniques) expected_codes = [expected_uniques_list.index(val) for val in obj] expected_codes = np.asarray(expected_codes, dtype=np.intp)