From 1af240f8c4ad2348220409b3b6dab04cbbfee3a0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 5 Nov 2022 17:19:37 +0100 Subject: [PATCH 1/7] ENH: Avoid object path for merge and masked arrays --- pandas/_libs/hashtable.pyx | 4 ++-- pandas/_libs/hashtable_class_helper.pxi.in | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 7aaeee043c72b..e5bd71d3dc565 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -137,7 +137,7 @@ cdef class Int64Factorizer(Factorizer): self.uniques = Int64Vector() def factorize(self, const int64_t[:] values, sort=False, - na_sentinel=-1, na_value=None) -> np.ndarray: + na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray: """ Returns ------- @@ -160,7 +160,7 @@ cdef class Int64Factorizer(Factorizer): self.uniques = uniques labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, - na_value=na_value) + na_value=na_value, mask=mask) # sort on if sort: diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c6d8783d6f115..bda8cd83c0605 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -821,11 +821,11 @@ cdef class {{name}}HashTable(HashTable): def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, - ignore_na=True, return_inverse=True) + ignore_na=True, return_inverse=True, mask=mask) return labels {{if dtype == 'int64'}} From 35e769a0699c28512944a499a519f483cb90a5e5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 5 Nov 2022 20:19:11 +0100 Subject: [PATCH 2/7] ENH: Support mask in Int64Factorizer --- pandas/_libs/hashtable.pyi | 1 + pandas/tests/test_algos.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index e60ccdb29c6b2..c4feb5b68d50f 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -36,6 +36,7 @@ class Int64Factorizer(Factorizer): sort: bool = ..., na_sentinel=..., na_value=..., + mask=..., ) -> npt.NDArray[np.intp]: ... class Int64Vector: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a6b765117f616..0eeefc974cfdf 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -246,6 +246,27 @@ def test_factorize_nan(self): assert len(set(key)) == len(set(expected)) tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) + def test_factorizer_with_mask(self): + # GH# + data = np.array([1, 2, 3, 1, 1, 0]) + mask = np.array([False, False, False, False, False, True]) + rizer = ht.Int64Factorizer(len(data)) + result = rizer.factorize(data, mask=mask) + expected = np.array([0, 1, 2, 0, 0, -1]) + tm.assert_numpy_array_equal(result, expected) + expected_uniques = np.array([1, 2, 3]) + tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques) + + def test_factorizer_object_with_nan(self): + # GH# + data = np.array([1, 2, 3, 1, np.nan]) + rizer = ht.ObjectFactorizer(len(data)) + result = rizer.factorize(data.astype(object)) + expected = np.array([0, 1, 2, 0, -1]) + tm.assert_numpy_array_equal(result, expected) + expected_uniques = np.array([1, 2, 3], dtype=object) + tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques) + @pytest.mark.parametrize( "data, expected_codes, expected_uniques", [ From 63f2e910ef992ab5996275ad8607395a25afbd31 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 5 Nov 2022 20:23:19 +0100 Subject: [PATCH 3/7] Fix mypy --- pandas/_libs/hashtable.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index c4feb5b68d50f..18ebc1ff2bd1f 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -138,6 +138,7 @@ class HashTable: count_prior: int = ..., na_sentinel: int = ..., na_value: object = ..., + mask=..., ) -> npt.NDArray[np.intp]: ... def unique( self, From b3e8403662216f9cbc6e4f4c6ef08e098cb8b6a2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 5 Nov 2022 20:23:45 +0100 Subject: [PATCH 4/7] Add gh ref --- pandas/tests/test_algos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 0eeefc974cfdf..a3aa423bd560d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -247,7 +247,7 @@ def test_factorize_nan(self): tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) def test_factorizer_with_mask(self): - # GH# + # GH#49549 data = np.array([1, 2, 3, 1, 1, 0]) mask = np.array([False, False, False, False, False, True]) rizer = ht.Int64Factorizer(len(data)) @@ -258,7 +258,7 @@ def test_factorizer_with_mask(self): tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques) def test_factorizer_object_with_nan(self): - # GH# + # GH#49549 data = np.array([1, 2, 3, 1, np.nan]) rizer = ht.ObjectFactorizer(len(data)) result = rizer.factorize(data.astype(object)) From c12aa86c32cc2611e47b1b4980e2132af1d050ca Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 5 Nov 2022 22:38:09 +0100 Subject: [PATCH 5/7] Fix tests --- pandas/tests/test_algos.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a3aa423bd560d..ac89a67c3a59b 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -248,13 +248,13 @@ def test_factorize_nan(self): def test_factorizer_with_mask(self): # GH#49549 - data = np.array([1, 2, 3, 1, 1, 0]) + data = np.array([1, 2, 3, 1, 1, 0], dtype="int64") mask = np.array([False, False, False, False, False, True]) rizer = ht.Int64Factorizer(len(data)) result = rizer.factorize(data, mask=mask) - expected = np.array([0, 1, 2, 0, 0, -1]) + expected = np.array([0, 1, 2, 0, 0, -1], dtype="int64") tm.assert_numpy_array_equal(result, expected) - expected_uniques = np.array([1, 2, 3]) + expected_uniques = np.array([1, 2, 3], dtype="int64") tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques) def test_factorizer_object_with_nan(self): @@ -262,7 +262,7 @@ def test_factorizer_object_with_nan(self): data = np.array([1, 2, 3, 1, np.nan]) rizer = ht.ObjectFactorizer(len(data)) result = rizer.factorize(data.astype(object)) - expected = np.array([0, 1, 2, 0, -1]) + expected = np.array([0, 1, 2, 0, -1], dtype="int64") tm.assert_numpy_array_equal(result, expected) expected_uniques = np.array([1, 2, 3], dtype=object) tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques) From ab1f31ba4c0cd5e3a12039e48573d46ae67d5a33 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 6 Nov 2022 00:52:50 +0100 Subject: [PATCH 6/7] Fix tests --- pandas/tests/test_algos.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ac89a67c3a59b..319983969f957 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -9,7 +9,10 @@ algos as libalgos, hashtable as ht, ) -from pandas.compat import pa_version_under7p0 +from pandas.compat import ( + IS64, + pa_version_under7p0, +) from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td @@ -252,7 +255,8 @@ def test_factorizer_with_mask(self): mask = np.array([False, False, False, False, False, True]) rizer = ht.Int64Factorizer(len(data)) result = rizer.factorize(data, mask=mask) - expected = np.array([0, 1, 2, 0, 0, -1], dtype="int64") + exp_dtype = "int64" if IS64 else "int32" + expected = np.array([0, 1, 2, 0, 0, -1], dtype=exp_dtype) tm.assert_numpy_array_equal(result, expected) expected_uniques = np.array([1, 2, 3], dtype="int64") tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques) @@ -262,7 +266,8 @@ def test_factorizer_object_with_nan(self): data = np.array([1, 2, 3, 1, np.nan]) rizer = ht.ObjectFactorizer(len(data)) result = rizer.factorize(data.astype(object)) - expected = np.array([0, 1, 2, 0, -1], dtype="int64") + exp_dtype = "int64" if IS64 else "int32" + expected = np.array([0, 1, 2, 0, -1], dtype=exp_dtype) tm.assert_numpy_array_equal(result, expected) expected_uniques = np.array([1, 2, 3], dtype=object) tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques) From 9307d8cfc37bb1283e6fa1df4c8ebb4ce84c466e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 6 Nov 2022 00:57:17 +0100 Subject: [PATCH 7/7] Fix tests --- pandas/tests/test_algos.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 319983969f957..ac81956756a30 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -9,10 +9,7 @@ algos as libalgos, hashtable as ht, ) -from pandas.compat import ( - IS64, - pa_version_under7p0, -) +from pandas.compat import pa_version_under7p0 from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td @@ -255,8 +252,7 @@ def test_factorizer_with_mask(self): mask = np.array([False, False, False, False, False, True]) rizer = ht.Int64Factorizer(len(data)) result = rizer.factorize(data, mask=mask) - exp_dtype = "int64" if IS64 else "int32" - expected = np.array([0, 1, 2, 0, 0, -1], dtype=exp_dtype) + expected = np.array([0, 1, 2, 0, 0, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) expected_uniques = np.array([1, 2, 3], dtype="int64") tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques) @@ -266,8 +262,7 @@ def test_factorizer_object_with_nan(self): data = np.array([1, 2, 3, 1, np.nan]) rizer = ht.ObjectFactorizer(len(data)) result = rizer.factorize(data.astype(object)) - exp_dtype = "int64" if IS64 else "int32" - expected = np.array([0, 1, 2, 0, -1], dtype=exp_dtype) + expected = np.array([0, 1, 2, 0, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) expected_uniques = np.array([1, 2, 3], dtype=object) tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)