From fd09de41c16267beaa5bfa35ae6061aa04e5411c Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 23 Nov 2020 21:14:04 -0800 Subject: [PATCH 1/5] ENH: preserve RangeIndex in factorize --- pandas/core/algorithms.py | 3 ++ pandas/core/indexes/range.py | 13 +++++++- .../arrays/categorical/test_constructors.py | 10 ++++++ .../tests/indexes/multi/test_constructors.py | 8 +++++ pandas/tests/test_algos.py | 33 +++++++++++++++++++ 5 files changed, 66 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b79905796f7cd..447333862c609 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -52,6 +52,7 @@ ABCIndex, ABCIndexClass, ABCMultiIndex, + ABCRangeIndex, ABCSeries, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -670,6 +671,8 @@ def factorize( na_sentinel = -1 dropna = False + if isinstance(values, ABCRangeIndex): + return values.factorize(sort=sort) if is_extension_array_dtype(values.dtype): values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 8e12f84895361..376620ec469f5 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, List +from typing import Any, List, Optional, Tuple import warnings import numpy as np @@ -461,6 +461,17 @@ def argsort(self, *args, **kwargs) -> np.ndarray: else: return np.arange(len(self) - 1, -1, -1) + @doc(Int64Index.factorize) + def factorize( + self, sort: bool = False, na_sentinel: Optional[int] = -1 + ) -> Tuple[np.ndarray, "RangeIndex"]: + codes = np.arange(len(self)) + uniques = self + if sort and self.step < 0: + codes = codes[::-1] + uniques = uniques[::-1] + return codes, uniques + def equals(self, other: object) -> bool: """ Determines if two Index objects contain the same elements. diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index dab7fc51f2537..753c15bde6bba 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -290,6 +290,16 @@ def test_constructor_with_generator(self): cat = Categorical([0, 1, 2], categories=range(3)) tm.assert_categorical_equal(cat, exp) + def test_constructor_with_rangeindex(self): + # RangeIndex is preserved in Categories + rng = Index(range(3)) + + cat = Categorical(rng) + tm.assert_index_equal(cat.categories, rng, exact=True) + + cat = Categorical([1, 2, 0], categories=rng) + tm.assert_index_equal(cat.categories, rng, exact=True) + @pytest.mark.parametrize( "dtl", [ diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index a2ca686d0412d..df15e2e78aef0 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -477,6 +477,14 @@ def test_from_product_datetimeindex(): tm.assert_numpy_array_equal(mi.values, etalon) +def test_from_product_rangeindex(): + # RangeIndex is preserved by factorize, so preserved in levels + rng = Index(range(5)) + other = ["a", "b"] + mi = MultiIndex.from_product([rng, other]) + tm.assert_index_equal(mi._levels[0], rng, exact=True) + + @pytest.mark.parametrize("ordered", [False, True]) @pytest.mark.parametrize("f", [lambda x: x, lambda x: Series(x), lambda x: x.values]) def test_from_product_index_series_categorical(ordered, f): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c76369c213a70..cd129f50019dc 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -307,6 +307,39 @@ def test_datetime64_factorize(self, writable): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_numpy_array_equal(uniques, expected_uniques) + @pytest.mark.parametrize("sort", [True, False]) + def test_factorize_rangeindex(self, sort): + # increasing -> sort doesn't matter + ri = pd.RangeIndex.from_range(range(10)) + expected = np.arange(10), ri + + result = algos.factorize(ri, sort=sort) + tm.assert_numpy_array_equal(result[0], expected[0]) + tm.assert_index_equal(result[1], expected[1], exact=True) + + result = ri.factorize(sort=sort) + tm.assert_numpy_array_equal(result[0], expected[0]) + tm.assert_index_equal(result[1], expected[1], exact=True) + + @pytest.mark.parametrize("sort", [True, False]) + def test_factorize_rangeindex_decreasing(self, sort): + # decreasing -> sort matters + ri = pd.RangeIndex.from_range(range(10)) + expected = np.arange(10), ri + + ri2 = ri[::-1] + expected = expected[0], ri2 + if sort: + expected = expected[0][::-1], expected[1][::-1] + + result = algos.factorize(ri2, sort=sort) + tm.assert_numpy_array_equal(result[0], expected[0]) + tm.assert_index_equal(result[1], expected[1], exact=True) + + result = ri2.factorize(sort=sort) + tm.assert_numpy_array_equal(result[0], expected[0]) + tm.assert_index_equal(result[1], expected[1], exact=True) + def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. From 13cb696eabfb7accc2eba16bc4158fa1a0bac666 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 23 Nov 2020 21:15:55 -0800 Subject: [PATCH 2/5] dedoc --- pandas/core/indexes/range.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 376620ec469f5..fc8b8641fc091 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -461,7 +461,6 @@ def argsort(self, *args, **kwargs) -> np.ndarray: else: return np.arange(len(self) - 1, -1, -1) - @doc(Int64Index.factorize) def factorize( self, sort: bool = False, na_sentinel: Optional[int] = -1 ) -> Tuple[np.ndarray, "RangeIndex"]: From d4003f283d853e98aa14ac70fcff323ea668f00a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 23 Nov 2020 22:15:46 -0800 Subject: [PATCH 3/5] ensure np.intp --- pandas/core/indexes/range.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index fc8b8641fc091..3222f44dd4415 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -464,7 +464,7 @@ def argsort(self, *args, **kwargs) -> np.ndarray: def factorize( self, sort: bool = False, na_sentinel: Optional[int] = -1 ) -> Tuple[np.ndarray, "RangeIndex"]: - codes = np.arange(len(self)) + codes = np.arange(len(self), dtype=np.intp) uniques = self if sort and self.step < 0: codes = codes[::-1] From 231702ac433342d1b1ee23ae57e2ebdadf76879c Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 24 Nov 2020 07:45:31 -0800 Subject: [PATCH 4/5] 32bit compat --- pandas/tests/test_algos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cd129f50019dc..787cdf74b7d51 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -311,7 +311,7 @@ def test_datetime64_factorize(self, writable): def test_factorize_rangeindex(self, sort): # increasing -> sort doesn't matter ri = pd.RangeIndex.from_range(range(10)) - expected = np.arange(10), ri + expected = np.arange(10, dtype=np.intp), ri result = algos.factorize(ri, sort=sort) tm.assert_numpy_array_equal(result[0], expected[0]) @@ -325,7 +325,7 @@ def test_factorize_rangeindex(self, sort): def test_factorize_rangeindex_decreasing(self, sort): # decreasing -> sort matters ri = pd.RangeIndex.from_range(range(10)) - expected = np.arange(10), ri + expected = np.arange(10, dtype=np.intp), ri ri2 = ri[::-1] expected = expected[0], ri2 From bb2ffc0592369d8e73591b566f380f4b0e280248 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 26 Nov 2020 11:01:51 -0800 Subject: [PATCH 5/5] if->elif --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b78d6d55ca1f8..afd6bbb6c57e0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -685,7 +685,7 @@ def factorize( if isinstance(values, ABCRangeIndex): return values.factorize(sort=sort) - if is_extension_array_dtype(values.dtype): + elif is_extension_array_dtype(values.dtype): values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype