PERF: Allow np.integer Series/Index to convert to RangeIndex (pandas-dev#58016)

mroeschke · pmhatre1 · commit e66c15f6f6a5 · 2024-05-06T23:13:38.000-07:00
* PERF: Allow np.integer Series/Index to convert to RangeIndex

* Cast Series to array

* Add missing `not`

* Remove int32 casting in stata tests

* Add casting

* Specify int64

* Don't overwrite sequence
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -7115,17 +7115,22 @@ def maybe_sequence_to_range(sequence) -> Any | range:
     -------
     Any : input or range
     """
-    if isinstance(sequence, (ABCSeries, Index, range, ExtensionArray)):
+    if isinstance(sequence, (range, ExtensionArray)):
         return sequence
     elif len(sequence) == 1 or lib.infer_dtype(sequence, skipna=False) != "integer":
         return sequence
-    elif len(sequence) == 0:
+    elif isinstance(sequence, (ABCSeries, Index)) and not (
+        isinstance(sequence.dtype, np.dtype) and sequence.dtype.kind == "i"
+    ):
+        return sequence
+    if len(sequence) == 0:
         return range(0)
-    diff = sequence[1] - sequence[0]
+    np_sequence = np.asarray(sequence, dtype=np.int64)
+    diff = np_sequence[1] - np_sequence[0]
     if diff == 0:
         return sequence
-    elif len(sequence) == 2 or lib.is_sequence_range(np.asarray(sequence), diff):
-        return range(sequence[0], sequence[-1] + diff, diff)
+    elif len(sequence) == 2 or lib.is_sequence_range(np_sequence, diff):
+        return range(np_sequence[0], np_sequence[-1] + diff, diff)
     else:
         return sequence
 
diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py
@@ -148,7 +148,7 @@ def test_set_index_dst(self):
 
     def test_set_index(self, float_string_frame):
         df = float_string_frame
-        idx = Index(np.arange(len(df))[::-1])
+        idx = Index(np.arange(len(df) - 1, -1, -1, dtype=np.int64))
 
         df = df.set_index(idx)
         tm.assert_index_equal(df.index, idx)
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -513,7 +513,6 @@ def test_read_write_reread_dta14(self, file, parsed_114, version, datapath):
             written_and_read_again = self.read_dta(path)
 
         expected = parsed_114.copy()
-        expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
     @pytest.mark.parametrize(
@@ -576,7 +575,6 @@ def test_numeric_column_names(self):
         written_and_read_again.columns = map(convert_col_name, columns)
 
         expected = original
-        expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(expected, written_and_read_again)
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
@@ -594,7 +592,6 @@ def test_nan_to_missing_value(self, version):
 
         written_and_read_again = written_and_read_again.set_index("index")
         expected = original
-        expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(written_and_read_again, expected)
 
     def test_no_index(self):
@@ -617,7 +614,6 @@ def test_string_no_dates(self):
             written_and_read_again = self.read_dta(path)
 
         expected = original
-        expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 
     def test_large_value_conversion(self):
@@ -637,7 +633,6 @@ def test_large_value_conversion(self):
         modified["s1"] = Series(modified["s1"], dtype=np.int16)
         modified["s2"] = Series(modified["s2"], dtype=np.int32)
         modified["s3"] = Series(modified["s3"], dtype=np.float64)
-        modified.index = original.index.astype(np.int32)
         tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
 
     def test_dates_invalid_column(self):
@@ -713,7 +708,7 @@ def test_write_missing_strings(self):
 
         expected = DataFrame(
             [["1"], [""]],
-            index=pd.Index([0, 1], dtype=np.int32, name="index"),
+            index=pd.RangeIndex(2, name="index"),
             columns=["foo"],
         )
 
@@ -746,7 +741,6 @@ def test_bool_uint(self, byteorder, version):
         written_and_read_again = written_and_read_again.set_index("index")
 
         expected = original
-        expected.index = expected.index.astype(np.int32)
         expected_types = (
             np.int8,
             np.int8,
@@ -1030,7 +1024,7 @@ def test_categorical_writing(self, version):
         res = written_and_read_again.set_index("index")
 
         expected = original
-        expected.index = expected.index.set_names("index").astype(np.int32)
+        expected.index = expected.index.set_names("index")
 
         expected["incompletely_labeled"] = expected["incompletely_labeled"].apply(str)
         expected["unlabeled"] = expected["unlabeled"].apply(str)
@@ -1094,7 +1088,6 @@ def test_categorical_with_stata_missing_values(self, version):
             new_cats = cat.remove_unused_categories().categories
             cat = cat.set_categories(new_cats, ordered=True)
             expected[col] = cat
-        expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(res, expected)
 
     @pytest.mark.parametrize("file", ["stata10_115", "stata10_117"])
@@ -1544,7 +1537,6 @@ def test_out_of_range_float(self):
 
         original["ColumnTooBig"] = original["ColumnTooBig"].astype(np.float64)
         expected = original
-        expected.index = expected.index.astype(np.int32)
         tm.assert_frame_equal(reread.set_index("index"), expected)
 
     @pytest.mark.parametrize("infval", [np.inf, -np.inf])
@@ -1669,7 +1661,6 @@ def test_writer_117(self):
         original["int32"] = original["int32"].astype(np.int32)
         original["float32"] = Series(original["float32"], dtype=np.float32)
         original.index.name = "index"
-        original.index = original.index.astype(np.int32)
         copy = original.copy()
         with tm.ensure_clean() as path:
             original.to_stata(
@@ -1962,7 +1953,7 @@ def test_read_write_ea_dtypes(self, dtype_backend):
                 # stata stores with ms unit, so unit does not round-trip exactly
                 "e": pd.date_range("2020-12-31", periods=3, freq="D", unit="ms"),
             },
-            index=pd.Index([0, 1, 2], name="index", dtype=np.int32),
+            index=pd.RangeIndex(range(3), name="index"),
         )
 
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
@@ -2049,7 +2040,6 @@ def test_compression(compression, version, use_dict, infer, compression_to_exten
         reread = read_stata(fp, index_col="index")
 
     expected = df
-    expected.index = expected.index.astype(np.int32)
     tm.assert_frame_equal(reread, expected)
 
 
@@ -2075,7 +2065,6 @@ def test_compression_dict(method, file_ext):
         reread = read_stata(fp, index_col="index")
 
     expected = df
-    expected.index = expected.index.astype(np.int32)
     tm.assert_frame_equal(reread, expected)
 
 
@@ -2085,7 +2074,6 @@ def test_chunked_categorical(version):
     df.index.name = "index"
 
     expected = df.copy()
-    expected.index = expected.index.astype(np.int32)
 
     with tm.ensure_clean() as path:
         df.to_stata(path, version=version)
@@ -2094,7 +2082,9 @@ def test_chunked_categorical(version):
                 block = block.set_index("index")
                 assert "cats" in block
                 tm.assert_series_equal(
-                    block.cats, expected.cats.iloc[2 * i : 2 * (i + 1)]
+                    block.cats,
+                    expected.cats.iloc[2 * i : 2 * (i + 1)],
+                    check_index_type=len(block) > 1,
                 )
 
 
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -2192,23 +2192,28 @@ def test_merge_on_indexes(self, how, sort, expected):
 
 @pytest.mark.parametrize(
     "index",
-    [Index([1, 2], dtype=dtyp, name="index_col") for dtyp in tm.ALL_REAL_NUMPY_DTYPES]
+    [
+        Index([1, 2, 4], dtype=dtyp, name="index_col")
+        for dtyp in tm.ALL_REAL_NUMPY_DTYPES
+    ]
     + [
-        CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"),
-        RangeIndex(start=0, stop=2, name="index_col"),
-        DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"),
+        CategoricalIndex(["A", "B", "C"], categories=["A", "B", "C"], name="index_col"),
+        RangeIndex(start=0, stop=3, name="index_col"),
+        DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"], name="index_col"),
     ],
     ids=lambda x: f"{type(x).__name__}[{x.dtype}]",
 )
 def test_merge_index_types(index):
     # gh-20777
     # assert key access is consistent across index types
-    left = DataFrame({"left_data": [1, 2]}, index=index)
-    right = DataFrame({"right_data": [1.0, 2.0]}, index=index)
+    left = DataFrame({"left_data": [1, 2, 3]}, index=index)
+    right = DataFrame({"right_data": [1.0, 2.0, 3.0]}, index=index)
 
     result = left.merge(right, on=["index_col"])
 
-    expected = DataFrame({"left_data": [1, 2], "right_data": [1.0, 2.0]}, index=index)
+    expected = DataFrame(
+        {"left_data": [1, 2, 3], "right_data": [1.0, 2.0, 3.0]}, index=index
+    )
     tm.assert_frame_equal(result, expected)