Skip to content

Commit e66c15f

Browse files
mroeschke authored and pmhatre1 committed
PERF: Allow np.integer Series/Index to convert to RangeIndex (pandas-dev#58016)
* PERF: Allow np.integer Series/Index to convert to RangeIndex
* cast Series to array
* missing not
* Remove int32 casting in stata tests
* Add casting
* Specify int64
* don't overwrite sequence
1 parent 2c30be4 commit e66c15f

File tree

4 files changed

+29
-29
lines changed

4 files changed

+29
-29
lines changed

pandas/core/indexes/base.py

+10-5
Original file line number | Diff line number | Diff line change
@@ -7115,17 +7115,22 @@ def maybe_sequence_to_range(sequence) -> Any | range:
71157115
-------
71167116
Any : input or range
71177117
"""
7118-
if isinstance(sequence, (ABCSeries, Index, range, ExtensionArray)):
7118+
if isinstance(sequence, (range, ExtensionArray)):
71197119
return sequence
71207120
elif len(sequence) == 1 or lib.infer_dtype(sequence, skipna=False) != "integer":
71217121
return sequence
7122-
elif len(sequence) == 0:
7122+
elif isinstance(sequence, (ABCSeries, Index)) and not (
7123+
isinstance(sequence.dtype, np.dtype) and sequence.dtype.kind == "i"
7124+
):
7125+
return sequence
7126+
if len(sequence) == 0:
71237127
return range(0)
7124-
diff = sequence[1] - sequence[0]
7128+
np_sequence = np.asarray(sequence, dtype=np.int64)
7129+
diff = np_sequence[1] - np_sequence[0]
71257130
if diff == 0:
71267131
return sequence
7127-
elif len(sequence) == 2 or lib.is_sequence_range(np.asarray(sequence), diff):
7128-
return range(sequence[0], sequence[-1] + diff, diff)
7132+
elif len(sequence) == 2 or lib.is_sequence_range(np_sequence, diff):
7133+
return range(np_sequence[0], np_sequence[-1] + diff, diff)
71297134
else:
71307135
return sequence
71317136

pandas/tests/frame/methods/test_set_index.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@ def test_set_index_dst(self):
148148

149149
def test_set_index(self, float_string_frame):
150150
df = float_string_frame
151-
idx = Index(np.arange(len(df))[::-1])
151+
idx = Index(np.arange(len(df) - 1, -1, -1, dtype=np.int64))
152152

153153
df = df.set_index(idx)
154154
tm.assert_index_equal(df.index, idx)

pandas/tests/io/test_stata.py

+6-16
Original file line number | Diff line number | Diff line change
@@ -513,7 +513,6 @@ def test_read_write_reread_dta14(self, file, parsed_114, version, datapath):
513513
written_and_read_again = self.read_dta(path)
514514

515515
expected = parsed_114.copy()
516-
expected.index = expected.index.astype(np.int32)
517516
tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
518517

519518
@pytest.mark.parametrize(
@@ -576,7 +575,6 @@ def test_numeric_column_names(self):
576575
written_and_read_again.columns = map(convert_col_name, columns)
577576

578577
expected = original
579-
expected.index = expected.index.astype(np.int32)
580578
tm.assert_frame_equal(expected, written_and_read_again)
581579

582580
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
@@ -594,7 +592,6 @@ def test_nan_to_missing_value(self, version):
594592

595593
written_and_read_again = written_and_read_again.set_index("index")
596594
expected = original
597-
expected.index = expected.index.astype(np.int32)
598595
tm.assert_frame_equal(written_and_read_again, expected)
599596

600597
def test_no_index(self):
@@ -617,7 +614,6 @@ def test_string_no_dates(self):
617614
written_and_read_again = self.read_dta(path)
618615

619616
expected = original
620-
expected.index = expected.index.astype(np.int32)
621617
tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
622618

623619
def test_large_value_conversion(self):
@@ -637,7 +633,6 @@ def test_large_value_conversion(self):
637633
modified["s1"] = Series(modified["s1"], dtype=np.int16)
638634
modified["s2"] = Series(modified["s2"], dtype=np.int32)
639635
modified["s3"] = Series(modified["s3"], dtype=np.float64)
640-
modified.index = original.index.astype(np.int32)
641636
tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
642637

643638
def test_dates_invalid_column(self):
@@ -713,7 +708,7 @@ def test_write_missing_strings(self):
713708

714709
expected = DataFrame(
715710
[["1"], [""]],
716-
index=pd.Index([0, 1], dtype=np.int32, name="index"),
711+
index=pd.RangeIndex(2, name="index"),
717712
columns=["foo"],
718713
)
719714

@@ -746,7 +741,6 @@ def test_bool_uint(self, byteorder, version):
746741
written_and_read_again = written_and_read_again.set_index("index")
747742

748743
expected = original
749-
expected.index = expected.index.astype(np.int32)
750744
expected_types = (
751745
np.int8,
752746
np.int8,
@@ -1030,7 +1024,7 @@ def test_categorical_writing(self, version):
10301024
res = written_and_read_again.set_index("index")
10311025

10321026
expected = original
1033-
expected.index = expected.index.set_names("index").astype(np.int32)
1027+
expected.index = expected.index.set_names("index")
10341028

10351029
expected["incompletely_labeled"] = expected["incompletely_labeled"].apply(str)
10361030
expected["unlabeled"] = expected["unlabeled"].apply(str)
@@ -1094,7 +1088,6 @@ def test_categorical_with_stata_missing_values(self, version):
10941088
new_cats = cat.remove_unused_categories().categories
10951089
cat = cat.set_categories(new_cats, ordered=True)
10961090
expected[col] = cat
1097-
expected.index = expected.index.astype(np.int32)
10981091
tm.assert_frame_equal(res, expected)
10991092

11001093
@pytest.mark.parametrize("file", ["stata10_115", "stata10_117"])
@@ -1544,7 +1537,6 @@ def test_out_of_range_float(self):
15441537

15451538
original["ColumnTooBig"] = original["ColumnTooBig"].astype(np.float64)
15461539
expected = original
1547-
expected.index = expected.index.astype(np.int32)
15481540
tm.assert_frame_equal(reread.set_index("index"), expected)
15491541

15501542
@pytest.mark.parametrize("infval", [np.inf, -np.inf])
@@ -1669,7 +1661,6 @@ def test_writer_117(self):
16691661
original["int32"] = original["int32"].astype(np.int32)
16701662
original["float32"] = Series(original["float32"], dtype=np.float32)
16711663
original.index.name = "index"
1672-
original.index = original.index.astype(np.int32)
16731664
copy = original.copy()
16741665
with tm.ensure_clean() as path:
16751666
original.to_stata(
@@ -1962,7 +1953,7 @@ def test_read_write_ea_dtypes(self, dtype_backend):
19621953
# stata stores with ms unit, so unit does not round-trip exactly
19631954
"e": pd.date_range("2020-12-31", periods=3, freq="D", unit="ms"),
19641955
},
1965-
index=pd.Index([0, 1, 2], name="index", dtype=np.int32),
1956+
index=pd.RangeIndex(range(3), name="index"),
19661957
)
19671958

19681959
tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
@@ -2049,7 +2040,6 @@ def test_compression(compression, version, use_dict, infer, compression_to_exten
20492040
reread = read_stata(fp, index_col="index")
20502041

20512042
expected = df
2052-
expected.index = expected.index.astype(np.int32)
20532043
tm.assert_frame_equal(reread, expected)
20542044

20552045

@@ -2075,7 +2065,6 @@ def test_compression_dict(method, file_ext):
20752065
reread = read_stata(fp, index_col="index")
20762066

20772067
expected = df
2078-
expected.index = expected.index.astype(np.int32)
20792068
tm.assert_frame_equal(reread, expected)
20802069

20812070

@@ -2085,7 +2074,6 @@ def test_chunked_categorical(version):
20852074
df.index.name = "index"
20862075

20872076
expected = df.copy()
2088-
expected.index = expected.index.astype(np.int32)
20892077

20902078
with tm.ensure_clean() as path:
20912079
df.to_stata(path, version=version)
@@ -2094,7 +2082,9 @@ def test_chunked_categorical(version):
20942082
block = block.set_index("index")
20952083
assert "cats" in block
20962084
tm.assert_series_equal(
2097-
block.cats, expected.cats.iloc[2 * i : 2 * (i + 1)]
2085+
block.cats,
2086+
expected.cats.iloc[2 * i : 2 * (i + 1)],
2087+
check_index_type=len(block) > 1,
20982088
)
20992089

21002090

pandas/tests/reshape/merge/test_merge.py

+12-7
Original file line number | Diff line number | Diff line change
@@ -2192,23 +2192,28 @@ def test_merge_on_indexes(self, how, sort, expected):
21922192

21932193
@pytest.mark.parametrize(
21942194
"index",
2195-
[Index([1, 2], dtype=dtyp, name="index_col") for dtyp in tm.ALL_REAL_NUMPY_DTYPES]
2195+
[
2196+
Index([1, 2, 4], dtype=dtyp, name="index_col")
2197+
for dtyp in tm.ALL_REAL_NUMPY_DTYPES
2198+
]
21962199
+ [
2197-
CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"),
2198-
RangeIndex(start=0, stop=2, name="index_col"),
2199-
DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"),
2200+
CategoricalIndex(["A", "B", "C"], categories=["A", "B", "C"], name="index_col"),
2201+
RangeIndex(start=0, stop=3, name="index_col"),
2202+
DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"], name="index_col"),
22002203
],
22012204
ids=lambda x: f"{type(x).__name__}[{x.dtype}]",
22022205
)
22032206
def test_merge_index_types(index):
22042207
# gh-20777
22052208
# assert key access is consistent across index types
2206-
left = DataFrame({"left_data": [1, 2]}, index=index)
2207-
right = DataFrame({"right_data": [1.0, 2.0]}, index=index)
2209+
left = DataFrame({"left_data": [1, 2, 3]}, index=index)
2210+
right = DataFrame({"right_data": [1.0, 2.0, 3.0]}, index=index)
22082211

22092212
result = left.merge(right, on=["index_col"])
22102213

2211-
expected = DataFrame({"left_data": [1, 2], "right_data": [1.0, 2.0]}, index=index)
2214+
expected = DataFrame(
2215+
{"left_data": [1, 2, 3], "right_data": [1.0, 2.0, 3.0]}, index=index
2216+
)
22122217
tm.assert_frame_equal(result, expected)
22132218

22142219

0 commit comments

Comments
 (0)