Skip to content

PERF: Index._shallow_copy shares _cache with copies of self #36840

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Oct 6, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ Performance improvements
- ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
- Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`)
- Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`)
- Performance improvement for the :meth:`~Index.equals` method on all index classes, when compared to copies of the same index (:issue:`36840`)

.. ---------------------------------------------------------------------------

Expand Down Expand Up @@ -358,6 +359,7 @@ Indexing
- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`)
- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`)
- Bug in :meth:`DataFrame.sort_index` where passing the ``ascending`` parameter as a list on a single-level index gave the wrong result. (:issue:`32334`)
- Bug in :meth:`Index.equals`, where it required the name to be equal. This method should not compare names for equality (:issue:`36840`)

Missing
^^^^^^^
Expand Down
11 changes: 6 additions & 5 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,12 +561,13 @@ def _shallow_copy(self, values=None, name: Label = no_default):
name : Label, defaults to self.name
"""
name = self.name if name is no_default else name
cache = self._cache.copy() if values is None else {}
if values is None:
values = self._values

result = self._simple_new(values, name=name)
result._cache = cache
if values is not None:
return self._simple_new(values, name=name)

result = self._simple_new(self._values, name=name)
result._cache = self._cache.copy()
result._id = self._id
return result

def is_(self, other) -> bool:
Expand Down
15 changes: 7 additions & 8 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,17 +671,16 @@ def _with_freq(self, freq):

def _shallow_copy(self, values=None, name: Label = lib.no_default):
name = self.name if name is lib.no_default else name
cache = self._cache.copy() if values is None else {}

if values is None:
values = self._data

if isinstance(values, np.ndarray):
if values is not None:
# TODO: We would rather not get here
values = type(self._data)(values, dtype=self.dtype)
if isinstance(values, np.ndarray):
values = type(self._data)(values, dtype=self.dtype)
return self._simple_new(values, name=name)

result = type(self)._simple_new(values, name=name)
result._cache = cache
result = self._simple_new(self._data, name=name)
result._cache = self._cache.copy()
result._id = self._id
return result

# --------------------------------------------------------------------
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,12 +335,12 @@ def _shallow_copy(
self, values: Optional[IntervalArray] = None, name: Label = lib.no_default
):
name = self.name if name is lib.no_default else name
cache = self._cache.copy() if values is None else {}
if values is None:
values = self._data
if values is not None:
return self._simple_new(values, name=name)

result = self._simple_new(values, name=name)
result._cache = cache
result = self._simple_new(self._data, name=name)
result._cache = self._cache.copy()
result._id = self._id
return result

@cache_readonly
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,7 @@ def _shallow_copy(
)
result._cache = self._cache.copy()
result._cache.pop("levels", None) # GH32669
result._id = self._id
return result

def symmetric_difference(self, other, result_name=None, sort=None):
Expand Down Expand Up @@ -1192,7 +1193,6 @@ def __array__(self, dtype=None) -> np.ndarray:
def view(self, cls=None):
""" this is defined as a copy with the same identity """
result = self.copy()
result._id = self._id
return result

@doc(Index.__contains__)
Expand Down
11 changes: 6 additions & 5 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,12 +260,13 @@ def _has_complex_internals(self) -> bool:

def _shallow_copy(self, values=None, name: Label = no_default):
name = name if name is not no_default else self.name
cache = self._cache.copy() if values is None else {}
if values is None:
values = self._data

result = self._simple_new(values, name=name)
result._cache = cache
if values is not None:
return self._simple_new(values, name=name)

result = self._simple_new(self._data, name=name)
result._cache = self._cache.copy()
result._id = self._id
return result

def _maybe_convert_timedelta(self, other):
Expand Down
11 changes: 6 additions & 5 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,13 +397,14 @@ def __iter__(self):
def _shallow_copy(self, values=None, name: Label = no_default):
name = self.name if name is no_default else name

if values is None:
result = self._simple_new(self._range, name=name)
result._cache = self._cache.copy()
return result
else:
if values is not None:
return Int64Index._simple_new(values, name=name)

result = self._simple_new(self._range, name=name)
result._cache = self._cache.copy()
result._id = self._id
return result

@doc(Int64Index.copy)
def copy(self, name=None, deep=False, dtype=None, names=None):
name = self._validate_names(name=name, names=names, deep=deep)[0]
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -960,3 +960,9 @@ def test_shallow_copy_copies_cache(self):
# cache values should reference the same object
for key, val in idx._cache.items():
assert shallow_copy._cache[key] is val, key

def test_shallow_copy_copies_id(self):
# GH36840
idx = self.create_index()
shallow_copy = idx._shallow_copy()
assert idx._id is shallow_copy._id
2 changes: 1 addition & 1 deletion pandas/tests/indexes/datetimes/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,7 @@ def test_is_(self):
dti = date_range(start="1/1/2005", end="12/1/2005", freq="M")
assert dti.is_(dti)
assert dti.is_(dti.view())
assert not dti.is_(dti.copy())
assert dti.is_(dti.copy())

def test_index_cast_datetime64_other_units(self):
arr = np.arange(0, 100, 10, dtype=np.int64).view("M8[D]")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/multi/test_equivalence.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def test_is_():
assert mi2.is_(mi)
assert mi.is_(mi2)

assert not mi.is_(mi.set_names(["C", "D"]))
assert mi.is_(mi.set_names(["C", "D"]))
mi2 = mi.view()
mi2.set_names(["E", "F"], inplace=True)
assert mi.is_(mi2)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,8 +523,8 @@ def test_is_(self):
assert ind.is_(ind)
assert ind.is_(ind.view().view().view().view())
assert not ind.is_(Index(range(10)))
assert not ind.is_(ind.copy())
assert not ind.is_(ind.copy(deep=False))
assert ind.is_(ind.copy())
assert ind.is_(ind.copy(deep=False))
assert not ind.is_(ind[:])
assert not ind.is_(np.array(range(10)))

Expand Down