Skip to content

Commit 3b94119

Browse files
authored
PERF: avoid copy in _obj_with_exclusions, _selected_obj (#51090)
1 parent 337faf3 commit 3b94119

File tree

4 files changed

+42
-9
lines changed

4 files changed

+42
-9
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -979,6 +979,8 @@ Performance improvements
979979
- Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`)
980980
- Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
981981
- Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
982+
- Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
983+
-
982984

983985
.. ---------------------------------------------------------------------------
984986
.. _whatsnew_200.bug_fixes:

pandas/core/base.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -213,12 +213,12 @@ def ndim(self) -> int:
213213
@final
214214
@cache_readonly
215215
def _obj_with_exclusions(self):
216-
if self._selection is not None and isinstance(self.obj, ABCDataFrame):
217-
return self.obj[self._selection_list]
218-
219216
if isinstance(self.obj, ABCSeries):
220217
return self.obj
221218

219+
if self._selection is not None:
220+
return self.obj._getitem_nocopy(self._selection_list)
221+
222222
if len(self.exclusions) > 0:
223223
# equivalent to `self.obj.drop(self.exclusions, axis=1)`
224224
# but this avoids consolidating and making a copy

pandas/core/frame.py

+19
Original file line numberDiff line numberDiff line change
@@ -3663,6 +3663,25 @@ def _iter_column_arrays(self) -> Iterator[ArrayLike]:
36633663
for i in range(len(self.columns)):
36643664
yield self._get_column_array(i)
36653665

3666+
def _getitem_nocopy(self, key: list):
3667+
"""
3668+
Behaves like __getitem__, but returns a view in cases where __getitem__
3669+
would make a copy.
3670+
"""
3671+
# TODO(CoW): can be removed if/when we are always Copy-on-Write
3672+
indexer = self.columns._get_indexer_strict(key, "columns")[1]
3673+
new_axis = self.columns[indexer]
3674+
3675+
new_mgr = self._mgr.reindex_indexer(
3676+
new_axis,
3677+
indexer,
3678+
axis=0,
3679+
allow_dups=True,
3680+
copy=False,
3681+
only_slice=True,
3682+
)
3683+
return self._constructor(new_mgr)
3684+
36663685
def __getitem__(self, key):
36673686
check_dict_or_set_indexers(key)
36683687
key = lib.item_from_zerodim(key)

pandas/core/groupby/groupby.py

+18-6
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ class providing the base-class of operations.
7676
is_bool_dtype,
7777
is_datetime64_dtype,
7878
is_float_dtype,
79+
is_hashable,
7980
is_integer,
8081
is_integer_dtype,
8182
is_numeric_dtype,
@@ -722,13 +723,24 @@ def _get_index(self, name):
722723
@cache_readonly
723724
def _selected_obj(self):
724725
# Note: _selected_obj is always just `self.obj` for SeriesGroupBy
725-
726-
if self._selection is None or isinstance(self.obj, Series):
727-
if self._group_selection is not None:
728-
return self.obj[self._group_selection]
726+
if isinstance(self.obj, Series):
729727
return self.obj
730-
else:
731-
return self.obj[self._selection]
728+
729+
if self._selection is not None:
730+
if is_hashable(self._selection):
731+
# i.e. a single key, so selecting it will return a Series.
732+
# In this case, _obj_with_exclusions would wrap the key
733+
# in a list and return a single-column DataFrame.
734+
return self.obj[self._selection]
735+
736+
# Otherwise _selection is equivalent to _selection_list, so
737+
# _selected_obj matches _obj_with_exclusions, so we can re-use
738+
# that and avoid making a copy.
739+
return self._obj_with_exclusions
740+
741+
if self._group_selection is not None:
742+
return self._obj_with_exclusions
743+
return self.obj
732744

733745
@final
734746
def _dir_additions(self) -> set[str]:

0 commit comments

Comments
 (0)