Skip to content

CoW: Switch to copy=False everywhere for Series constructor #52068

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 2, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,7 @@ def value_counts(
if bins is not None:
from pandas.core.reshape.tile import cut

values = Series(values)
values = Series(values, copy=False)
try:
ii = cut(values, bins, include_lowest=True)
except TypeError as err:
Expand All @@ -861,7 +861,7 @@ def value_counts(
else:
if is_extension_array_dtype(values):
# handle Categorical and sparse,
result = Series(values)._values.value_counts(dropna=dropna)
result = Series(values, copy=False)._values.value_counts(dropna=dropna)
result.name = name
result.index.name = index_name
counts = result._values
Expand Down Expand Up @@ -893,7 +893,7 @@ def value_counts(
idx = idx.astype(object)
idx.name = index_name

result = Series(counts, index=idx, name=name)
result = Series(counts, index=idx, name=name, copy=False)

if sort:
result = result.sort_values(ascending=ascending)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ def value_counts(self, dropna: bool = True) -> Series:

index_arr = self._from_backing_data(np.asarray(result.index._data))
index = Index(index_arr, name=result.index.name)
return Series(result._values, index=index, name=result.name)
return Series(result._values, index=index, name=result.name, copy=False)

def _quantile(
self,
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1124,7 +1124,7 @@ def value_counts(self, dropna: bool = True) -> Series:

index = Index(type(self)(values))

return Series(counts, index=index, name="count")
return Series(counts, index=index, name="count", copy=False)

@classmethod
def _concat_same_type(cls, to_concat) -> Self:
Expand Down
12 changes: 8 additions & 4 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1501,7 +1501,9 @@ def value_counts(self, dropna: bool = True) -> Series:
ix = coerce_indexer_dtype(ix, self.dtype.categories)
ix = self._from_backing_data(ix)

return Series(count, index=CategoricalIndex(ix), dtype="int64", name="count")
return Series(
count, index=CategoricalIndex(ix), dtype="int64", name="count", copy=False
)

# error: Argument 2 of "_empty" is incompatible with supertype
# "NDArrayBackedExtensionArray"; supertype defines the argument type as
Expand Down Expand Up @@ -1759,7 +1761,9 @@ def _values_for_rank(self):
# reorder the categories (so rank can use the float codes)
# instead of passing an object array to rank
values = np.array(
self.rename_categories(Series(self.categories).rank().values)
self.rename_categories(
Series(self.categories, copy=False).rank().values
)
)
return values

Expand Down Expand Up @@ -2504,15 +2508,15 @@ def codes(self) -> Series:
"""
from pandas import Series

return Series(self._parent.codes, index=self._index)
return Series(self._parent.codes, index=self._index, copy=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should maybe not be a copy=False? (modifying the series should never mutate those codes)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point


def _delegate_method(self, name, *args, **kwargs):
from pandas import Series

method = getattr(self._parent, name)
res = method(*args, **kwargs)
if res is not None:
return Series(res, index=self._index, name=self._name)
return Series(res, index=self._index, name=self._name, copy=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are sure that none of the methods where this is used give a view? (didn't check in detail where this is used, just wondering)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so, but not 100%. Lets leave this out for now



# utility routines
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,7 +995,7 @@ def value_counts(self, dropna: bool = True) -> Series:
)

if dropna:
res = Series(value_counts, index=keys, name="count")
res = Series(value_counts, index=keys, name="count", copy=False)
res.index = res.index.astype(self.dtype)
res = res.astype("Int64")
return res
Expand All @@ -1011,7 +1011,7 @@ def value_counts(self, dropna: bool = True) -> Series:
mask = np.zeros(len(counts), dtype="bool")
counts_array = IntegerArray(counts, mask)

return Series(counts_array, index=index, name="count")
return Series(counts_array, index=index, name="count", copy=False)

@doc(ExtensionArray.equals)
def equals(self, other) -> bool:
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/sparse/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ def to_dense(self) -> Series:
self._parent.array.to_dense(),
index=self._parent.index,
name=self._parent.name,
copy=False,
)


Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,7 +886,7 @@ def value_counts(self, dropna: bool = True) -> Series:
index = Index(keys)
else:
index = keys
return Series(counts, index=index)
return Series(counts, index=index, copy=False)

# --------
# Indexing
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse/scipy_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def coo_to_sparse_series(
from pandas import SparseDtype

try:
ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)), copy=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this keyword get exposed to the user (and with a default of copy=True) ?

I would say this is a similar case as Series(ndarray) where we decided to change the default to True but give the ability for power uses to specify copy=False

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The astype below creates a copy as far as I understand, so copy=False here should be good.

except AttributeError as err:
raise TypeError(
f"Expected coo_matrix. Got {type(A).__name__} instead."
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,7 @@ def value_counts(
llab = lambda lab, inc: lab[inc]
else:
# lab is a Categorical with categories an IntervalIndex
cat_ser = cut(Series(val), bins, include_lowest=True)
cat_ser = cut(Series(val, copy=False), bins, include_lowest=True)
cat_obj = cast("Categorical", cat_ser._values)
lev = cat_obj.categories
lab = lev.take(
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1486,7 +1486,7 @@ def _agg_py_fallback(

if values.ndim == 1:
# For DataFrameGroupBy we only get here with ExtensionArray
ser = Series(values)
ser = Series(values, copy=False)
else:
# We only get here with values.dtype == object
# TODO: special case not needed with ArrayManager
Expand Down
10 changes: 6 additions & 4 deletions pandas/core/indexes/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def _delegate_property_get(self, name):
else:
index = self._parent.index
# return the result as a Series, which is by definition a copy
result = Series(result, index=index, name=self.name).__finalize__(self._parent)
result = Series(result, index=index, name=self.name, copy=False).__finalize__(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment just above should be updated then. But also wondering if we have properties that can be a view, in which case we shouldn't necessarily do copy=False? (but rather attach a ref to self._parent)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking through the datetimelike properties/methods, I think theoretically things like round/ceil/floor could return a view (if nothing needs to be rounded), but that's not an optimization we currently have I think.
Also tz_convert/tz_localize could theoretically return a view (but in practice also do not do that, from a quick test)

In [41]: s = pd.Series(pd.date_range("2012-01-01", periods=3, tz="UTC"))

In [43]: s._values._ndarray
Out[43]: 
array(['2012-01-01T00:00:00.000000000', '2012-01-02T00:00:00.000000000',
       '2012-01-03T00:00:00.000000000'], dtype='datetime64[ns]')

In [44]: s2 = s.dt.tz_convert("Europe/Brussels")

In [45]: np.shares_memory(s._values._ndarray, s2._values._ndarray)
Out[45]: False

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually they return views if it's a no-op. So will remove the copy=False for now

self._parent
)

# setting this object will show a SettingWithCopyWarning/Error
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sidenote: seeing this here, this might also be a case to look into if we can raise a "chained assignment" error

result._is_copy = (
Expand Down Expand Up @@ -130,9 +132,9 @@ def _delegate_method(self, name, *args, **kwargs):
if not is_list_like(result):
return result

result = Series(result, index=self._parent.index, name=self.name).__finalize__(
self._parent
)
result = Series(
result, index=self._parent.index, name=self.name, copy=False
).__finalize__(self._parent)

# setting this object will show a SettingWithCopyWarning/Error
result._is_copy = (
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/reshape/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def _get_dummies_1d(
from pandas.core.reshape.concat import concat

# Series avoids inconsistent NaN handling
codes, levels = factorize_from_iterable(Series(data))
codes, levels = factorize_from_iterable(Series(data, copy=False))

if dtype is None:
dtype = np.dtype(bool)
Expand Down Expand Up @@ -313,7 +313,7 @@ def get_empty_frame(data) -> DataFrame:
fill_value=fill_value,
dtype=dtype,
)
sparse_series.append(Series(data=sarr, index=index, name=col))
sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))

return concat(sparse_series, axis=1, copy=False)

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ def _get_series_list(self, others):
if isinstance(others, ABCSeries):
return [others]
elif isinstance(others, ABCIndex):
return [Series(others._values, index=idx, dtype=others.dtype)]
return [Series(others, index=idx, dtype=others.dtype)]
elif isinstance(others, ABCDataFrame):
return [others[x] for x in others]
elif isinstance(others, np.ndarray) and others.ndim == 2:
Expand Down Expand Up @@ -634,7 +634,7 @@ def cat(
else:
dtype = self._orig.dtype
res_ser = Series(
result, dtype=dtype, index=data.index, name=self._orig.name
result, dtype=dtype, index=data.index, name=self._orig.name, copy=False
)
out = res_ser.__finalize__(self._orig, method="str_cat")
return out
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ def _str_get_dummies(self, sep: str = "|"):
arr = sep + arr.astype(str) + sep

tags: set[str] = set()
for ts in Series(arr).str.split(sep):
for ts in Series(arr, copy=False).str.split(sep):
tags.update(ts)
tags2 = sorted(tags - {""})

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def _maybe_cache(
cache_dates = convert_listlike(unique_dates, format)
# GH#45319
try:
cache_array = Series(cache_dates, index=unique_dates)
cache_array = Series(cache_dates, index=unique_dates, copy=False)
except OutOfBoundsDatetime:
return cache_array
# GH#39882 and GH#35888 in case of None and NaT we get duplicates
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/window/ewm.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,7 @@ def cov_func(x, y):
self.ignore_na,
bias,
)
return Series(result, index=x.index, name=x.name)
return Series(result, index=x.index, name=x.name, copy=False)

return self._apply_pairwise(
self._selected_obj, other, pairwise, cov_func, numeric_only
Expand Down Expand Up @@ -810,7 +810,7 @@ def _cov(X, Y):
x_var = _cov(x_array, x_array)
y_var = _cov(y_array, y_array)
result = cov / zsqrt(x_var * y_var)
return Series(result, index=x.index, name=x.name)
return Series(result, index=x.index, name=x.name, copy=False)

return self._apply_pairwise(
self._selected_obj, other, pairwise, cov_func, numeric_only
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None:

if self.on is not None and not self._on.equals(obj.index):
name = self._on.name
extra_col = Series(self._on, index=self.obj.index, name=name)
extra_col = Series(self._on, index=self.obj.index, name=name, copy=False)
if name in result.columns:
# TODO: sure we want to overwrite results?
result[name] = extra_col
Expand Down Expand Up @@ -1418,7 +1418,7 @@ def _generate_cython_apply_func(
def apply_func(values, begin, end, min_periods, raw=raw):
if not raw:
# GH 45912
values = Series(values, index=self._on)
values = Series(values, index=self._on, copy=False)
return window_func(values, begin, end, min_periods)

return apply_func
Expand Down Expand Up @@ -1675,7 +1675,7 @@ def cov_func(x, y):
notna(x_array + y_array).astype(np.float64), start, end, 0
)
result = (mean_x_y - mean_x * mean_y) * (count_x_y / (count_x_y - ddof))
return Series(result, index=x.index, name=x.name)
return Series(result, index=x.index, name=x.name, copy=False)

return self._apply_pairwise(
self._selected_obj, other, pairwise, cov_func, numeric_only
Expand Down Expand Up @@ -1732,7 +1732,7 @@ def corr_func(x, y):
)
denominator = (x_var * y_var) ** 0.5
result = numerator / denominator
return Series(result, index=x.index, name=x.name)
return Series(result, index=x.index, name=x.name, copy=False)

return self._apply_pairwise(
self._selected_obj, other, pairwise, corr_func, numeric_only
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1989,7 +1989,7 @@ def _do_convert_categoricals(
# TODO: if we get a non-copying rename_categories, use that
cat_data = cat_data.rename_categories(categories)
except ValueError as err:
vc = Series(categories).value_counts()
vc = Series(categories, copy=False).value_counts()
repeated_cats = list(vc.index[vc > 1])
repeats = "-" * 80 + "\n" + "\n".join(repeated_cats)
# GH 25772
Expand All @@ -2006,7 +2006,7 @@ def _do_convert_categoricals(
"""
raise ValueError(msg) from err
# TODO: is the next line needed above in the data(...) method?
cat_series = Series(cat_data, index=data.index)
cat_series = Series(cat_data, index=data.index, copy=False)
cat_converted_data.append((col, cat_series))
else:
cat_converted_data.append((col, data[col]))
Expand Down
2 changes: 1 addition & 1 deletion pandas/plotting/_matplotlib/boxplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def _grouped_plot_by_column(
ax_values.append(re_plotf)
ax.grid(grid)

result = pd.Series(ax_values, index=columns)
result = pd.Series(ax_values, index=columns, copy=False)

# Return axes in multiplot case, maybe revisit later # 985
if return_type is None:
Expand Down