Skip to content

Commit e448b56

Browse files
committed
Merge branch 'main' into pandas-devgh-50703
2 parents 2b7ef53 + 627bc40 commit e448b56

21 files changed

+197
-79
lines changed

.github/workflows/python-dev.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ jobs:
7373
run: |
7474
python --version
7575
python -m pip install --upgrade pip setuptools wheel
76-
python -m pip install --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy
76+
python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy
7777
python -m pip install git+https://github.com/nedbat/coveragepy.git
7878
python -m pip install versioneer[toml]
7979
python -m pip install python-dateutil pytz cython hypothesis>=6.34.2 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17

doc/source/whatsnew/v2.0.0.rst

+33-28
Original file line numberDiff line numberDiff line change
@@ -83,34 +83,40 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (
8383
df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
8484
df_pyarrow.dtypes
8585
86-
Copy on write improvements
86+
Copy-on-Write improvements
8787
^^^^^^^^^^^^^^^^^^^^^^^^^^
8888

89-
A new lazy copy mechanism that defers the copy until the object in question is modified
90-
was added to the following methods:
91-
92-
- :meth:`DataFrame.reset_index` / :meth:`Series.reset_index`
93-
- :meth:`DataFrame.set_index` / :meth:`Series.set_index`
94-
- :meth:`DataFrame.set_axis` / :meth:`Series.set_axis`
95-
- :meth:`DataFrame.rename_axis` / :meth:`Series.rename_axis`
96-
- :meth:`DataFrame.rename_columns`
97-
- :meth:`DataFrame.reindex` / :meth:`Series.reindex`
98-
- :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like`
99-
- :meth:`DataFrame.assign`
100-
- :meth:`DataFrame.drop`
101-
- :meth:`DataFrame.dropna` / :meth:`Series.dropna`
102-
- :meth:`DataFrame.select_dtypes`
103-
- :meth:`DataFrame.align` / :meth:`Series.align`
104-
- :meth:`Series.to_frame`
105-
- :meth:`DataFrame.rename` / :meth:`Series.rename`
106-
- :meth:`DataFrame.add_prefix` / :meth:`Series.add_prefix`
107-
- :meth:`DataFrame.add_suffix` / :meth:`Series.add_suffix`
108-
- :meth:`DataFrame.drop_duplicates` / :meth:`Series.drop_duplicates`
109-
- :meth:`DataFrame.reorder_levels` / :meth:`Series.reorder_levels`
110-
111-
These methods return views when copy on write is enabled, which provides a significant
112-
performance improvement compared to the regular execution (:issue:`49473`). Copy on write
113-
can be enabled through
89+
- A new lazy copy mechanism that defers the copy until the object in question is modified
90+
was added to the following methods:
91+
92+
- :meth:`DataFrame.reset_index` / :meth:`Series.reset_index`
93+
- :meth:`DataFrame.set_index` / :meth:`Series.set_index`
94+
- :meth:`DataFrame.set_axis` / :meth:`Series.set_axis`
95+
- :meth:`DataFrame.rename_axis` / :meth:`Series.rename_axis`
96+
- :meth:`DataFrame.rename_columns`
97+
- :meth:`DataFrame.reindex` / :meth:`Series.reindex`
98+
- :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like`
99+
- :meth:`DataFrame.assign`
100+
- :meth:`DataFrame.drop`
101+
- :meth:`DataFrame.dropna` / :meth:`Series.dropna`
102+
- :meth:`DataFrame.select_dtypes`
103+
- :meth:`DataFrame.align` / :meth:`Series.align`
104+
- :meth:`Series.to_frame`
105+
- :meth:`DataFrame.rename` / :meth:`Series.rename`
106+
- :meth:`DataFrame.add_prefix` / :meth:`Series.add_prefix`
107+
- :meth:`DataFrame.add_suffix` / :meth:`Series.add_suffix`
108+
- :meth:`DataFrame.drop_duplicates` / :meth:`Series.drop_duplicates`
109+
- :meth:`DataFrame.reorder_levels` / :meth:`Series.reorder_levels`
110+
111+
These methods return views when Copy-on-Write is enabled, which provides a significant
112+
performance improvement compared to the regular execution (:issue:`49473`).
113+
114+
- Accessing a single column of a DataFrame as a Series (e.g. ``df["col"]``) now always
115+
returns a new object every time it is constructed when Copy-on-Write is enabled (not
116+
returning the same cached Series object each time). This ensures that those
117+
Series objects correctly follow the Copy-on-Write rules (:issue:`49450`).
118+
119+
Copy-on-Write can be enabled through
114120

115121
.. code-block:: python
116122
@@ -563,8 +569,7 @@ Deprecations
563569
~~~~~~~~~~~~
564570
- Deprecated argument ``infer_datetime_format`` in :func:`to_datetime` and :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
565571
- Deprecated :func:`pandas.io.sql.execute` (:issue:`50185`)
566-
-
567-
572+
- :meth:`Index.is_integer` has been deprecated. Use :func:`pandas.api.types.is_integer_dtype` instead (:issue:`50042`)
568573
- :meth:`Index.is_floating` has been deprecated. Use :func:`pandas.api.types.is_float_dtype` instead (:issue:`50042`)
569574

570575
.. ---------------------------------------------------------------------------

pandas/_testing/asserters.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
is_bool,
1616
is_categorical_dtype,
1717
is_extension_array_dtype,
18+
is_integer_dtype,
1819
is_interval_dtype,
1920
is_number,
2021
is_numeric_dtype,
@@ -1335,7 +1336,7 @@ def assert_indexing_slices_equivalent(ser: Series, l_slc: slice, i_slc: slice) -
13351336

13361337
assert_series_equal(ser.loc[l_slc], expected)
13371338

1338-
if not ser.index.is_integer():
1339+
if not is_integer_dtype(ser.index):
13391340
# For integer indices, .loc and plain getitem are position-based.
13401341
assert_series_equal(ser[l_slc], expected)
13411342

pandas/core/frame.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@
3535
import numpy as np
3636
from numpy import ma
3737

38-
from pandas._config import get_option
38+
from pandas._config import (
39+
get_option,
40+
using_copy_on_write,
41+
)
3942

4043
from pandas._libs import (
4144
algos as libalgos,
@@ -4153,6 +4156,10 @@ def _clear_item_cache(self) -> None:
41534156

41544157
def _get_item_cache(self, item: Hashable) -> Series:
41554158
"""Return the cached item, item represents a label indexer."""
4159+
if using_copy_on_write():
4160+
loc = self.columns.get_loc(item)
4161+
return self._ixs(loc, axis=1)
4162+
41564163
cache = self._item_cache
41574164
res = cache.get(item)
41584165
if res is None:

pandas/core/generic.py

+2
Original file line numberDiff line numberDiff line change
@@ -3676,6 +3676,8 @@ def _maybe_update_cacher(
36763676
verify_is_copy : bool, default True
36773677
Provide is_copy checks.
36783678
"""
3679+
if using_copy_on_write():
3680+
return
36793681

36803682
if verify_is_copy:
36813683
self._check_setitem_copy(t="referent")

pandas/core/groupby/grouper.py

+9
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
import numpy as np
1616

17+
from pandas._config import using_copy_on_write
18+
1719
from pandas._typing import (
1820
ArrayLike,
1921
Axis,
@@ -887,6 +889,13 @@ def is_in_axis(key) -> bool:
887889
def is_in_obj(gpr) -> bool:
888890
if not hasattr(gpr, "name"):
889891
return False
892+
if using_copy_on_write():
893+
# For the CoW case, we need an equality check as the identity check
894+
# no longer works (each Series from column access is a new object)
895+
try:
896+
return gpr.equals(obj[gpr.name])
897+
except (AttributeError, KeyError, IndexError, InvalidIndexError):
898+
return False
890899
try:
891900
return gpr is obj[gpr.name]
892901
except (KeyError, IndexError, InvalidIndexError):

pandas/core/indexes/base.py

+17-7
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@
9898
is_float_dtype,
9999
is_hashable,
100100
is_integer,
101+
is_integer_dtype,
101102
is_interval_dtype,
102103
is_iterator,
103104
is_list_like,
@@ -2188,7 +2189,7 @@ def is_boolean(self) -> bool:
21882189
21892190
See Also
21902191
--------
2191-
is_integer : Check if the Index only consists of integers.
2192+
is_integer : Check if the Index only consists of integers (deprecated).
21922193
is_floating : Check if the Index is a floating type (deprecated).
21932194
is_numeric : Check if the Index only consists of numeric data.
21942195
is_object : Check if the Index is of the object dtype.
@@ -2216,6 +2217,9 @@ def is_integer(self) -> bool:
22162217
"""
22172218
Check if the Index only consists of integers.
22182219
2220+
.. deprecated:: 2.0.0
2221+
Use `pandas.api.types.is_integer_dtype` instead.
2222+
22192223
Returns
22202224
-------
22212225
bool
@@ -2244,6 +2248,12 @@ def is_integer(self) -> bool:
22442248
>>> idx.is_integer()
22452249
False
22462250
"""
2251+
warnings.warn(
2252+
f"{type(self).__name__}.is_integer is deprecated. "
2253+
"Use pandas.api.types.is_integer_dtype instead.",
2254+
FutureWarning,
2255+
stacklevel=find_stack_level(),
2256+
)
22472257
return self.inferred_type in ["integer"]
22482258

22492259
@final
@@ -2266,7 +2276,7 @@ def is_floating(self) -> bool:
22662276
See Also
22672277
--------
22682278
is_boolean : Check if the Index only consists of booleans.
2269-
is_integer : Check if the Index only consists of integers.
2279+
is_integer : Check if the Index only consists of integers (deprecated).
22702280
is_numeric : Check if the Index only consists of numeric data.
22712281
is_object : Check if the Index is of the object dtype.
22722282
is_categorical : Check if the Index holds categorical data.
@@ -2311,7 +2321,7 @@ def is_numeric(self) -> bool:
23112321
See Also
23122322
--------
23132323
is_boolean : Check if the Index only consists of booleans.
2314-
is_integer : Check if the Index only consists of integers.
2324+
is_integer : Check if the Index only consists of integers (deprecated).
23152325
is_floating : Check if the Index is a floating type (deprecated).
23162326
is_object : Check if the Index is of the object dtype.
23172327
is_categorical : Check if the Index holds categorical data.
@@ -2354,7 +2364,7 @@ def is_object(self) -> bool:
23542364
See Also
23552365
--------
23562366
is_boolean : Check if the Index only consists of booleans.
2357-
is_integer : Check if the Index only consists of integers.
2367+
is_integer : Check if the Index only consists of integers (deprecated).
23582368
is_floating : Check if the Index is a floating type (deprecated).
23592369
is_numeric : Check if the Index only consists of numeric data.
23602370
is_categorical : Check if the Index holds categorical data.
@@ -2395,7 +2405,7 @@ def is_categorical(self) -> bool:
23952405
--------
23962406
CategoricalIndex : Index for categorical data.
23972407
is_boolean : Check if the Index only consists of booleans.
2398-
is_integer : Check if the Index only consists of integers.
2408+
is_integer : Check if the Index only consists of integers (deprecated).
23992409
is_floating : Check if the Index is a floating type (deprecated).
24002410
is_numeric : Check if the Index only consists of numeric data.
24012411
is_object : Check if the Index is of the object dtype.
@@ -2438,7 +2448,7 @@ def is_interval(self) -> bool:
24382448
--------
24392449
IntervalIndex : Index for Interval objects.
24402450
is_boolean : Check if the Index only consists of booleans.
2441-
is_integer : Check if the Index only consists of integers.
2451+
is_integer : Check if the Index only consists of integers (deprecated).
24422452
is_floating : Check if the Index is a floating type (deprecated).
24432453
is_numeric : Check if the Index only consists of numeric data.
24442454
is_object : Check if the Index is of the object dtype.
@@ -3877,7 +3887,7 @@ def is_int(v):
38773887

38783888
if kind == "getitem":
38793889
# called from the getitem slicers, validate that we are in fact integers
3880-
if self.is_integer() or is_index_slice:
3890+
if is_integer_dtype(self.dtype) or is_index_slice:
38813891
# Note: these checks are redundant if we know is_index_slice
38823892
self._validate_indexer("slice", key.start, "getitem")
38833893
self._validate_indexer("slice", key.stop, "getitem")

pandas/core/series.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -1242,6 +1242,8 @@ def _set_as_cached(self, item, cacher) -> None:
12421242
Set the _cacher attribute on the calling object with a weakref to
12431243
cacher.
12441244
"""
1245+
if using_copy_on_write():
1246+
return
12451247
self._cacher = (item, weakref.ref(cacher))
12461248

12471249
def _clear_item_cache(self) -> None:
@@ -1265,6 +1267,10 @@ def _maybe_update_cacher(
12651267
"""
12661268
See NDFrame._maybe_update_cacher.__doc__
12671269
"""
1270+
# for CoW, we never want to update the parent DataFrame cache
1271+
# if the Series changed, but don't keep track of any cacher
1272+
if using_copy_on_write():
1273+
return
12681274
cacher = getattr(self, "_cacher", None)
12691275
if cacher is not None:
12701276
assert self.ndim == 1
@@ -1274,13 +1280,7 @@ def _maybe_update_cacher(
12741280
# a copy
12751281
if ref is None:
12761282
del self._cacher
1277-
# for CoW, we never want to update the parent DataFrame cache
1278-
# if the Series changed, and always pop the cached item
1279-
elif (
1280-
not using_copy_on_write()
1281-
and len(self) == len(ref)
1282-
and self.name in ref.columns
1283-
):
1283+
elif len(self) == len(ref) and self.name in ref.columns:
12841284
# GH#42530 self.name must be in ref.columns
12851285
# to ensure column still in dataframe
12861286
# otherwise, either self or ref has swapped in new arrays

pandas/tests/copy_view/test_indexing.py

+40
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,46 @@ def test_column_as_series_set_with_upcast(using_copy_on_write, using_array_manag
820820
tm.assert_frame_equal(df, df_orig)
821821

822822

823+
@pytest.mark.parametrize(
824+
"method",
825+
[
826+
lambda df: df["a"],
827+
lambda df: df.loc[:, "a"],
828+
lambda df: df.iloc[:, 0],
829+
],
830+
ids=["getitem", "loc", "iloc"],
831+
)
832+
def test_column_as_series_no_item_cache(
833+
request, method, using_copy_on_write, using_array_manager
834+
):
835+
# Case: selecting a single column (which now also uses Copy-on-Write to protect
836+
# the view) should always give a new object (i.e. not make use of a cache)
837+
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
838+
df_orig = df.copy()
839+
840+
s1 = method(df)
841+
s2 = method(df)
842+
843+
is_iloc = request.node.callspec.id == "iloc"
844+
if using_copy_on_write or is_iloc:
845+
assert s1 is not s2
846+
else:
847+
assert s1 is s2
848+
849+
if using_copy_on_write or using_array_manager:
850+
s1.iloc[0] = 0
851+
else:
852+
with pd.option_context("chained_assignment", "warn"):
853+
with tm.assert_produces_warning(SettingWithCopyWarning):
854+
s1.iloc[0] = 0
855+
856+
if using_copy_on_write:
857+
tm.assert_series_equal(s2, df_orig["a"])
858+
tm.assert_frame_equal(df, df_orig)
859+
else:
860+
assert s2.iloc[0] == 0
861+
862+
823863
# TODO add tests for other indexing methods on the Series
824864

825865

pandas/tests/frame/indexing/test_xs.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ def four_level_index_dataframe():
3636

3737

3838
class TestXS:
39-
def test_xs(self, float_frame, datetime_frame):
39+
def test_xs(self, float_frame, datetime_frame, using_copy_on_write):
40+
float_frame_orig = float_frame.copy()
4041
idx = float_frame.index[5]
4142
xs = float_frame.xs(idx)
4243
for item, value in xs.items():
@@ -66,7 +67,12 @@ def test_xs(self, float_frame, datetime_frame):
6667
# view is returned if possible
6768
series = float_frame.xs("A", axis=1)
6869
series[:] = 5
69-
assert (expected == 5).all()
70+
if using_copy_on_write:
71+
# but with CoW the view shouldn't propagate mutations
72+
tm.assert_series_equal(float_frame["A"], float_frame_orig["A"])
73+
assert not (expected == 5).all()
74+
else:
75+
assert (expected == 5).all()
7076

7177
def test_xs_corner(self):
7278
# pathological mixed-type reordering case

pandas/tests/frame/methods/test_cov_corr.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method):
206206
expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
207207
tm.assert_frame_equal(result, expected)
208208

209-
def test_corr_item_cache(self):
209+
def test_corr_item_cache(self, using_copy_on_write):
210210
# Check that corr does not lead to incorrect entries in item_cache
211211

212212
df = DataFrame({"A": range(10)})
@@ -217,11 +217,16 @@ def test_corr_item_cache(self):
217217

218218
_ = df.corr(numeric_only=True)
219219

220-
# Check that the corr didn't break link between ser and df
221-
ser.values[0] = 99
222-
assert df.loc[0, "A"] == 99
223-
assert df["A"] is ser
224-
assert df.values[0, 0] == 99
220+
if using_copy_on_write:
221+
# TODO(CoW) we should disallow this, so `df` doesn't get updated
222+
ser.values[0] = 99
223+
assert df.loc[0, "A"] == 99
224+
else:
225+
# Check that the corr didn't break link between ser and df
226+
ser.values[0] = 99
227+
assert df.loc[0, "A"] == 99
228+
assert df["A"] is ser
229+
assert df.values[0, 0] == 99
225230

226231
@pytest.mark.parametrize("length", [2, 20, 200, 2000])
227232
def test_corr_for_constant_columns(self, length):

0 commit comments

Comments
 (0)