Skip to content

Commit a0d4193

Browse files
authored
Merge branch 'main' into issue-50395
2 parents 85d31ec + 6c50f70 commit a0d4193

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+545
-163
lines changed

.github/workflows/wheels.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,8 @@ jobs:
173173
pip install hypothesis>=6.34.2 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17
174174
cd .. # Not a good idea to test within the src tree
175175
python -c "import pandas; print(pandas.__version__);
176-
pandas.test(extra_args=['-m not clipboard and not single_cpu and not slow and not network and not db', '-n 2', '--no-strict-data-files']);
177-
pandas.test(extra_args=['-m not clipboard and single_cpu and not slow and not network and not db', '--no-strict-data-files'])"
176+
pandas.test(extra_args=['-m not clipboard and not single_cpu and not slow and not network and not db', '-n 2']);
177+
pandas.test(extra_args=['-m not clipboard and single_cpu and not slow and not network and not db'])"
178178
- uses: actions/upload-artifact@v3
179179
with:
180180
name: sdist

asv_bench/benchmarks/strings.py

-7
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ def setup(self, dtype):
3434

3535
# GH37371. Testing construction of string series/frames from ExtensionArrays
3636
self.series_cat_arr = Categorical(self.series_arr)
37-
self.frame_cat_arr = Categorical(self.frame_arr)
3837

3938
def time_series_construction(self, dtype):
4039
Series(self.series_arr, dtype=dtype)
@@ -54,12 +53,6 @@ def time_cat_series_construction(self, dtype):
5453
def peakmem_cat_series_construction(self, dtype):
5554
Series(self.series_cat_arr, dtype=dtype)
5655

57-
def time_cat_frame_construction(self, dtype):
58-
DataFrame(self.frame_cat_arr, dtype=dtype)
59-
60-
def peakmem_cat_frame_construction(self, dtype):
61-
DataFrame(self.frame_cat_arr, dtype=dtype)
62-
6356

6457
class Methods(Dtypes):
6558
def time_center(self, dtype):

ci/code_checks.sh

-3
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
8686
MSG='Partially validate docstrings (EX01)' ; echo $MSG
8787
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01 --ignore_functions \
8888
pandas.Series.index \
89-
pandas.Series.hasnans \
90-
pandas.Series.to_list \
9189
pandas.Series.__iter__ \
9290
pandas.Series.keys \
9391
pandas.Series.item \
@@ -309,7 +307,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
309307
pandas_object \
310308
pandas.api.interchange.from_dataframe \
311309
pandas.Index.values \
312-
pandas.Index.hasnans \
313310
pandas.Index.dtype \
314311
pandas.Index.inferred_type \
315312
pandas.Index.shape \

ci/test_wheels.py

-2
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,10 @@
4141
multi_args = [
4242
"-m not clipboard and not single_cpu and not slow and not network and not db",
4343
"-n 2",
44-
"--no-strict-data-files",
4544
]
4645
pd.test(extra_args=multi_args)
4746
pd.test(
4847
extra_args=[
4948
"-m not clipboard and single_cpu and not slow and not network and not db",
50-
"--no-strict-data-files",
5149
]
5250
)

ci/test_wheels_windows.bat

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
set test_command=import pandas as pd; print(pd.__version__); ^
2-
pd.test(extra_args=['-m not clipboard and not single_cpu and not slow and not network and not db', '--no-strict-data-files', '-n=2']); ^
3-
pd.test(extra_args=['-m not clipboard and single_cpu and not slow and not network and not db', '--no-strict-data-files'])
2+
pd.test(extra_args=['-m not clipboard and not single_cpu and not slow and not network and not db', '-n 2']); ^
3+
pd.test(extra_args=['-m not clipboard and single_cpu and not slow and not network and not db'])
44

55
python --version
66
pip install pytz six numpy python-dateutil tzdata>=2022.1

doc/source/development/contributing_codebase.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -812,7 +812,8 @@ install pandas) by typing::
812812
your installation is probably fine and you can start contributing!
813813

814814
Often it is worth running only a subset of tests first around your changes before running the
815-
entire suite.
815+
entire suite (tip: you can use the `pandas-coverage app <https://pandas-coverage.herokuapp.com/>`_
816+
to find out which tests hit the lines of code you've modified, and then run only those).
816817

817818
The easiest way to do this is with::
818819

doc/source/reference/arrays.rst

+4-3
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,10 @@ PyArrow type pandas extension type NumPy
9393

9494
.. note::
9595

96-
For string types (``pyarrow.string()``, ``string[pyarrow]``), PyArrow support is still facilitated
97-
by :class:`arrays.ArrowStringArray` and ``StringDtype("pyarrow")``. See the :ref:`string section <api.arrays.string>`
98-
below.
96+
PyArrow-backed string support is provided by both ``pd.StringDtype("pyarrow")`` and ``pd.ArrowDtype(pa.string())``.
97+
``pd.StringDtype("pyarrow")`` is described below in the :ref:`string section <api.arrays.string>`
98+
and will be returned if the string alias ``"string[pyarrow]"`` is specified. ``pd.ArrowDtype(pa.string())``
99+
generally has better interoperability with :class:`ArrowDtype` of different types.
99100

100101
While individual values in an :class:`arrays.ArrowExtensionArray` are stored as PyArrow objects, scalars are **returned**
101102
as Python scalars corresponding to the data type, e.g. a PyArrow int64 will be returned as Python int, or :class:`NA` for missing

doc/source/user_guide/pyarrow.rst

+19-1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,23 @@ which is similar to a NumPy array. To construct these from the main pandas data
3535
df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")
3636
df
3737
38+
.. note::
39+
40+
The string alias ``"string[pyarrow]"`` maps to ``pd.StringDtype("pyarrow")`` which is not equivalent to
41+
specifying ``dtype=pd.ArrowDtype(pa.string())``. Generally, operations on the data will behave similarly
42+
except ``pd.StringDtype("pyarrow")`` can return NumPy-backed nullable types while ``pd.ArrowDtype(pa.string())``
43+
will return :class:`ArrowDtype`.
44+
45+
.. ipython:: python
46+
47+
import pyarrow as pa
48+
data = list("abc")
49+
ser_sd = pd.Series(data, dtype="string[pyarrow]")
50+
ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
51+
ser_ad.dtype == ser_sd.dtype
52+
ser_sd.str.contains("a")
53+
ser_ad.str.contains("a")
54+
3855
For PyArrow types that accept parameters, you can pass in a PyArrow type with those parameters
3956
into :class:`ArrowDtype` to use in the ``dtype`` parameter.
4057

@@ -106,6 +123,7 @@ The following are just some examples of operations that are accelerated by nativ
106123

107124
.. ipython:: python
108125
126+
import pyarrow as pa
109127
ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
110128
ser.mean()
111129
ser + ser
@@ -115,7 +133,7 @@ The following are just some examples of operations that are accelerated by nativ
115133
ser.isna()
116134
ser.fillna(0)
117135
118-
ser_str = pd.Series(["a", "b", None], dtype="string[pyarrow]")
136+
ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))
119137
ser_str.str.startswith("a")
120138
121139
from datetime import datetime

doc/source/whatsnew/v2.1.0.rst

+8-1
Original file line numberDiff line numberDiff line change
@@ -100,15 +100,19 @@ Deprecations
100100
~~~~~~~~~~~~
101101
- Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`)
102102
- Deprecated :meth:`DataFrame._data` and :meth:`Series._data`, use public APIs instead (:issue:`33333`)
103+
- Deprecated :meth:`.GroupBy.all` and :meth:`.GroupBy.any` with datetime64 or :class:`PeriodDtype` values, matching the :class:`Series` and :class:`DataFrame` deprecations (:issue:`34479`)
103104
- Deprecated pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`)
104105
- Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby`; this will default to ``True`` in a future version (:issue:`43999`)
105106
- Deprecated :meth:`DataFrameGroupBy.dtypes`, check ``dtypes`` on the underlying object instead (:issue:`51045`)
106107
- Deprecated ``axis=1`` in :meth:`DataFrame.groupby` and in :class:`Grouper` constructor, do ``frame.T.groupby(...)`` instead (:issue:`51203`)
108+
- Deprecated :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`)
107109
- Deprecated passing a :class:`DataFrame` to :meth:`DataFrame.from_records`, use :meth:`DataFrame.set_index` or :meth:`DataFrame.drop` instead (:issue:`51353`)
108110
- Deprecated accepting slices in :meth:`DataFrame.take`, call ``obj[slicer]`` or pass a sequence of integers instead (:issue:`51539`)
109111
- Deprecated ``axis=1`` in :meth:`DataFrame.ewm`, :meth:`DataFrame.rolling`, :meth:`DataFrame.expanding`, transpose before calling the method instead (:issue:`51778`)
110112
- Deprecated the ``axis`` keyword in :meth:`DataFrame.ewm`, :meth:`Series.ewm`, :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, :meth:`Series.expanding` (:issue:`51778`)
113+
- Deprecated the ``axis`` keyword in :meth:`DataFrame.resample`, :meth:`Series.resample` (:issue:`51778`)
111114
- Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`)
115+
- Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`)
112116
- Deprecated the 'axis' keyword in :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.fillna`, :meth:`.GroupBy.take`, :meth:`.GroupBy.skew`, :meth:`.GroupBy.rank`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.cummin`, :meth:`.GroupBy.pct_change`, :meth:`.GroupBy.diff`, :meth:`.GroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`)
113117
-
114118

@@ -196,11 +200,12 @@ Missing
196200

197201
MultiIndex
198202
^^^^^^^^^^
199-
-
203+
- Bug in :meth:`MultiIndex.set_levels` not preserving dtypes for :class:`Categorical` (:issue:`52125`)
200204
-
201205

202206
I/O
203207
^^^
208+
- Bug in :func:`read_html` where tail text was removed together with elements containing the ``display:none`` style (:issue:`51629`)
204209
- :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
205210
-
206211

@@ -226,6 +231,7 @@ Groupby/resample/rolling
226231
grouped :class:`Series` or :class:`DataFrame` was a :class:`DatetimeIndex`, :class:`TimedeltaIndex`
227232
or :class:`PeriodIndex`, and the ``groupby`` method was given a function as its first argument,
228233
the function operated on the whole index rather than each element of the index. (:issue:`51979`)
234+
- Bug in :meth:`.GroupBy.var` failing to raise ``TypeError`` when called with datetime64 or :class:`PeriodDtype` values (:issue:`52128`)
229235
-
230236

231237
Reshaping
@@ -252,6 +258,7 @@ Styler
252258
Other
253259
^^^^^
254260
- Bug in :func:`assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`)
261+
- Bug in :meth:`Series.memory_usage` where ``deep=True`` raised an error for a :class:`Series` of objects and returned an incorrect value because it did not take GC corrections into account (:issue:`51858`)
255262

256263
.. ***DO NOT USE THIS SECTION***
257264

pandas/_libs/lib.pyx

+8-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from collections import abc
22
from decimal import Decimal
33
from enum import Enum
4+
from sys import getsizeof
45
from typing import (
56
Literal,
67
_GenericAlias,
@@ -159,7 +160,7 @@ def memory_usage_of_objects(arr: object[:]) -> int64_t:
159160

160161
n = len(arr)
161162
for i in range(n):
162-
size += arr[i].__sizeof__()
163+
size += getsizeof(arr[i])
163164
return size
164165

165166

@@ -2325,10 +2326,14 @@ def maybe_convert_numeric(
23252326
if not seen.coerce_numeric:
23262327
raise type(err)(f"{err} at position {i}")
23272328

2328-
seen.saw_null()
2329-
floats[i] = NaN
23302329
mask[i] = 1
23312330

2331+
if allow_null_in_int:
2332+
seen.null_ = True
2333+
else:
2334+
seen.saw_null()
2335+
floats[i] = NaN
2336+
23322337
if seen.check_uint64_conflict():
23332338
return (values, None)
23342339

pandas/_libs/tslibs/offsets.pyx

+11-17
Original file line numberDiff line numberDiff line change
@@ -2546,7 +2546,6 @@ cdef class MonthEnd(MonthOffset):
25462546
DateOffset of one month end.
25472547
25482548
MonthEnd goes to the next date which is an end of the month.
2549-
To get the end of the current month pass the parameter n equals 0.
25502549
25512550
See Also
25522551
--------
@@ -2562,10 +2561,10 @@ cdef class MonthEnd(MonthOffset):
25622561
>>> ts + pd.offsets.MonthEnd()
25632562
Timestamp('2022-02-28 00:00:00')
25642563
2565-
If you want to get the end of the current month pass the parameter n equals 0:
2564+
If you want to get the end of the current month:
25662565
25672566
>>> ts = pd.Timestamp(2022, 1, 31)
2568-
>>> ts + pd.offsets.MonthEnd(0)
2567+
>>> pd.offsets.MonthEnd().rollforward(ts)
25692568
Timestamp('2022-01-31 00:00:00')
25702569
"""
25712570
_period_dtype_code = PeriodDtypeCode.M
@@ -2578,7 +2577,6 @@ cdef class MonthBegin(MonthOffset):
25782577
DateOffset of one month at beginning.
25792578
25802579
MonthBegin goes to the next date which is a start of the month.
2581-
To get the start of the current month pass the parameter n equals 0.
25822580
25832581
See Also
25842582
--------
@@ -2594,10 +2592,10 @@ cdef class MonthBegin(MonthOffset):
25942592
>>> ts + pd.offsets.MonthBegin()
25952593
Timestamp('2023-01-01 00:00:00')
25962594
2597-
If you want to get the start of the current month pass the parameter n equals 0:
2595+
If you want to get the start of the current month:
25982596
25992597
>>> ts = pd.Timestamp(2022, 12, 1)
2600-
>>> ts + pd.offsets.MonthBegin(0)
2598+
>>> pd.offsets.MonthBegin().rollback(ts)
26012599
Timestamp('2022-12-01 00:00:00')
26022600
"""
26032601
_prefix = "MS"
@@ -2609,7 +2607,6 @@ cdef class BusinessMonthEnd(MonthOffset):
26092607
DateOffset increments between the last business day of the month.
26102608
26112609
BusinessMonthEnd goes to the next date which is the last business day of the month.
2612-
To get the last business day of the current month pass the parameter n equals 0.
26132610
26142611
Examples
26152612
--------
@@ -2621,11 +2618,10 @@ cdef class BusinessMonthEnd(MonthOffset):
26212618
>>> ts + pd.offsets.BMonthEnd()
26222619
Timestamp('2022-12-30 00:00:00')
26232620
2624-
If you want to get the end of the current business month
2625-
pass the parameter n equals 0:
2621+
If you want to get the end of the current business month:
26262622
26272623
>>> ts = pd.Timestamp(2022, 11, 30)
2628-
>>> ts + pd.offsets.BMonthEnd(0)
2624+
>>> pd.offsets.BMonthEnd().rollforward(ts)
26292625
Timestamp('2022-11-30 00:00:00')
26302626
"""
26312627
_prefix = "BM"
@@ -2637,8 +2633,7 @@ cdef class BusinessMonthBegin(MonthOffset):
26372633
DateOffset of one month at the first business day.
26382634
26392635
BusinessMonthBegin goes to the next date which is the first business day
2640-
of the month. To get the first business day of the current month pass
2641-
the parameter n equals 0.
2636+
of the month.
26422637
26432638
Examples
26442639
--------
@@ -2650,11 +2645,10 @@ cdef class BusinessMonthBegin(MonthOffset):
26502645
>>> ts + pd.offsets.BMonthBegin()
26512646
Timestamp('2023-01-02 00:00:00')
26522647
2653-
If you want to get the start of the current business month pass
2654-
the parameter n equals 0:
2648+
If you want to get the start of the current business month:
26552649
26562650
>>> ts = pd.Timestamp(2022, 12, 1)
2657-
>>> ts + pd.offsets.BMonthBegin(0)
2651+
>>> pd.offsets.BMonthBegin().rollback(ts)
26582652
Timestamp('2022-12-01 00:00:00')
26592653
"""
26602654
_prefix = "BMS"
@@ -2821,10 +2815,10 @@ cdef class SemiMonthEnd(SemiMonthOffset):
28212815
>>> ts + pd.offsets.SemiMonthEnd()
28222816
Timestamp('2022-02-15 00:00:00')
28232817
2824-
If you want to get the result for the current month pass the parameter n equals 0:
2818+
If you want to get the result for the current month:
28252819
28262820
>>> ts = pd.Timestamp(2022, 1, 15)
2827-
>>> ts + pd.offsets.SemiMonthEnd(0)
2821+
>>> pd.offsets.SemiMonthEnd().rollforward(ts)
28282822
Timestamp('2022-01-15 00:00:00')
28292823
"""
28302824
_prefix = "SM"

pandas/_testing/__init__.py

+17
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,23 @@
177177
np.uint32,
178178
]
179179

180+
PYTHON_DATA_TYPES = [
181+
str,
182+
int,
183+
float,
184+
complex,
185+
list,
186+
tuple,
187+
range,
188+
dict,
189+
set,
190+
frozenset,
191+
bool,
192+
bytes,
193+
bytearray,
194+
memoryview,
195+
]
196+
180197
ENDIAN = {"little": "<", "big": ">"}[byteorder]
181198

182199
NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA, Decimal("NaN")]

0 commit comments

Comments
 (0)