
Commit f970e3f

Merge branch 'main' into ref-dedup

2 parents: feb30a4 + 50ac190


56 files changed (+533, -277 lines)

.github/workflows/unit-tests.yml (+1, -1)

@@ -380,7 +380,7 @@ jobs:
           fetch-depth: 0

       - name: Set up Python Free-threading Version
-        uses: deadsnakes/action@v3.1.0
+        uses: deadsnakes/action@v3.2.0
         with:
           python-version: 3.13-dev
           nogil: true

ci/code_checks.sh (-15)

@@ -75,9 +75,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Period.ordinal GL08" \
         -i "pandas.PeriodDtype.freq SA01" \
         -i "pandas.RangeIndex.from_range PR01,SA01" \
-        -i "pandas.RangeIndex.start SA01" \
         -i "pandas.RangeIndex.step SA01" \
-        -i "pandas.RangeIndex.stop SA01" \
         -i "pandas.Series.cat.add_categories PR01,PR02" \
         -i "pandas.Series.cat.as_ordered PR01" \
         -i "pandas.Series.cat.as_unordered PR01" \
@@ -92,10 +90,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.dt.floor PR01,PR02" \
         -i "pandas.Series.dt.freq GL08" \
         -i "pandas.Series.dt.month_name PR01,PR02" \
-        -i "pandas.Series.dt.nanoseconds SA01" \
         -i "pandas.Series.dt.normalize PR01" \
         -i "pandas.Series.dt.round PR01,PR02" \
-        -i "pandas.Series.dt.seconds SA01" \
         -i "pandas.Series.dt.strftime PR01,PR02" \
         -i "pandas.Series.dt.to_period PR01,PR02" \
         -i "pandas.Series.dt.total_seconds PR01" \
@@ -113,8 +109,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timedelta.resolution PR02" \
         -i "pandas.Timedelta.to_timedelta64 SA01" \
         -i "pandas.Timedelta.total_seconds SA01" \
-        -i "pandas.TimedeltaIndex.nanoseconds SA01" \
-        -i "pandas.TimedeltaIndex.seconds SA01" \
         -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \
         -i "pandas.Timestamp.max PR02" \
         -i "pandas.Timestamp.min PR02" \
@@ -123,13 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.tzinfo GL08" \
         -i "pandas.Timestamp.year GL08" \
         -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \
-        -i "pandas.api.types.is_bool PR01,SA01" \
-        -i "pandas.api.types.is_categorical_dtype SA01" \
-        -i "pandas.api.types.is_complex PR01,SA01" \
-        -i "pandas.api.types.is_complex_dtype SA01" \
-        -i "pandas.api.types.is_datetime64_dtype SA01" \
-        -i "pandas.api.types.is_datetime64_ns_dtype SA01" \
-        -i "pandas.api.types.is_datetime64tz_dtype SA01" \
         -i "pandas.api.types.is_dict_like PR07,SA01" \
         -i "pandas.api.types.is_extension_array_dtype SA01" \
         -i "pandas.api.types.is_file_like PR07,SA01" \
@@ -163,7 +150,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \
         -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \
         -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.filter SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \
@@ -179,7 +165,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \
         -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \
-        -i "pandas.core.groupby.SeriesGroupBy.filter PR01,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \

doc/source/whatsnew/v2.3.0.rst (+2, -1)

@@ -103,8 +103,9 @@ Conversion
 Strings
 ^^^^^^^
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
+- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
 - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
--
+-

 Interval
 ^^^^^^^^
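
Note: a minimal illustration of the negative-step fix referenced in the entry above, assuming pyarrow is installed and pandas is built from this branch; the values shown are the expected results, not captured output.

import pandas as pd

ser = pd.Series(["abcd", "efgh"], dtype="string[pyarrow]")
# With the fix, a negative step follows Python slicing semantics,
# e.g. reversing each element:
print(ser.str.slice(step=-1))
# expected values: "dcba", "hgfe"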

doc/source/whatsnew/v3.0.0.rst (+1)

@@ -55,6 +55,7 @@ Other enhancements
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
+- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
 - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
 - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
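
Note: a small usage sketch of the new dtype parameter on str.get_dummies, assuming pandas built from this branch; the column dtypes are the expected outcome.

import numpy as np
import pandas as pd

ser = pd.Series(["a|b", "a", "b|c"])
# Without dtype the indicator columns keep the historical integer default;
# passing dtype controls the dtype of the resulting DataFrame.
dummies = ser.str.get_dummies(sep="|", dtype=np.bool_)
print(dummies.dtypes)  # expected: bool for columns "a", "b", "c"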

pandas/_libs/lib.pyx (+35, -6)

@@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array(
     convert_na_value : bool, default True
         If False, existing na values will be used unchanged in the new array.
     copy : bool, default True
-        Whether to ensure that a new array is returned.
+        Whether to ensure that a new array is returned. When True, a new array
+        is always returned. When False, a new array is only returned when needed
+        to avoid mutating the input array.
     skipna : bool, default True
         Whether or not to coerce nulls to their stringified form
         (e.g. if False, NaN becomes 'nan').
@@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array(

     result = np.asarray(arr, dtype="object")

-    if copy and (result is arr or np.shares_memory(arr, result)):
-        # GH#54654
-        result = result.copy()
-    elif not copy and result is arr:
-        already_copied = False
+    if result is arr or np.may_share_memory(arr, result):
+        # if np.asarray(..) did not make a copy of the input arr, we still need
+        # to do that to avoid mutating the input array
+        # GH#54654: share_memory check is needed for rare cases where np.asarray
+        # returns a new object without making a copy of the actual data
+        if copy:
+            result = result.copy()
+        else:
+            already_copied = False
     elif not copy and not result.flags.writeable:
         # Weird edge case where result is a view
         already_copied = False
@@ -1123,10 +1129,21 @@ def is_bool(obj: object) -> bool:
     """
     Return True if given object is boolean.

+    Parameters
+    ----------
+    obj : object
+        Object to check.
+
     Returns
     -------
     bool

+    See Also
+    --------
+    api.types.is_scalar : Check if the input is a scalar.
+    api.types.is_integer : Check if the input is an integer.
+    api.types.is_float : Check if the input is a float.
+
     Examples
     --------
     >>> pd.api.types.is_bool(True)
@@ -1142,10 +1159,22 @@ def is_complex(obj: object) -> bool:
     """
     Return True if given object is complex.

+    Parameters
+    ----------
+    obj : object
+        Object to check.
+
     Returns
     -------
     bool

+    See Also
+    --------
+    api.types.is_complex_dtype: Check whether the provided array or
+        dtype is of a complex dtype.
+    api.types.is_number: Check if the object is a number.
+    api.types.is_integer: Return True if given object is integer.
+
     Examples
     --------
     >>> pd.api.types.is_complex(1 + 1j)
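
Note: the reworked copy handling above amounts to "if np.asarray returned the input or a view of it, copy only when asked, otherwise remember that no copy was made". A plain-NumPy sketch of that decision, not the real Cython implementation:

import numpy as np

def copy_decision(arr, copy=True):
    # Mirrors the branch added to ensure_string_array: np.asarray may hand
    # back the input itself or a buffer-sharing view, and in either case the
    # input must not be mutated.
    already_copied = True
    result = np.asarray(arr, dtype="object")
    if result is arr or np.may_share_memory(arr, result):
        if copy:
            result = result.copy()      # force an independent buffer
        else:
            already_copied = False      # caller must copy before mutating
    elif not copy and not result.flags.writeable:
        already_copied = False
    return result, already_copied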

pandas/conftest.py (+28)

@@ -1272,6 +1272,34 @@ def string_dtype(request):
     return request.param


+@pytest.fixture(
+    params=[
+        ("python", pd.NA),
+        pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
+        pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
+        ("python", np.nan),
+    ],
+    ids=[
+        "string=string[python]",
+        "string=string[pyarrow]",
+        "string=str[pyarrow]",
+        "string=str[python]",
+    ],
+)
+def string_dtype_no_object(request):
+    """
+    Parametrized fixture for string dtypes.
+    * 'string[python]' (NA variant)
+    * 'string[pyarrow]' (NA variant)
+    * 'str' (NaN variant, with pyarrow)
+    * 'str' (NaN variant, without pyarrow)
+    """
+    # need to instantiate the StringDtype here instead of in the params
+    # to avoid importing pyarrow during test collection
+    storage, na_value = request.param
+    return pd.StringDtype(storage, na_value)
+
+
 @pytest.fixture(
     params=[
         "string[python]",

pandas/core/arrays/_arrow_string_mixins.py (+23, -9)

@@ -11,6 +11,7 @@

 from pandas.compat import (
     pa_version_under10p1,
+    pa_version_under11p0,
     pa_version_under13p0,
     pa_version_under17p0,
 )
@@ -22,10 +23,7 @@
 import pyarrow.compute as pc

 if TYPE_CHECKING:
-    from collections.abc import (
-        Callable,
-        Sized,
-    )
+    from collections.abc import Callable

     from pandas._typing import (
         Scalar,
@@ -34,7 +32,7 @@


 class ArrowStringArrayMixin:
-    _pa_array: Sized
+    _pa_array: pa.ChunkedArray

     def __init__(self, *args, **kwargs) -> None:
         raise NotImplementedError
@@ -127,13 +125,29 @@ def _str_get(self, i: int) -> Self:
         selected = pc.utf8_slice_codeunits(
             self._pa_array, start=start, stop=stop, step=step
         )
-        null_value = pa.scalar(
-            None,
-            type=self._pa_array.type,  # type: ignore[attr-defined]
-        )
+        null_value = pa.scalar(None, type=self._pa_array.type)
         result = pc.if_else(not_out_of_bounds, selected, null_value)
         return type(self)(result)

+    def _str_slice(
+        self, start: int | None = None, stop: int | None = None, step: int | None = None
+    ) -> Self:
+        if pa_version_under11p0:
+            # GH#59724
+            result = self._apply_elementwise(lambda val: val[start:stop:step])
+            return type(self)(pa.chunked_array(result, type=self._pa_array.type))
+        if start is None:
+            if step is not None and step < 0:
+                # GH#59710
+                start = -1
+            else:
+                start = 0
+        if step is None:
+            step = 1
+        return type(self)(
+            pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
+        )
+
     def _str_slice_replace(
         self, start: int | None = None, stop: int | None = None, repl: str | None = None
     ) -> Self:
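
Note: the start = -1 default above exists because pc.utf8_slice_codeunits needs an explicit start, and for a negative step the Python-equivalent default is the last code unit (GH#59710). A direct pyarrow sketch, assuming pyarrow >= 11; the printed list is the expected result, not captured output:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array([["abcd", "efgh"]])
# step=-1 with start=-1 walks backwards from the last character,
# matching Python's s[::-1].
print(pc.utf8_slice_codeunits(arr, start=-1, stop=None, step=-1).to_pylist())
# expected: ['dcba', 'hgfe']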

pandas/core/arrays/arrow/array.py (+13, -13)

@@ -41,6 +41,7 @@
     is_list_like,
     is_numeric_dtype,
     is_scalar,
+    pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas.core.dtypes.missing import isna
@@ -2376,17 +2377,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self:
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))

-    def _str_slice(
-        self, start: int | None = None, stop: int | None = None, step: int | None = None
-    ) -> Self:
-        if start is None:
-            start = 0
-        if step is None:
-            step = 1
-        return type(self)(
-            pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
-        )
-
     def _str_removeprefix(self, prefix: str):
         if not pa_version_under13p0:
             starts_with = pc.starts_with(self._pa_array, pattern=prefix)
@@ -2428,7 +2418,9 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self:
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))

-    def _str_get_dummies(self, sep: str = "|"):
+    def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
+        if dtype is None:
+            dtype = np.bool_
         split = pc.split_pattern(self._pa_array, sep)
         flattened_values = pc.list_flatten(split)
         uniques = flattened_values.unique()
@@ -2438,7 +2430,15 @@ def _str_get_dummies(self, sep: str = "|"):
         n_cols = len(uniques)
         indices = pc.index_in(flattened_values, uniques_sorted).to_numpy()
         indices = indices + np.arange(n_rows).repeat(lengths) * n_cols
-        dummies = np.zeros(n_rows * n_cols, dtype=np.bool_)
+        _dtype = pandas_dtype(dtype)
+        dummies_dtype: NpDtype
+        if isinstance(_dtype, np.dtype):
+            dummies_dtype = _dtype
+        else:
+            dummies_dtype = np.bool_
+        dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype)
+        if dtype == str:
+            dummies[:] = False
         dummies[indices] = True
         dummies = dummies.reshape((n_rows, n_cols))
         result = type(self)(pa.array(list(dummies)))
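
Note: the new dtype handling resolves the requested dtype with pandas_dtype and only uses it for the indicator buffer when it is a plain NumPy dtype; extension dtypes fall back to np.bool_ and are converted further up the call stack. A minimal sketch of that resolution step:

import numpy as np
from pandas.api.types import pandas_dtype

def resolve_dummies_dtype(dtype):
    # NumPy dtypes are used directly for the np.zeros buffer; extension
    # dtypes (e.g. "boolean") fall back to np.bool_.
    _dtype = pandas_dtype(dtype)
    return _dtype if isinstance(_dtype, np.dtype) else np.bool_

resolve_dummies_dtype("int64")    # -> dtype('int64')
resolve_dummies_dtype("boolean")  # -> np.bool_ fallback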

pandas/core/arrays/categorical.py (+2, -2)

@@ -2681,11 +2681,11 @@ def _str_map(
         result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
         return take_nd(result, codes, fill_value=na_value)

-    def _str_get_dummies(self, sep: str = "|"):
+    def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         # sep may not be in categories. Just bail on this.
         from pandas.core.arrays import NumpyExtensionArray

-        return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)
+        return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep, dtype)

     # ------------------------------------------------------------------------
     # GroupBy Methods

pandas/core/arrays/string_arrow.py (+16, -11)

@@ -55,6 +55,7 @@
 from pandas._typing import (
     ArrayLike,
     Dtype,
+    NpDtype,
     Self,
     npt,
 )
@@ -305,6 +306,7 @@ def astype(self, dtype, copy: bool = True):
     _str_swapcase = ArrowStringArrayMixin._str_swapcase
     _str_slice_replace = ArrowStringArrayMixin._str_slice_replace
     _str_len = ArrowStringArrayMixin._str_len
+    _str_slice = ArrowStringArrayMixin._str_slice

     def _str_contains(
         self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
@@ -351,13 +353,6 @@ def _str_repeat(self, repeats: int | Sequence[int]):
         else:
             return ArrowExtensionArray._str_repeat(self, repeats=repeats)

-    def _str_slice(
-        self, start: int | None = None, stop: int | None = None, step: int | None = None
-    ) -> Self:
-        if stop is None:
-            return super()._str_slice(start, stop, step)
-        return ArrowExtensionArray._str_slice(self, start=start, stop=stop, step=step)
-
     def _str_removeprefix(self, prefix: str):
         if not pa_version_under13p0:
             return ArrowStringArrayMixin._str_removeprefix(self, prefix)
@@ -379,12 +374,22 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
             return super()._str_find(sub, start, end)
         return ArrowStringArrayMixin._str_find(self, sub, start, end)

-    def _str_get_dummies(self, sep: str = "|"):
-        dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)
+    def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
+        if dtype is None:
+            dtype = np.int64
+        dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(
+            sep, dtype
+        )
         if len(labels) == 0:
-            return np.empty(shape=(0, 0), dtype=np.int64), labels
+            return np.empty(shape=(0, 0), dtype=dtype), labels
         dummies = np.vstack(dummies_pa.to_numpy())
-        return dummies.astype(np.int64, copy=False), labels
+        _dtype = pandas_dtype(dtype)
+        dummies_dtype: NpDtype
+        if isinstance(_dtype, np.dtype):
+            dummies_dtype = _dtype
+        else:
+            dummies_dtype = np.bool_
+        return dummies.astype(dummies_dtype, copy=False), labels

     def _convert_int_result(self, result):
         if self.dtype.na_value is np.nan:
