Skip to content

Commit ccabdb3

Browse files
committed
Merge remote-tracking branch 'upstream/main' into string_dtype_tests
# Conflicts: # pandas/core/arrays/string_arrow.py # pandas/core/construction.py # pandas/tests/frame/indexing/test_indexing.py # pandas/tests/frame/methods/test_rank.py # pandas/tests/frame/test_constructors.py
2 parents 1e7b93e + d943c26 commit ccabdb3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+447
-376
lines changed

ci/code_checks.sh

-10
Original file line numberDiff line numberDiff line change
@@ -63,16 +63,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
6363

6464
MSG='Partially validate docstrings (EX03)' ; echo $MSG
6565
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \
66-
pandas.Series.loc \
67-
pandas.Series.iloc \
68-
pandas.Series.pop \
69-
pandas.Series.describe \
70-
pandas.Series.skew \
71-
pandas.Series.var \
72-
pandas.Series.last \
73-
pandas.Series.tz_convert \
74-
pandas.Series.tz_localize \
75-
pandas.Series.dt.month_name \
7666
pandas.Series.dt.day_name \
7767
pandas.Series.str.len \
7868
pandas.Series.cat.set_categories \

doc/source/user_guide/enhancingperf.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ can be improved by passing an ``np.ndarray``.
184184
...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b,
185185
...: np.ndarray col_N):
186186
...: assert (col_a.dtype == np.float64
187-
...: and col_b.dtype == np.float64 and col_N.dtype == np.int_)
187+
...: and col_b.dtype == np.float64 and col_N.dtype == np.dtype(int))
188188
...: cdef Py_ssize_t i, n = len(col_N)
189189
...: assert (len(col_a) == len(col_b) == n)
190190
...: cdef np.ndarray[double] res = np.empty(n)

doc/source/whatsnew/v2.1.2.rst

+3
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,14 @@ Fixed regressions
2323
Bug fixes
2424
~~~~~~~~~
2525
- Fixed bug in :meth:`Categorical.equals` if ``other`` has an Arrow-backed string dtype (:issue:`55364`)
26+
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
2627
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
2728
- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`)
2829
- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
2930
- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
3031
- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
32+
- Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`)
33+
- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`)
3134
- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
3235
-
3336

doc/source/whatsnew/v2.2.0.rst

+33-4
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,12 @@ enhancement2
7474
Other enhancements
7575
^^^^^^^^^^^^^^^^^^
7676

77+
- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
7778
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
7879
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
7980
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
8081
- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
8182
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
82-
-
8383

8484
.. ---------------------------------------------------------------------------
8585
.. _whatsnew_220.notable_bug_fixes:
@@ -133,10 +133,36 @@ and ``sort=False``:
133133
134134
result
135135
136-
.. _whatsnew_220.notable_bug_fixes.notable_bug_fix2:
136+
.. _whatsnew_220.notable_bug_fixes.multiindex_join_different_levels:
137+
138+
:func:`merge` and :meth:`DataFrame.join` no longer reorder levels when levels differ
139+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
140+
141+
In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` would reorder
142+
index levels when joining on two indexes with different levels (:issue:`34133`).
143+
144+
.. ipython:: python
145+
146+
left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"]))
147+
right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"]))
148+
result = left.join(right)
137149
138-
notable_bug_fix2
139-
^^^^^^^^^^^^^^^^
150+
*Old Behavior*
151+
152+
.. code-block:: ipython
153+
154+
In [5]: result
155+
Out[5]:
156+
left right
157+
B A C
158+
1 x 1 1 2
159+
2 x 2 1 2
160+
161+
*New Behavior*
162+
163+
.. ipython:: python
164+
165+
result
140166
141167
.. ---------------------------------------------------------------------------
142168
.. _whatsnew_220.api_breaking:
@@ -253,7 +279,9 @@ Bug fixes
253279
~~~~~~~~~
254280
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
255281
- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
282+
- Bug in :func:`pandas.api.types.is_string_dtype` when checking whether an object array with no elements is of the string dtype (:issue:`54661`)
256283
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
284+
- Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)
257285
- Bug in :meth:`pandas.read_excel` with an ODS file without a cached formatted cell for float values (:issue:`55219`)
258286

259287
Categorical
@@ -341,6 +369,7 @@ Reshaping
341369
^^^^^^^^^
342370
- Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`)
343371
- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
372+
-
344373

345374
Sparse
346375
^^^^^^

pandas/_typing.py

+9
Original file line numberDiff line numberDiff line change
@@ -509,3 +509,12 @@ def closed(self) -> bool:
509509

510510
# Offsets
511511
OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"]
512+
513+
# read_csv: usecols
514+
UsecolsArgType = Union[
515+
SequenceNotStr[Hashable],
516+
range,
517+
AnyArrayLike,
518+
Callable[[HashableT], bool],
519+
None,
520+
]

pandas/compat/numpy/__init__.py

+23
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
""" support numpy compatibility across versions """
2+
import warnings
3+
24
import numpy as np
35

46
from pandas.util.version import Version
@@ -21,6 +23,27 @@
2123
)
2224

2325

26+
np_long: type
27+
np_ulong: type
28+
29+
if _nlv >= Version("2.0.0.dev0"):
30+
try:
31+
with warnings.catch_warnings():
32+
warnings.filterwarnings(
33+
"ignore",
34+
r".*In the future `np\.long` will be defined as.*",
35+
FutureWarning,
36+
)
37+
np_long = np.long # type: ignore[attr-defined]
38+
np_ulong = np.ulong # type: ignore[attr-defined]
39+
except AttributeError:
40+
np_long = np.int_
41+
np_ulong = np.uint
42+
else:
43+
np_long = np.int_
44+
np_ulong = np.uint
45+
46+
2447
__all__ = [
2548
"np",
2649
"_np_version",

pandas/core/algorithms.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1636,7 +1636,7 @@ def safe_sort(
16361636
else:
16371637
mask = None
16381638
else:
1639-
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
1639+
reverse_indexer = np.empty(len(sorter), dtype=int)
16401640
reverse_indexer.put(sorter, np.arange(len(sorter)))
16411641
# Out of bound indices will be masked with `-1` next, so we
16421642
# may deal with them here without performance loss using `mode='wrap'`

pandas/core/arrays/categorical.py

+44-39
Original file line numberDiff line numberDiff line change
@@ -1819,23 +1819,27 @@ def _empty( # type: ignore[override]
18191819

18201820
return arr._from_backing_data(backing)
18211821

1822-
def _internal_get_values(self):
1822+
def _internal_get_values(self) -> ArrayLike:
18231823
"""
18241824
Return the values.
18251825
18261826
For internal compatibility with pandas formatting.
18271827
18281828
Returns
18291829
-------
1830-
np.ndarray or Index
1831-
A numpy array of the same dtype as categorical.categories.dtype or
1832-
Index if datetime / periods.
1830+
np.ndarray or ExtensionArray
1831+
A numpy array or ExtensionArray of the same dtype as
1832+
categorical.categories.dtype.
18331833
"""
18341834
# if we are a datetime and period index, return Index to keep metadata
18351835
if needs_i8_conversion(self.categories.dtype):
1836-
return self.categories.take(self._codes, fill_value=NaT)
1836+
return self.categories.take(self._codes, fill_value=NaT)._values
18371837
elif is_integer_dtype(self.categories.dtype) and -1 in self._codes:
1838-
return self.categories.astype("object").take(self._codes, fill_value=np.nan)
1838+
return (
1839+
self.categories.astype("object")
1840+
.take(self._codes, fill_value=np.nan)
1841+
._values
1842+
)
18391843
return np.array(self)
18401844

18411845
def check_for_ordered(self, op) -> None:
@@ -2147,21 +2151,6 @@ def _formatter(self, boxed: bool = False):
21472151
# Defer to CategoricalFormatter's formatter.
21482152
return None
21492153

2150-
def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str:
2151-
"""
2152-
a short repr displaying only max_vals and an optional (but default
2153-
footer)
2154-
"""
2155-
num = max_vals // 2
2156-
head = self[:num]._get_repr(length=False, footer=False)
2157-
tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False)
2158-
2159-
result = f"{head[:-1]}, ..., {tail[1:]}"
2160-
if footer:
2161-
result = f"{result}\n{self._repr_footer()}"
2162-
2163-
return str(result)
2164-
21652154
def _repr_categories(self) -> list[str]:
21662155
"""
21672156
return the base repr for the categories
@@ -2217,33 +2206,49 @@ def _repr_categories_info(self) -> str:
22172206
# replace to simple save space by
22182207
return f"{levheader}[{levstring.replace(' < ... < ', ' ... ')}]"
22192208

2220-
def _repr_footer(self) -> str:
2221-
info = self._repr_categories_info()
2222-
return f"Length: {len(self)}\n{info}"
2223-
2224-
def _get_repr(
2225-
self, length: bool = True, na_rep: str = "NaN", footer: bool = True
2226-
) -> str:
2209+
def _get_values_repr(self) -> str:
22272210
from pandas.io.formats import format as fmt
22282211

2229-
formatter = fmt.CategoricalFormatter(
2230-
self, length=length, na_rep=na_rep, footer=footer
2212+
assert len(self) > 0
2213+
2214+
vals = self._internal_get_values()
2215+
fmt_values = fmt.format_array(
2216+
vals,
2217+
None,
2218+
float_format=None,
2219+
na_rep="NaN",
2220+
quoting=QUOTE_NONNUMERIC,
22312221
)
2232-
result = formatter.to_string()
2233-
return str(result)
2222+
2223+
fmt_values = [i.strip() for i in fmt_values]
2224+
joined = ", ".join(fmt_values)
2225+
result = "[" + joined + "]"
2226+
return result
22342227

22352228
def __repr__(self) -> str:
22362229
"""
22372230
String representation.
22382231
"""
2239-
_maxlen = 10
2240-
if len(self._codes) > _maxlen:
2241-
result = self._tidy_repr(_maxlen)
2242-
elif len(self._codes) > 0:
2243-
result = self._get_repr(length=len(self) > _maxlen)
2232+
footer = self._repr_categories_info()
2233+
length = len(self)
2234+
max_len = 10
2235+
if length > max_len:
2236+
# In long cases we do not display all entries, so we add Length
2237+
# information to the __repr__.
2238+
num = max_len // 2
2239+
head = self[:num]._get_values_repr()
2240+
tail = self[-(max_len - num) :]._get_values_repr()
2241+
body = f"{head[:-1]}, ..., {tail[1:]}"
2242+
length_info = f"Length: {len(self)}"
2243+
result = f"{body}\n{length_info}\n{footer}"
2244+
elif length > 0:
2245+
body = self._get_values_repr()
2246+
result = f"{body}\n{footer}"
22442247
else:
2245-
msg = self._get_repr(length=False, footer=True).replace("\n", ", ")
2246-
result = f"[], {msg}"
2248+
# In the empty case we use a comma instead of newline to get
2249+
# a more compact __repr__
2250+
body = "[]"
2251+
result = f"{body}, {footer}"
22472252

22482253
return result
22492254

pandas/core/arrays/datetimes.py

+26-5
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,14 @@
2828
get_resolution,
2929
get_supported_reso,
3030
get_unit_from_dtype,
31+
iNaT,
3132
ints_to_pydatetime,
3233
is_date_array_normalized,
3334
is_supported_unit,
3435
is_unitless,
3536
normalize_i8_timestamps,
3637
npy_unit_to_abbrev,
38+
periods_per_day,
3739
timezones,
3840
to_offset,
3941
tz_convert_from_utc,
@@ -735,14 +737,33 @@ def astype(self, dtype, copy: bool = True):
735737
def _format_native_types(
736738
self, *, na_rep: str | float = "NaT", date_format=None, **kwargs
737739
) -> npt.NDArray[np.object_]:
738-
from pandas.io.formats.format import get_format_datetime64_from_values
739-
740-
fmt = get_format_datetime64_from_values(self, date_format)
740+
if date_format is None and self._is_dates_only:
741+
# Only dates and no timezone: provide a default format
742+
date_format = "%Y-%m-%d"
741743

742744
return tslib.format_array_from_datetime(
743-
self.asi8, tz=self.tz, format=fmt, na_rep=na_rep, reso=self._creso
745+
self.asi8, tz=self.tz, format=date_format, na_rep=na_rep, reso=self._creso
744746
)
745747

748+
@property
749+
def _is_dates_only(self) -> bool:
750+
"""
751+
Check if we are round times at midnight (and no timezone), which will
752+
be given a more compact __repr__ than other cases.
753+
"""
754+
if self.tz is not None:
755+
return False
756+
757+
values_int = self.asi8
758+
consider_values = values_int != iNaT
759+
dtype = cast(np.dtype, self.dtype) # since we checked tz above
760+
reso = get_unit_from_dtype(dtype)
761+
ppd = periods_per_day(reso)
762+
763+
# TODO: can we reuse is_date_array_normalized? would need a skipna kwd
764+
even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0
765+
return even_days
766+
746767
# -----------------------------------------------------------------
747768
# Comparison Methods
748769

@@ -1276,7 +1297,7 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]:
12761297
>>> idx
12771298
DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'],
12781299
dtype='datetime64[ns]', freq='ME')
1279-
>>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP
1300+
>>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP
12801301
Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object')
12811302
"""
12821303
values = self._local_timestamps()

pandas/core/arrays/string_arrow.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,10 @@ def _str_map(
618618
return lib.map_infer_mask(arr, f, mask.view("uint8"))
619619

620620
def _convert_int_dtype(self, result):
621-
result = result.to_numpy()
621+
if isinstance(result, pa.Array):
622+
result = result.to_numpy(zero_copy_only=False)
623+
else:
624+
result = result.to_numpy()
622625
if result.dtype == np.int32:
623626
result = result.astype(np.int64)
624627
return result
@@ -639,9 +642,11 @@ def _reduce(
639642
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
640643
):
641644
if name in ["any", "all"]:
642-
arr = pc.and_kleene(
643-
pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "")
644-
)
645+
if not skipna and name == "all":
646+
nas = pc.invert(pc.is_null(self._pa_array))
647+
arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, ""))
648+
else:
649+
arr = pc.not_equal(self._pa_array, "")
645650
return ArrowExtensionArray(arr)._reduce(
646651
name, skipna=skipna, keepdims=keepdims, **kwargs
647652
)

pandas/core/arrays/timedeltas.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ def _format_native_types(
471471
from pandas.io.formats.format import get_format_timedelta64
472472

473473
# Relies on TimeDelta._repr_base
474-
formatter = get_format_timedelta64(self._ndarray, na_rep)
474+
formatter = get_format_timedelta64(self, na_rep)
475475
# equiv: np.array([formatter(x) for x in self._ndarray])
476476
# but independent of dimension
477477
return np.frompyfunc(formatter, 1, 1)(self._ndarray)

0 commit comments

Comments
 (0)