Skip to content

Commit 16a3da2

Browse files
committed
Merge remote-tracking branch 'upstream/main' into enh/parquet/arrow_fs
2 parents 6503a7e + 2a3420a commit 16a3da2

File tree

23 files changed

+120
-122
lines changed

23 files changed

+120
-122
lines changed

ci/code_checks.sh

-2
Original file line numberDiff line numberDiff line change
@@ -564,8 +564,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
564564
pandas.api.types.is_datetime64_any_dtype \
565565
pandas.api.types.is_datetime64_ns_dtype \
566566
pandas.api.types.is_datetime64tz_dtype \
567-
pandas.api.types.is_integer_dtype \
568-
pandas.api.types.is_string_dtype \
569567
pandas.plotting.andrews_curves \
570568
pandas.plotting.autocorrelation_plot \
571569
pandas.plotting.lag_plot \

doc/source/user_guide/categorical.rst

-8
Original file line numberDiff line numberDiff line change
@@ -263,14 +263,6 @@ All instances of ``CategoricalDtype`` compare equal to the string ``'category'``
263263
264264
c1 == "category"
265265
266-
.. warning::
267-
268-
Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``,
269-
and since all instances of ``CategoricalDtype`` compare equal to ``'category'``,
270-
all instances of ``CategoricalDtype`` compare equal to a
271-
``CategoricalDtype(None, False)``, regardless of ``categories`` or
272-
``ordered``.
273-
274266
Description
275267
-----------
276268

doc/source/whatsnew/v2.0.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -1214,6 +1214,7 @@ Datetimelike
12141214
- Bug in :func:`DataFrame.from_records` when given a :class:`DataFrame` input with timezone-aware datetime64 columns incorrectly dropping the timezone-awareness (:issue:`51162`)
12151215
- Bug in :func:`to_datetime` was raising ``decimal.InvalidOperation`` when parsing date strings with ``errors='coerce'`` (:issue:`51084`)
12161216
- Bug in :func:`to_datetime` with both ``unit`` and ``origin`` specified returning incorrect results (:issue:`42624`)
1217+
- Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` when converting an object-dtype object containing timezone-aware datetimes or strings to ``datetime64[ns]`` incorrectly localizing as UTC instead of raising ``TypeError`` (:issue:`50140`)
12171218
- Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` with datetime or timedelta dtypes giving incorrect results for groups containing ``NaT`` (:issue:`51373`)
12181219
- Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` incorrectly raising with :class:`PeriodDtype` or :class:`DatetimeTZDtype` (:issue:`51373`)
12191220

@@ -1276,6 +1277,7 @@ Indexing
12761277
- Bug in :meth:`DataFrame.sort_values` where ``None`` was not returned when ``by`` is empty list and ``inplace=True`` (:issue:`50643`)
12771278
- Bug in :meth:`DataFrame.loc` coercing dtypes when setting values with a list indexer (:issue:`49159`)
12781279
- Bug in :meth:`Series.loc` raising error for out of bounds end of slice indexer (:issue:`50161`)
1280+
- Bug in :meth:`DataFrame.loc` raising ``ValueError`` with all ``False`` ``bool`` indexer and empty object (:issue:`51450`)
12791281
- Bug in :meth:`DataFrame.loc` raising ``ValueError`` with ``bool`` indexer and :class:`MultiIndex` (:issue:`47687`)
12801282
- Bug in :meth:`DataFrame.loc` raising ``IndexError`` when setting values for a pyarrow-backed column with a non-scalar indexer (:issue:`50085`)
12811283
- Bug in :meth:`DataFrame.__getitem__`, :meth:`Series.__getitem__`, :meth:`DataFrame.__setitem__` and :meth:`Series.__setitem__`
@@ -1377,6 +1379,7 @@ Groupby/resample/rolling
13771379
- Bug in :meth:`.DataFrameGroupBy.agg` with ``engine="numba"`` failing to respect ``as_index=False`` (:issue:`51228`)
13781380
- Bug in :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, and :meth:`.Resampler.agg` would ignore arguments when passed a list of functions (:issue:`50863`)
13791381
- Bug in :meth:`.DataFrameGroupBy.ohlc` ignoring ``as_index=False`` (:issue:`51413`)
1382+
- Bug in :meth:`.DataFrameGroupBy.agg` after subsetting columns (e.g. ``.groupby(...)[["a", "b"]]``) would not include groupings in the result (:issue:`51186`)
13801383

13811384
Reshaping
13821385
^^^^^^^^^

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ Performance improvements
102102
~~~~~~~~~~~~~~~~~~~~~~~~
103103
- Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`)
104104
- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
105+
- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
105106
-
106107

107108
.. ---------------------------------------------------------------------------

pandas/_libs/tslib.pyx

-4
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ from pandas._libs.tslibs.conversion cimport (
5656
convert_timezone,
5757
get_datetime64_nanos,
5858
parse_pydatetime,
59-
precision_from_unit,
6059
)
6160
from pandas._libs.tslibs.nattype cimport (
6261
NPY_NAT,
@@ -258,7 +257,6 @@ def array_with_unit_to_datetime(
258257
"""
259258
cdef:
260259
Py_ssize_t i, n=len(values)
261-
int64_t mult
262260
bint is_ignore = errors == "ignore"
263261
bint is_coerce = errors == "coerce"
264262
bint is_raise = errors == "raise"
@@ -275,8 +273,6 @@ def array_with_unit_to_datetime(
275273
)
276274
return result, tz
277275

278-
mult, _ = precision_from_unit(unit)
279-
280276
result = np.empty(n, dtype="M8[ns]")
281277
iresult = result.view("i8")
282278

pandas/_libs/tslibs/conversion.pyx

+14-30
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ from pandas._libs.tslibs.np_datetime cimport (
3737
NPY_FR_us,
3838
check_dts_bounds,
3939
convert_reso,
40+
get_conversion_factor,
4041
get_datetime64_unit,
4142
get_datetime64_value,
4243
get_implementation_bounds,
@@ -83,9 +84,9 @@ TD64NS_DTYPE = np.dtype("m8[ns]")
8384
# Unit Conversion Helpers
8485

8586
cdef int64_t cast_from_unit(
86-
object ts,
87-
str unit,
88-
NPY_DATETIMEUNIT out_reso=NPY_FR_ns
87+
object ts,
88+
str unit,
89+
NPY_DATETIMEUNIT out_reso=NPY_FR_ns
8990
) except? -1:
9091
"""
9192
Return a casting of the unit represented to nanoseconds
@@ -104,12 +105,6 @@ cdef int64_t cast_from_unit(
104105
int64_t m
105106
int p
106107

107-
m, p = precision_from_unit(unit, out_reso)
108-
109-
# just give me the unit back
110-
if ts is None:
111-
return m
112-
113108
if unit in ["Y", "M"]:
114109
if is_float_object(ts) and not ts.is_integer():
115110
# GH#47267 it is clear that 2 "M" corresponds to 1970-02-01,
@@ -126,6 +121,8 @@ cdef int64_t cast_from_unit(
126121
dt64obj = np.datetime64(ts, unit)
127122
return get_datetime64_nanos(dt64obj, out_reso)
128123

124+
m, p = precision_from_unit(unit, out_reso)
125+
129126
# cast the unit, multiply base/frac separately
130127
# to avoid precision issues from float -> int
131128
try:
@@ -148,8 +145,8 @@ cdef int64_t cast_from_unit(
148145

149146

150147
cpdef inline (int64_t, int) precision_from_unit(
151-
str unit,
152-
NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
148+
str unit,
149+
NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
153150
):
154151
"""
155152
Return a casting of the unit represented to nanoseconds + the precision
@@ -166,34 +163,21 @@ cpdef inline (int64_t, int) precision_from_unit(
166163
int p
167164
NPY_DATETIMEUNIT reso = abbrev_to_npy_unit(unit)
168165

169-
multiplier = periods_per_second(out_reso)
170-
166+
if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
167+
reso = NPY_DATETIMEUNIT.NPY_FR_ns
171168
if reso == NPY_DATETIMEUNIT.NPY_FR_Y:
172169
# each 400 years we have 97 leap years, for an average of 97/400=.2425
173170
# extra days each year. We get 31556952 by writing
174171
# 3600*24*365.2425=31556952
172+
multiplier = periods_per_second(out_reso)
175173
m = multiplier * 31556952
176174
elif reso == NPY_DATETIMEUNIT.NPY_FR_M:
177175
# 2629746 comes from dividing the "Y" case by 12.
176+
multiplier = periods_per_second(out_reso)
178177
m = multiplier * 2629746
179-
elif reso == NPY_DATETIMEUNIT.NPY_FR_W:
180-
m = multiplier * 3600 * 24 * 7
181-
elif reso == NPY_DATETIMEUNIT.NPY_FR_D:
182-
m = multiplier * 3600 * 24
183-
elif reso == NPY_DATETIMEUNIT.NPY_FR_h:
184-
m = multiplier * 3600
185-
elif reso == NPY_DATETIMEUNIT.NPY_FR_m:
186-
m = multiplier * 60
187-
elif reso == NPY_DATETIMEUNIT.NPY_FR_s:
188-
m = multiplier
189-
elif reso == NPY_DATETIMEUNIT.NPY_FR_ms:
190-
m = multiplier // 1_000
191-
elif reso == NPY_DATETIMEUNIT.NPY_FR_us:
192-
m = multiplier // 1_000_000
193-
elif reso == NPY_DATETIMEUNIT.NPY_FR_ns or reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
194-
m = multiplier // 1_000_000_000
195178
else:
196-
raise ValueError(f"cannot cast unit {unit}")
179+
m = get_conversion_factor(reso, out_reso)
180+
197181
p = <int>log10(m) # number of digits in 'm' minus 1
198182
return m, p
199183

pandas/_libs/tslibs/np_datetime.pyx

+2
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,8 @@ cdef int64_t get_conversion_factor(
571571
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit)
572572
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs:
573573
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit)
574+
else:
575+
raise ValueError("Converting from M or Y units is not supported.")
574576

575577

576578
cdef int64_t convert_reso(

pandas/_typing.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,13 @@
9494
AnyArrayLike = Union[ArrayLike, "Index", "Series"]
9595
TimeArrayLike = Union["DatetimeArray", "TimedeltaArray"]
9696

97+
# list-like
98+
99+
# Cannot use `Sequence` because a string is a sequence, and we don't want to
100+
# accept that. Could refine if https://github.com/python/typing/issues/256 is
101+
# resolved to differentiate between Sequence[str] and str
102+
ListLike = Union[AnyArrayLike, List, range]
103+
97104
# scalars
98105

99106
PythonScalar = Union[str, float, bool]
@@ -130,7 +137,7 @@
130137
Ordered = Optional[bool]
131138
JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
132139
Frequency = Union[str, "BaseOffset"]
133-
Axes = Union[AnyArrayLike, List, range]
140+
Axes = ListLike
134141

135142
RandomState = Union[
136143
int,

pandas/core/arrays/string_.py

+12-9
Original file line numberDiff line numberDiff line change
@@ -203,16 +203,19 @@ def __from_arrow__(
203203
# pyarrow.ChunkedArray
204204
chunks = array.chunks
205205

206-
results = []
207-
for arr in chunks:
208-
# using _from_sequence to ensure None is converted to NA
209-
str_arr = StringArray._from_sequence(np.array(arr))
210-
results.append(str_arr)
211-
212-
if results:
213-
return StringArray._concat_same_type(results)
206+
if len(chunks) == 0:
207+
arr = np.array([], dtype=object)
214208
else:
215-
return StringArray(np.array([], dtype="object"))
209+
arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
210+
arr = lib.convert_nans_to_NA(arr)
211+
# Bypass validation inside StringArray constructor, see GH#47781
212+
new_string_array = StringArray.__new__(StringArray)
213+
NDArrayBacked.__init__(
214+
new_string_array,
215+
arr,
216+
StringDtype(storage="python"),
217+
)
218+
return new_string_array
216219

217220

218221
class BaseStringArray(ExtensionArray):

pandas/core/dtypes/common.py

+2
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,7 @@ def is_string_dtype(arr_or_dtype) -> bool:
523523
524524
Examples
525525
--------
526+
>>> from pandas.api.types import is_string_dtype
526527
>>> is_string_dtype(str)
527528
True
528529
>>> is_string_dtype(object)
@@ -674,6 +675,7 @@ def is_integer_dtype(arr_or_dtype) -> bool:
674675
675676
Examples
676677
--------
678+
>>> from pandas.api.types import is_integer_dtype
677679
>>> is_integer_dtype(str)
678680
False
679681
>>> is_integer_dtype(int)

pandas/core/frame.py

+14-14
Original file line numberDiff line numberDiff line change
@@ -1134,9 +1134,9 @@ def _repr_html_(self) -> str | None:
11341134
def to_string(
11351135
self,
11361136
buf: None = ...,
1137-
columns: Sequence[str] | None = ...,
1137+
columns: Axes | None = ...,
11381138
col_space: int | list[int] | dict[Hashable, int] | None = ...,
1139-
header: bool | Sequence[str] = ...,
1139+
header: bool | list[str] = ...,
11401140
index: bool = ...,
11411141
na_rep: str = ...,
11421142
formatters: fmt.FormattersType | None = ...,
@@ -1159,9 +1159,9 @@ def to_string(
11591159
def to_string(
11601160
self,
11611161
buf: FilePath | WriteBuffer[str],
1162-
columns: Sequence[str] | None = ...,
1162+
columns: Axes | None = ...,
11631163
col_space: int | list[int] | dict[Hashable, int] | None = ...,
1164-
header: bool | Sequence[str] = ...,
1164+
header: bool | list[str] = ...,
11651165
index: bool = ...,
11661166
na_rep: str = ...,
11671167
formatters: fmt.FormattersType | None = ...,
@@ -1181,8 +1181,8 @@ def to_string(
11811181
...
11821182

11831183
@Substitution(
1184-
header_type="bool or sequence of str",
1185-
header="Write out the column names. If a list of strings "
1184+
header_type="bool or list of str",
1185+
header="Write out the column names. If a list of columns "
11861186
"is given, it is assumed to be aliases for the "
11871187
"column names",
11881188
col_space_type="int, list or dict of int",
@@ -1194,9 +1194,9 @@ def to_string(
11941194
def to_string(
11951195
self,
11961196
buf: FilePath | WriteBuffer[str] | None = None,
1197-
columns: Sequence[str] | None = None,
1197+
columns: Axes | None = None,
11981198
col_space: int | list[int] | dict[Hashable, int] | None = None,
1199-
header: bool | Sequence[str] = True,
1199+
header: bool | list[str] = True,
12001200
index: bool = True,
12011201
na_rep: str = "NaN",
12021202
formatters: fmt.FormattersType | None = None,
@@ -2965,9 +2965,9 @@ def to_orc(
29652965
def to_html(
29662966
self,
29672967
buf: FilePath | WriteBuffer[str],
2968-
columns: Sequence[Level] | None = ...,
2968+
columns: Axes | None = ...,
29692969
col_space: ColspaceArgType | None = ...,
2970-
header: bool | Sequence[str] = ...,
2970+
header: bool = ...,
29712971
index: bool = ...,
29722972
na_rep: str = ...,
29732973
formatters: FormattersType | None = ...,
@@ -2994,9 +2994,9 @@ def to_html(
29942994
def to_html(
29952995
self,
29962996
buf: None = ...,
2997-
columns: Sequence[Level] | None = ...,
2997+
columns: Axes | None = ...,
29982998
col_space: ColspaceArgType | None = ...,
2999-
header: bool | Sequence[str] = ...,
2999+
header: bool = ...,
30003000
index: bool = ...,
30013001
na_rep: str = ...,
30023002
formatters: FormattersType | None = ...,
@@ -3030,9 +3030,9 @@ def to_html(
30303030
def to_html(
30313031
self,
30323032
buf: FilePath | WriteBuffer[str] | None = None,
3033-
columns: Sequence[Level] | None = None,
3033+
columns: Axes | None = None,
30343034
col_space: ColspaceArgType | None = None,
3035-
header: bool | Sequence[str] = True,
3035+
header: bool = True,
30363036
index: bool = True,
30373037
na_rep: str = "NaN",
30383038
formatters: FormattersType | None = None,

pandas/core/generic.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -3101,7 +3101,7 @@ def to_latex(
31013101
self,
31023102
buf: None = ...,
31033103
columns: Sequence[Hashable] | None = ...,
3104-
header: bool_t | Sequence[str] = ...,
3104+
header: bool_t | list[str] = ...,
31053105
index: bool_t = ...,
31063106
na_rep: str = ...,
31073107
formatters: FormattersType | None = ...,
@@ -3128,7 +3128,7 @@ def to_latex(
31283128
self,
31293129
buf: FilePath | WriteBuffer[str],
31303130
columns: Sequence[Hashable] | None = ...,
3131-
header: bool_t | Sequence[str] = ...,
3131+
header: bool_t | list[str] = ...,
31323132
index: bool_t = ...,
31333133
na_rep: str = ...,
31343134
formatters: FormattersType | None = ...,
@@ -3155,7 +3155,7 @@ def to_latex(
31553155
self,
31563156
buf: FilePath | WriteBuffer[str] | None = None,
31573157
columns: Sequence[Hashable] | None = None,
3158-
header: bool_t | Sequence[str] = True,
3158+
header: bool_t | list[str] = True,
31593159
index: bool_t = True,
31603160
na_rep: str = "NaN",
31613161
formatters: FormattersType | None = None,

pandas/core/groupby/generic.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -1359,21 +1359,15 @@ def _python_agg_general(self, func, *args, **kwargs):
13591359
return self._wrap_aggregated_output(res)
13601360

13611361
def _iterate_slices(self) -> Iterable[Series]:
1362-
obj = self._selected_obj
1362+
obj = self._obj_with_exclusions
13631363
if self.axis == 1:
13641364
obj = obj.T
13651365

1366-
if isinstance(obj, Series) and obj.name not in self.exclusions:
1366+
if isinstance(obj, Series):
13671367
# Occurs when doing DataFrameGroupBy(...)["X"]
13681368
yield obj
13691369
else:
13701370
for label, values in obj.items():
1371-
if label in self.exclusions:
1372-
# Note: if we tried to just iterate over _obj_with_exclusions,
1373-
# we would break test_wrap_agg_out by yielding a column
1374-
# that is skipped here but not dropped from obj_with_exclusions
1375-
continue
1376-
13771371
yield values
13781372

13791373
def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:

pandas/core/indexes/multi.py

+2
Original file line numberDiff line numberDiff line change
@@ -3748,6 +3748,8 @@ def delete(self, loc) -> MultiIndex:
37483748
@doc(Index.isin)
37493749
def isin(self, values, level=None) -> npt.NDArray[np.bool_]:
37503750
if level is None:
3751+
if len(values) == 0:
3752+
return np.zeros((len(self),), dtype=np.bool_)
37513753
if not isinstance(values, MultiIndex):
37523754
values = MultiIndex.from_tuples(values)
37533755
return values.unique().get_indexer_for(self) != -1

0 commit comments

Comments
 (0)