
Commit 49f2fc8

Merge remote-tracking branch 'upstream/main' into enh/parquet/arrow_fs

2 parents: e1f8912 + ade0418

File tree: 22 files changed, +339 -160 lines

asv_bench/benchmarks/frame_methods.py (+16)

@@ -444,6 +444,22 @@ def time_dropna_axis_mixed_dtypes(self, how, axis):
         self.df_mixed.dropna(how=how, axis=axis)
 
 
+class Isna:
+    params = ["float64", "Float64", "float64[pyarrow]"]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        data = np.random.randn(10000, 1000)
+        # all-na columns
+        data[:, 600:800] = np.nan
+        # partial-na columns
+        data[800:1000, 4000:5000] = np.nan
+        self.df = DataFrame(data, dtype=dtype)
+
+    def time_isna(self, dtype):
+        self.df.isna()
+
+
 class Count:
     params = [0, 1]
     param_names = ["axis"]

doc/source/whatsnew/v2.1.0.rst (+5 -3)

@@ -103,6 +103,7 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
+- Performance improvement in :meth:`~arrays.ArrowExtensionArray.isna` when array has zero nulls or is all nulls (:issue:`51630`)
 - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`)
 - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
 - Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
@@ -121,12 +122,13 @@ Categorical
 
 Datetimelike
 ^^^^^^^^^^^^
--
+- Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`)
 -
 
 Timedelta
 ^^^^^^^^^
--
+- Bug in :meth:`Timedelta.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsTimedelta`` (:issue:`51494`)
+- Bug in :class:`TimedeltaIndex` division or multiplication leading to ``.freq`` of "0 Days" instead of ``None`` (:issue:`51575`)
 -
 
 Timezones
@@ -186,7 +188,7 @@ Plotting
 
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
--
+- Bug in :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmax` returning wrong dtype when used on an empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`)
 -
 
 Reshaping

pandas/_libs/lib.pyi (+6)

@@ -71,6 +71,7 @@ def maybe_convert_objects(
     *,
     try_float: bool = ...,
     safe: bool = ...,
+    convert_numeric: bool = ...,
     convert_datetime: Literal[False] = ...,
     convert_timedelta: Literal[False] = ...,
     convert_period: Literal[False] = ...,
@@ -84,6 +85,7 @@ def maybe_convert_objects(
     *,
     try_float: bool = ...,
     safe: bool = ...,
+    convert_numeric: bool = ...,
     convert_datetime: Literal[False] = ...,
     convert_timedelta: bool = ...,
     convert_period: Literal[False] = ...,
@@ -97,6 +99,7 @@ def maybe_convert_objects(
     *,
     try_float: bool = ...,
     safe: bool = ...,
+    convert_numeric: bool = ...,
     convert_datetime: bool = ...,
     convert_timedelta: bool = ...,
     convert_period: bool = ...,
@@ -110,6 +113,7 @@ def maybe_convert_objects(
     *,
     try_float: bool = ...,
     safe: bool = ...,
+    convert_numeric: bool = ...,
     convert_datetime: Literal[True] = ...,
     convert_timedelta: bool = ...,
     convert_period: bool = ...,
@@ -123,6 +127,7 @@ def maybe_convert_objects(
     *,
     try_float: bool = ...,
    safe: bool = ...,
+    convert_numeric: bool = ...,
     convert_datetime: bool = ...,
     convert_timedelta: bool = ...,
     convert_period: Literal[True] = ...,
@@ -136,6 +141,7 @@ def maybe_convert_objects(
     *,
     try_float: bool = ...,
     safe: bool = ...,
+    convert_numeric: bool = ...,
     convert_datetime: bool = ...,
     convert_timedelta: bool = ...,
     convert_period: bool = ...,

pandas/_libs/lib.pyx (+17)

@@ -2367,6 +2367,7 @@ def maybe_convert_objects(ndarray[object] objects,
                           *,
                           bint try_float=False,
                           bint safe=False,
+                          bint convert_numeric=True,  # NB: different default!
                           bint convert_datetime=False,
                           bint convert_timedelta=False,
                           bint convert_period=False,
@@ -2386,6 +2387,8 @@ def maybe_convert_objects(ndarray[object] objects,
     safe : bool, default False
         Whether to upcast numeric type (e.g. int cast to float). If set to
         True, no upcasting will be performed.
+    convert_numeric : bool, default True
+        Whether to convert numeric entries.
     convert_datetime : bool, default False
         If an array-like object contains only datetime values or NaT is
         encountered, whether to convert and return an array of M8[ns] dtype.
@@ -2463,9 +2466,13 @@ def maybe_convert_objects(ndarray[object] objects,
         elif util.is_bool_object(val):
             seen.bool_ = True
             bools[i] = val
+            if not convert_numeric:
+                break
         elif util.is_float_object(val):
             floats[i] = complexes[i] = val
             seen.float_ = True
+            if not convert_numeric:
+                break
         elif is_timedelta(val):
             if convert_timedelta:
                 seen.timedelta_ = True
@@ -2497,10 +2504,14 @@ def maybe_convert_objects(ndarray[object] objects,
             else:
                 uints[i] = val
             ints[i] = val
+            if not convert_numeric:
+                break
 
         elif util.is_complex_object(val):
             complexes[i] = val
             seen.complex_ = True
+            if not convert_numeric:
+                break
         elif PyDateTime_Check(val) or util.is_datetime64_object(val):
 
             # if we have an tz's attached then return the objects
@@ -2638,6 +2649,12 @@ def maybe_convert_objects(ndarray[object] objects,
         else:
             seen.object_ = True
 
+    if not convert_numeric:
+        # Note: we count "bool" as numeric here. This is because
+        # np.array(list_of_items) will convert bools just like it will numeric
+        # entries.
+        return objects
+
     if seen.bool_:
         if seen.is_bool:
             # is_bool property rules out everything else
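
A hedged sketch of the behavioral difference the new keyword introduces. This targets the internal pandas._libs.lib API on a build that includes this change; the keyword does not exist in earlier releases:

    import numpy as np
    from pandas._libs import lib

    objs = np.array([1, 2.5, 3], dtype=object)

    # Default behavior: numeric entries are converted, yielding float64.
    print(lib.maybe_convert_objects(objs).dtype)                         # float64

    # With convert_numeric=False the early `break` / `return objects` paths
    # leave the object array untouched.
    print(lib.maybe_convert_objects(objs, convert_numeric=False).dtype)  # object

The keyword is consumed both in the per-element loop (early break) and before the dtype-resolution block (early return), which is why all six overloads in the .pyi stub above gain the same parameter.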

pandas/_libs/tslibs/fields.pyx (+30 -8)

@@ -704,7 +704,7 @@ cdef ndarray[int64_t] _ceil_int64(const int64_t[:] values, int64_t unit):
     cdef:
         Py_ssize_t i, n = len(values)
         ndarray[int64_t] result = np.empty(n, dtype="i8")
-        int64_t res, value
+        int64_t res, value, remainder
 
     with cython.overflowcheck(True):
         for i in range(n):
@@ -732,6 +732,34 @@ cdef ndarray[int64_t] _roundup_int64(values, int64_t unit):
     return _floor_int64(values + unit // 2, unit)
 
 
+cdef ndarray[int64_t] _round_nearest_int64(const int64_t[:] values, int64_t unit):
+    cdef:
+        Py_ssize_t i, n = len(values)
+        ndarray[int64_t] result = np.empty(n, dtype="i8")
+        int64_t res, value, half, remainder, quotient
+
+    half = unit // 2
+
+    with cython.overflowcheck(True):
+        for i in range(n):
+            value = values[i]
+
+            if value == NPY_NAT:
+                res = NPY_NAT
+            else:
+                quotient, remainder = divmod(value, unit)
+                if remainder > half:
+                    res = value + (unit - remainder)
+                elif remainder == half and quotient % 2:
+                    res = value + (unit - remainder)
+                else:
+                    res = value - remainder
+
+            result[i] = res
+
+    return result
+
+
 def round_nsint64(values: np.ndarray, mode: RoundTo, nanos: int) -> np.ndarray:
     """
     Applies rounding mode at given frequency
@@ -762,13 +790,7 @@ def round_nsint64(values: np.ndarray, mode: RoundTo, nanos: int) -> np.ndarray:
         # for odd unit there is no need of a tie break
         if unit % 2:
             return _rounddown_int64(values, unit)
-        quotient, remainder = np.divmod(values, unit)
-        mask = np.logical_or(
-            remainder > (unit // 2),
-            np.logical_and(remainder == (unit // 2), quotient % 2)
-        )
-        quotient[mask] += 1
-        return quotient * unit
+        return _round_nearest_int64(values, unit)
 
     # if/elif above should catch all rounding modes defined in enum 'RoundTo':
     # if flow of control arrives here, it is a bug
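
The new helper implements round-half-to-even per element. A pure-Python sketch of that per-element logic (NPY_NAT handling and the Cython overflow check omitted) looks like this:

    def round_nearest(value: int, unit: int) -> int:
        quotient, remainder = divmod(value, unit)
        half = unit // 2
        if remainder > half or (remainder == half and quotient % 2):
            return value + (unit - remainder)  # round up to the next multiple of `unit`
        return value - remainder               # round down

    assert round_nearest(15, 10) == 20  # tie with an odd quotient rounds up to the even multiple
    assert round_nearest(25, 10) == 20  # tie with an even quotient stays on the even multiple
    assert round_nearest(26, 10) == 30

Doing the arithmetic element by element inside cython.overflowcheck(True), instead of the old vectorized np.divmod which wrapped around silently, is what lets the callers below turn an OverflowError into a proper out-of-bounds exception.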

pandas/_libs/tslibs/timedeltas.pyx (+6 -1)

@@ -1824,7 +1824,12 @@ class Timedelta(_Timedelta):
         unit = delta_to_nanoseconds(to_offset(freq), self._creso)
 
         arr = np.array([self._value], dtype="i8")
-        result = round_nsint64(arr, mode, unit)[0]
+        try:
+            result = round_nsint64(arr, mode, unit)[0]
+        except OverflowError as err:
+            raise OutOfBoundsTimedelta(
+                f"Cannot round {self} to freq={freq} without overflow"
+            ) from err
         return Timedelta._from_value_and_reso(result, self._creso)
 
     def round(self, freq):
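
A hedged illustration of the new behavior, assuming a pandas build that includes this fix (previously the rounding overflowed silently and produced an incorrect value):

    import pandas as pd
    from pandas.errors import OutOfBoundsTimedelta

    td = pd.Timedelta.max  # sits right next to the int64 implementation bound

    try:
        td.round("10s")  # rounding up would overflow the underlying int64
    except OutOfBoundsTimedelta as err:
        print(err)  # e.g. "Cannot round ... to freq=10s without overflow"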

pandas/_libs/tslibs/timestamps.pyx (+7 -1)

@@ -1692,7 +1692,13 @@ class Timestamp(_Timestamp):
         value = np.array([value], dtype=np.int64)
 
         # Will only ever contain 1 element for timestamp
-        r = round_nsint64(value, mode, nanos)[0]
+        try:
+            r = round_nsint64(value, mode, nanos)[0]
+        except OverflowError as err:
+            raise OutOfBoundsDatetime(
+                f"Cannot round {self} to freq={freq} without overflow"
+            ) from err
+
         result = Timestamp._from_value_and_reso(r, self._creso, None)
         if self.tz is not None:
             result = result.tz_localize(
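
The same guard on the Timestamp side, again hedged on a build that includes this change:

    import pandas as pd
    from pandas.errors import OutOfBoundsDatetime

    print(pd.Timestamp("2021-01-01 11:22:33").round("H"))  # 2021-01-01 11:00:00, the normal case

    ts = pd.Timestamp.max  # 2262-04-11 23:47:16.854775807, near the int64 bound
    try:
        ts.round("s")  # would need to round up past Timestamp.max
    except OutOfBoundsDatetime as err:
        print(err)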

pandas/core/arrays/arrow/array.py (+7)

@@ -557,6 +557,13 @@ def isna(self) -> npt.NDArray[np.bool_]:
 
         This should return a 1-D array the same length as 'self'.
         """
+        # GH51630: fast paths
+        null_count = self._data.null_count
+        if null_count == 0:
+            return np.zeros(len(self), dtype=np.bool_)
+        elif null_count == len(self):
+            return np.ones(len(self), dtype=np.bool_)
+
         return self._data.is_null().to_numpy()
 
     def argsort(
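
A minimal usage sketch of the two fast paths (requires pyarrow): with zero nulls the mask comes straight from np.zeros, with all nulls from np.ones, and only mixed arrays still go through pyarrow's is_null:

    import pandas as pd

    no_nulls = pd.array([1, 2, 3], dtype="int64[pyarrow]")
    all_nulls = pd.array([None, None], dtype="int64[pyarrow]")
    mixed = pd.array([1, None, 3], dtype="int64[pyarrow]")

    print(no_nulls.isna())   # [False False False]  -> zero-null fast path
    print(all_nulls.isna())  # [ True  True]        -> all-null fast path
    print(mixed.isna())      # [False  True False]  -> falls back to is_null().to_numpy()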

pandas/core/arrays/timedeltas.py (+7 -11)

@@ -467,6 +467,9 @@ def __mul__(self, other) -> TimedeltaArray:
             freq = None
             if self.freq is not None and not isna(other):
                 freq = self.freq * other
+                if freq.n == 0:
+                    # GH#51575 Better to have no freq than an incorrect one
+                    freq = None
             return type(self)._simple_new(result, dtype=result.dtype, freq=freq)
 
         if not hasattr(other, "dtype"):
@@ -526,17 +529,10 @@ def _scalar_divlike_op(self, other, op):
             # Note: freq gets division, not floor-division, even if op
             # is floordiv.
             freq = self.freq / other
-
-            # TODO: 2022-12-24 test_ufunc_coercions, test_tdi_ops_attributes
-            # get here for truediv, no tests for floordiv
-
-            if op is operator.floordiv:
-                if freq.nanos == 0 and self.freq.nanos != 0:
-                    # e.g. if self.freq is Nano(1) then dividing by 2
-                    # rounds down to zero
-                    # TODO: 2022-12-24 should implement the same check
-                    # for truediv case
-                    freq = None
+            if freq.nanos == 0 and self.freq.nanos != 0:
+                # e.g. if self.freq is Nano(1) then dividing by 2
+                # rounds down to zero
+                freq = None
 
         return type(self)._simple_new(result, dtype=result.dtype, freq=freq)
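
A hedged sketch of the user-visible effect (GH#51575) on a build with this change; previously multiplying by zero kept a misleading "0 Days" freq:

    import pandas as pd

    tdi = pd.timedelta_range("1 day", periods=3, freq="D")

    print((tdi * 2).freq)  # <2 * Days>, scaling keeps a meaningful freq
    print((tdi * 0).freq)  # None, a zero freq is dropped rather than reported as "0 Days"

The division path now gets the same treatment: whenever self.freq / other would round down to a zero-duration offset, the result's freq is set to None instead of being kept.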

pandas/core/dtypes/cast.py (+7 -10)

@@ -1156,23 +1156,20 @@ def maybe_infer_to_datetimelike(
     if not len(value):
         return value
 
-    out = lib.maybe_convert_objects(
+    # error: Incompatible return value type (got "Union[ExtensionArray,
+    # ndarray[Any, Any]]", expected "Union[ndarray[Any, Any], DatetimeArray,
+    # TimedeltaArray, PeriodArray, IntervalArray]")
+    return lib.maybe_convert_objects(  # type: ignore[return-value]
         value,
+        # Here we do not convert numeric dtypes, as if we wanted that,
+        # numpy would have done it for us.
+        convert_numeric=False,
         convert_period=True,
         convert_interval=True,
         convert_timedelta=True,
         convert_datetime=True,
         dtype_if_all_nat=np.dtype("M8[ns]"),
     )
-    if out.dtype.kind in ["i", "u", "f", "b", "c"]:
-        # Here we do not convert numeric dtypes, as if we wanted that,
-        # numpy would have done it for us.
-        # See also _maybe_cast_data_without_dtype
-        return value
-    # Incompatible return value type (got "Union[ExtensionArray, ndarray[Any, Any]]",
-    # expected "Union[ndarray[Any, Any], DatetimeArray, TimedeltaArray, PeriodArray,
-    # IntervalArray]")
-    return out  # type: ignore[return-value]
 
 
 def maybe_cast_to_datetime(
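
The public-facing behavior this refactor preserves can be seen through ordinary construction. This is a hedged sketch; maybe_infer_to_datetimelike itself is internal and not part of the public API:

    import pandas as pd

    # Datetime-like object sequences are still inferred to datetime64 ...
    print(pd.Series([pd.Timestamp("2023-01-01"), pd.Timestamp("2023-01-02")]).dtype)  # datetime64[ns]

    # ... while plain numeric data is left to numpy, which has already chosen the
    # dtype before this helper ever runs.
    print(pd.Series([1, 2, 3]).dtype)  # int64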

pandas/core/frame.py (+6 -2)

@@ -2801,8 +2801,12 @@ def to_parquet(
         ``io.parquet.engine`` is used. The default ``io.parquet.engine``
         behavior is to try 'pyarrow', falling back to 'fastparquet' if
         'pyarrow' is unavailable.
-    compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
+    compression : str or None, default 'snappy'
         Name of the compression to use. Use ``None`` for no compression.
+        The supported compression methods actually depend on which engine
+        is used. For 'pyarrow', 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
+        are all supported. For 'fastparquet', only 'gzip' and 'snappy' are
+        supported.
     index : bool, default None
         If ``True``, include the dataframe's index(es) in the file output.
         If ``False``, they will not be written to the file.
@@ -8211,7 +8215,7 @@ def groupby(
         level: IndexLabel | None = None,
         as_index: bool = True,
         sort: bool = True,
-        group_keys: bool | lib.NoDefault = no_default,
+        group_keys: bool = True,
         observed: bool = False,
         dropna: bool = True,
     ) -> DataFrameGroupBy:
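
A usage sketch of the clarified compression documentation (assumes the optional engines are installed; the file names are placeholders):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # pyarrow accepts 'snappy', 'gzip', 'brotli', 'lz4', 'zstd' (or None).
    df.to_parquet("data_zstd.parquet", engine="pyarrow", compression="zstd")

    # fastparquet only guarantees 'gzip' and 'snappy'.
    df.to_parquet("data_gzip.parquet", engine="fastparquet", compression="gzip")

The second hunk in this file simply pins the groupby signature's group_keys default to a plain True, dropping the no_default sentinel.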

pandas/core/generic.py (+8 -4)

@@ -6885,8 +6885,10 @@ def fillna(
                         "with dict/Series column "
                         "by column"
                     )
-
-                result = self if inplace else self.copy()
+                if using_copy_on_write():
+                    result = self.copy(deep=None)
+                else:
+                    result = self if inplace else self.copy()
                 is_dict = isinstance(downcast, dict)
                 for k, v in value.items():
                     if k not in result:
@@ -6940,8 +6942,10 @@
                         result.iloc[:, loc] = res_loc
                     else:
                         result.isetitem(loc, res_loc)
-
-                return result if not inplace else None
+                if inplace:
+                    return self._update_inplace(result)
+                else:
+                    return result
 
             elif not is_list_like(value):
                 if axis == 1:
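
A hedged sketch of the dict-based fillna path this hunk touches, run with Copy-on-Write enabled. The option name is the public switch; the internal copy(deep=None) / _update_inplace details above are not observable from user code:

    import numpy as np
    import pandas as pd

    pd.set_option("mode.copy_on_write", True)

    df = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, 2.0]})

    out = df.fillna({"a": 0.0, "b": -1.0})
    print(out)

    # inplace=True goes through the same per-column loop and then writes back.
    df.fillna({"a": 0.0}, inplace=True)
    print(df)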
