From 3c02b861af5c2dcb7b677e517e25ee4ee8aea069 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Apr 2023 17:12:12 -0700 Subject: [PATCH 01/14] BUG: dt.days for timedelta non-nano overflows int32 --- pandas/_libs/tslibs/fields.pyi | 4 ++ pandas/_libs/tslibs/fields.pyx | 41 +++++++++++++------ pandas/core/arrays/timedeltas.py | 10 ++++- .../indexes/timedeltas/test_timedelta.py | 2 +- .../series/accessors/test_dt_accessor.py | 11 +++++ pandas/tests/tslibs/test_fields.py | 4 +- 6 files changed, 55 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/tslibs/fields.pyi b/pandas/_libs/tslibs/fields.pyi index 8b4bc1a31a1aa..c6cfd44e9f6ab 100644 --- a/pandas/_libs/tslibs/fields.pyi +++ b/pandas/_libs/tslibs/fields.pyi @@ -30,6 +30,10 @@ def get_timedelta_field( field: str, reso: int = ..., # NPY_DATETIMEUNIT ) -> npt.NDArray[np.int32]: ... +def get_timedelta_days( + tdindex: npt.NDArray[np.int64], # const int64_t[:] + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.int64]: ... def isleapyear_arr( years: np.ndarray, ) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index b162f278fcbec..1c75b47dd12ac 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -512,18 +512,7 @@ def get_timedelta_field( out = np.empty(count, dtype="i4") - if field == "days": - with nogil: - for i in range(count): - if tdindex[i] == NPY_NAT: - out[i] = -1 - continue - - pandas_timedelta_to_timedeltastruct(tdindex[i], reso, &tds) - out[i] = tds.days - return out - - elif field == "seconds": + if field == "seconds": with nogil: for i in range(count): if tdindex[i] == NPY_NAT: @@ -559,6 +548,34 @@ def get_timedelta_field( raise ValueError(f"Field {field} not supported") +@cython.wraparound(False) +@cython.boundscheck(False) +def get_timedelta_days( + const int64_t[:] tdindex, + NPY_DATETIMEUNIT reso=NPY_FR_ns, +): + """ + Given a int64-based timedelta index, extract the days, + field and return an array of these values. 
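+
+    The result is int64 (unlike ``get_timedelta_field``, which returns
+    int32), so day counts past the int32 bound do not overflow; NPY_NAT
+    entries are mapped to -1.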
+ """ + cdef: + Py_ssize_t i, count = len(tdindex) + ndarray[int64_t] out + pandas_timedeltastruct tds + + out = np.empty(count, dtype="i8") + + with nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + pandas_timedelta_to_timedeltastruct(tdindex[i], reso, &tds) + out[i] = tds.days + return out + + cpdef isleapyear_arr(ndarray years): """vectorized version of isleapyear; NaT evaluates as False""" cdef: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 7731bb499cd21..f6d251a21fcc2 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -30,7 +30,10 @@ to_offset, ) from pandas._libs.tslibs.conversion import precision_from_unit -from pandas._libs.tslibs.fields import get_timedelta_field +from pandas._libs.tslibs.fields import ( + get_timedelta_field, + get_timedelta_days, +) from pandas._libs.tslibs.timedeltas import ( array_to_timedelta64, floordiv_object_array, @@ -81,7 +84,10 @@ def _field_accessor(name: str, alias: str, docstring: str): def f(self) -> np.ndarray: values = self.asi8 - result = get_timedelta_field(values, alias, reso=self._creso) + if alias == "days": + result = get_timedelta_days(values, reso=self._creso) + else: + result = get_timedelta_field(values, alias, reso=self._creso) if self._hasna: result = self._maybe_mask_results( result, fill_value=None, convert="float64" diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 74f75eb9337e6..0cbc4bde4b07f 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -67,7 +67,7 @@ def test_pass_TimedeltaIndex_to_index(self): def test_fields(self): rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") - tm.assert_index_equal(rng.days, Index([1, 1], dtype=np.int32)) + tm.assert_index_equal(rng.days, Index([1, 1], dtype=np.int64)) tm.assert_index_equal( rng.seconds, Index([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13], dtype=np.int32), diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index fa8e184285616..92a121b7468a8 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -792,3 +792,14 @@ def test_normalize_pre_epoch_dates(): result = ser.dt.normalize() expected = pd.to_datetime(Series(["1969-01-01", "2016-01-01"])) tm.assert_series_equal(result, expected) + + +def test_day_attribute_non_nano_beyond_int32(): + # GH 52386 + data = np.array([136457654736252, 134736784364431, 245345345545332, 223432411, + 2343241, 3634548734, 23234], + dtype='timedelta64[s]') + ser = Series(data) + result = ser.dt.days + expected = Series([1579371003, 1559453522, 2839645203, 2586, 27, 42066, 0]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tslibs/test_fields.py b/pandas/tests/tslibs/test_fields.py index 9e6464f7727bd..da67c093b8f4d 100644 --- a/pandas/tests/tslibs/test_fields.py +++ b/pandas/tests/tslibs/test_fields.py @@ -35,6 +35,6 @@ def test_get_start_end_field_readonly(dtindex): def test_get_timedelta_field_readonly(dtindex): # treat dtindex as timedeltas for this next one - result = fields.get_timedelta_field(dtindex, "days") - expected = np.arange(5, dtype=np.int32) * 32 + result = fields.get_timedelta_field(dtindex, "seconds") + expected = np.array([0] * 5, dtype=np.int32) tm.assert_numpy_array_equal(result, expected) From 
18cb45d3fd8d8fe29c99c70b4f065db1f32b5820 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Apr 2023 17:12:38 -0700 Subject: [PATCH 02/14] Run precommit From 55a7dc86bb129114bfb3a118c116b1eb851ba9ef Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Apr 2023 17:14:35 -0700 Subject: [PATCH 03/14] lint --- pandas/core/arrays/timedeltas.py | 4 ++-- pandas/tests/series/accessors/test_dt_accessor.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f6d251a21fcc2..544aa6b63ec6d 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -31,8 +31,8 @@ ) from pandas._libs.tslibs.conversion import precision_from_unit from pandas._libs.tslibs.fields import ( - get_timedelta_field, get_timedelta_days, + get_timedelta_field, ) from pandas._libs.tslibs.timedeltas import ( array_to_timedelta64, @@ -86,7 +86,7 @@ def f(self) -> np.ndarray: values = self.asi8 if alias == "days": result = get_timedelta_days(values, reso=self._creso) - else: + else: result = get_timedelta_field(values, alias, reso=self._creso) if self._hasna: result = self._maybe_mask_results( diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 92a121b7468a8..1123eddcdbc57 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -796,9 +796,18 @@ def test_normalize_pre_epoch_dates(): def test_day_attribute_non_nano_beyond_int32(): # GH 52386 - data = np.array([136457654736252, 134736784364431, 245345345545332, 223432411, - 2343241, 3634548734, 23234], - dtype='timedelta64[s]') + data = np.array( + [ + 136457654736252, + 134736784364431, + 245345345545332, + 223432411, + 2343241, + 3634548734, + 23234, + ], + dtype="timedelta64[s]", + ) ser = Series(data) result = ser.dt.days expected = Series([1579371003, 1559453522, 2839645203, 2586, 27, 42066, 0]) From 019e98a3b5e1ba0d53f79a3aaa14f0809f47d7a9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Apr 2023 10:49:59 -0700 Subject: [PATCH 04/14] Address code check failures --- asv_bench/benchmarks/tslibs/fields.py | 2 +- pandas/core/arrays/timedeltas.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py index 23ae73811204c..3a2baec54109a 100644 --- a/asv_bench/benchmarks/tslibs/fields.py +++ b/asv_bench/benchmarks/tslibs/fields.py @@ -12,7 +12,7 @@ class TimeGetTimedeltaField: params = [ _sizes, - ["days", "seconds", "microseconds", "nanoseconds"], + ["seconds", "microseconds", "nanoseconds"], ] param_names = ["size", "field"] diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 544aa6b63ec6d..2f8ee75fcebc3 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -87,7 +87,10 @@ def f(self) -> np.ndarray: if alias == "days": result = get_timedelta_days(values, reso=self._creso) else: - result = get_timedelta_field(values, alias, reso=self._creso) + # error: Incompatible types in assignment ( + # expression has type "ndarray[Any, dtype[signedinteger[_32Bit]]]", + # variable has type "ndarray[Any, dtype[signedinteger[_64Bit]]] + result = get_timedelta_field(values, alias, reso=self._creso) # type: 
ignore[assignment] # noqa: E501 if self._hasna: result = self._maybe_mask_results( result, fill_value=None, convert="float64" From 62b751bc79a845630f324f6c32b4af4c72c7a384 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 6 Apr 2023 13:17:46 -0700 Subject: [PATCH 05/14] Add whatsnew --- doc/source/whatsnew/v2.0.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 9071d242e25b5..6247f2e16cab4 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -20,6 +20,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) - Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) From dc7f8819b3e198460e71a894bc08769780369dfd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 12 Apr 2023 15:56:49 -0700 Subject: [PATCH 06/14] PERF: numpy dtype checks (#52582) --- pandas/_libs/lib.pyi | 1 + pandas/_libs/lib.pyx | 27 ++++++++++++++++++ pandas/core/arrays/base.py | 6 ++-- pandas/core/arrays/categorical.py | 6 ++-- pandas/core/arrays/datetimelike.py | 28 +++++++++---------- pandas/core/arrays/datetimes.py | 13 ++++----- pandas/core/arrays/timedeltas.py | 12 ++++---- pandas/core/dtypes/astype.py | 6 ++-- pandas/core/dtypes/cast.py | 8 ++---- pandas/core/generic.py | 3 +- pandas/core/indexes/accessors.py | 11 ++++---- pandas/core/indexes/range.py | 3 +- pandas/core/indexes/timedeltas.py | 3 +- pandas/core/methods/describe.py | 4 +-- pandas/core/reshape/tile.py | 10 +++---- pandas/io/formats/format.py | 12 +++----- pandas/io/json/_table_schema.py | 7 ++--- pandas/io/pytables.py | 5 ++-- .../tests/io/json/test_json_table_schema.py | 3 +- pandas/tseries/frequencies.py | 13 ++++----- 20 files changed, 92 insertions(+), 89 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 2e425f5797c62..05d569f0e58eb 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -36,6 +36,7 @@ NoDefault = Literal[_NoDefault.no_default] i8max: int u8max: int +def is_np_dtype(dtype: object, kinds: str | None = ...) -> bool: ... def item_from_zerodim(val: object) -> object: ... def infer_dtype(value: object, skipna: bool = ...) -> str: ... def is_iterator(obj: object) -> bool: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5bf99301d9261..92f1dc2d4ea3b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -3070,3 +3070,30 @@ def dtypes_all_equal(list types not None) -> bool: return False else: return True + + +def is_np_dtype(object dtype, str kinds=None) -> bool: + """ + Optimized check for `isinstance(dtype, np.dtype)` with + optional `and dtype.kind in kinds`. + + dtype = np.dtype("m8[ns]") + + In [7]: %timeit isinstance(dtype, np.dtype) + 117 ns ± 1.91 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each) + + In [8]: %timeit is_np_dtype(dtype) + 64 ns ± 1.51 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each) + + In [9]: %timeit is_timedelta64_dtype(dtype) + 209 ns ± 6.96 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each) + + In [10]: %timeit is_np_dtype(dtype, "m") + 93.4 ns ± 1.11 ns per loop (mean ± std. dev. 
of 7 runs, 10,000,000 loops each) + """ + if not cnp.PyArray_DescrCheck(dtype): + # i.e. not isinstance(dtype, np.dtype) + return False + if kinds is None: + return True + return dtype.kind in kinds diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9342d3b68679c..e0c0f0e045ba5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -40,11 +40,9 @@ from pandas.core.dtypes.cast import maybe_cast_to_extension_array from pandas.core.dtypes.common import ( - is_datetime64_dtype, is_dtype_equal, is_list_like, is_scalar, - is_timedelta64_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ExtensionDtype @@ -582,12 +580,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: cls = dtype.construct_array_type() return cls._from_sequence(self, dtype=dtype, copy=copy) - elif is_datetime64_dtype(dtype): + elif lib.is_np_dtype(dtype, "M"): from pandas.core.arrays import DatetimeArray return DatetimeArray._from_sequence(self, dtype=dtype, copy=copy) - elif is_timedelta64_dtype(dtype): + elif lib.is_np_dtype(dtype, "m"): from pandas.core.arrays import TimedeltaArray return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ea4384dc0ef2d..adb083c16a838 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -38,7 +38,6 @@ ensure_platform_int, is_any_real_numeric_dtype, is_bool_dtype, - is_datetime64_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -46,7 +45,6 @@ is_integer_dtype, is_list_like, is_scalar, - is_timedelta64_dtype, needs_i8_conversion, pandas_dtype, ) @@ -622,9 +620,9 @@ def _from_inferred_categories( # Convert to a specialized type with `dtype` if specified. 
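         # (the is_np_dtype branches below receive ``dtype.categories.dtype``
         # rather than the categories themselves, since is_np_dtype expects a
         # np.dtype instance, not an array-like)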
if is_any_real_numeric_dtype(dtype.categories): cats = to_numeric(inferred_categories, errors="coerce") - elif is_datetime64_dtype(dtype.categories): + elif lib.is_np_dtype(dtype.categories.dtype, "M"): cats = to_datetime(inferred_categories, errors="coerce") - elif is_timedelta64_dtype(dtype.categories): + elif lib.is_np_dtype(dtype.categories.dtype, "m"): cats = to_timedelta(inferred_categories, errors="coerce") elif is_bool_dtype(dtype.categories): if true_values is None: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4468e360f7a9d..658ffb4669c3c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -84,7 +84,6 @@ from pandas.core.dtypes.common import ( is_all_strings, is_datetime64_any_dtype, - is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, is_float_dtype, @@ -92,7 +91,6 @@ is_list_like, is_object_dtype, is_string_dtype, - is_timedelta64_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -993,7 +991,7 @@ def _get_arithmetic_result_freq(self, other) -> BaseOffset | None: @final def _add_datetimelike_scalar(self, other) -> DatetimeArray: - if not is_timedelta64_dtype(self.dtype): + if not lib.is_np_dtype(self.dtype, "m"): raise TypeError( f"cannot add {type(self).__name__} and {type(other).__name__}" ) @@ -1029,7 +1027,7 @@ def _add_datetimelike_scalar(self, other) -> DatetimeArray: @final def _add_datetime_arraylike(self, other: DatetimeArray) -> DatetimeArray: - if not is_timedelta64_dtype(self.dtype): + if not lib.is_np_dtype(self.dtype, "m"): raise TypeError( f"cannot add {type(self).__name__} and {type(other).__name__}" ) @@ -1093,7 +1091,7 @@ def _sub_datetimelike(self, other: Timestamp | DatetimeArray) -> TimedeltaArray: @final def _add_period(self, other: Period) -> PeriodArray: - if not is_timedelta64_dtype(self.dtype): + if not lib.is_np_dtype(self.dtype, "m"): raise TypeError(f"cannot add Period to a {type(self).__name__}") # We will wrap in a PeriodArray and defer to the reversed operation @@ -1294,7 +1292,7 @@ def __add__(self, other): result = self._add_offset(other) elif isinstance(other, (datetime, np.datetime64)): result = self._add_datetimelike_scalar(other) - elif isinstance(other, Period) and is_timedelta64_dtype(self.dtype): + elif isinstance(other, Period) and lib.is_np_dtype(self.dtype, "m"): result = self._add_period(other) elif lib.is_integer(other): # This check must come after the check for np.timedelta64 @@ -1305,13 +1303,13 @@ def __add__(self, other): result = obj._addsub_int_array_or_scalar(other * obj.dtype._n, operator.add) # array-like others - elif is_timedelta64_dtype(other_dtype): + elif lib.is_np_dtype(other_dtype, "m"): # TimedeltaIndex, ndarray[timedelta64] result = self._add_timedelta_arraylike(other) elif is_object_dtype(other_dtype): # e.g. Array/Index of DateOffset objects result = self._addsub_object_array(other, operator.add) - elif is_datetime64_dtype(other_dtype) or isinstance( + elif lib.is_np_dtype(other_dtype, "M") or isinstance( other_dtype, DatetimeTZDtype ): # DatetimeIndex, ndarray[datetime64] @@ -1329,7 +1327,7 @@ def __add__(self, other): # In remaining cases, this will end up raising TypeError. 
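             #  (returning NotImplemented gives the other operand's reflected
             #  method a chance to handle the operation before TypeError)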
return NotImplemented - if isinstance(result, np.ndarray) and is_timedelta64_dtype(result.dtype): + if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray return TimedeltaArray(result) @@ -1366,13 +1364,13 @@ def __sub__(self, other): result = self._sub_periodlike(other) # array-like others - elif is_timedelta64_dtype(other_dtype): + elif lib.is_np_dtype(other_dtype, "m"): # TimedeltaIndex, ndarray[timedelta64] result = self._add_timedelta_arraylike(-other) elif is_object_dtype(other_dtype): # e.g. Array/Index of DateOffset objects result = self._addsub_object_array(other, operator.sub) - elif is_datetime64_dtype(other_dtype) or isinstance( + elif lib.is_np_dtype(other_dtype, "M") or isinstance( other_dtype, DatetimeTZDtype ): # DatetimeIndex, ndarray[datetime64] @@ -1389,7 +1387,7 @@ def __sub__(self, other): # Includes ExtensionArrays, float_dtype return NotImplemented - if isinstance(result, np.ndarray) and is_timedelta64_dtype(result.dtype): + if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray return TimedeltaArray(result) @@ -1398,7 +1396,7 @@ def __sub__(self, other): def __rsub__(self, other): other_dtype = getattr(other, "dtype", None) - if is_datetime64_any_dtype(other_dtype) and is_timedelta64_dtype(self.dtype): + if is_datetime64_any_dtype(other_dtype) and lib.is_np_dtype(self.dtype, "m"): # ndarray[datetime64] cannot be subtracted from self, so # we need to wrap in DatetimeArray/Index and flip the operation if lib.is_scalar(other): @@ -1420,10 +1418,10 @@ def __rsub__(self, other): raise TypeError( f"cannot subtract {type(self).__name__} from {type(other).__name__}" ) - elif isinstance(self.dtype, PeriodDtype) and is_timedelta64_dtype(other_dtype): + elif isinstance(self.dtype, PeriodDtype) and lib.is_np_dtype(other_dtype, "m"): # TODO: Can we simplify/generalize these cases at all? raise TypeError(f"cannot subtract {type(self).__name__} from {other.dtype}") - elif is_timedelta64_dtype(self.dtype): + elif lib.is_np_dtype(self.dtype, "m"): self = cast("TimedeltaArray", self) return (-self) + other diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index dcb1c0965cc5b..12245a144ec2a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -57,7 +57,6 @@ is_object_dtype, is_sparse, is_string_dtype, - is_timedelta64_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -670,7 +669,7 @@ def astype(self, dtype, copy: bool = True): elif ( self.tz is None - and is_datetime64_dtype(dtype) + and lib.is_np_dtype(dtype, "M") and not is_unitless(dtype) and is_supported_unit(get_unit_from_dtype(dtype)) ): @@ -679,7 +678,7 @@ def astype(self, dtype, copy: bool = True): return type(self)._simple_new(res_values, dtype=res_values.dtype) # TODO: preserve freq? 
- elif self.tz is not None and is_datetime64_dtype(dtype): + elif self.tz is not None and lib.is_np_dtype(dtype, "M"): # pre-2.0 behavior for DTA/DTI was # values.tz_convert("UTC").tz_localize(None), which did not match # the Series behavior @@ -691,7 +690,7 @@ def astype(self, dtype, copy: bool = True): elif ( self.tz is None - and is_datetime64_dtype(dtype) + and lib.is_np_dtype(dtype, "M") and dtype != self.dtype and is_unitless(dtype) ): @@ -2083,7 +2082,7 @@ def _sequence_to_dt64ns( tz = _maybe_infer_tz(tz, data.tz) result = data._ndarray - elif is_datetime64_dtype(data_dtype): + elif lib.is_np_dtype(data_dtype, "M"): # tz-naive DatetimeArray or ndarray[datetime64] data = getattr(data, "_ndarray", data) new_dtype = data.dtype @@ -2242,7 +2241,7 @@ def maybe_convert_dtype(data, copy: bool, tz: tzinfo | None = None): data = data.astype(DT64NS_DTYPE).view("i8") copy = False - elif is_timedelta64_dtype(data.dtype) or is_bool_dtype(data.dtype): + elif lib.is_np_dtype(data.dtype, "m") or is_bool_dtype(data.dtype): # GH#29794 enforcing deprecation introduced in GH#23539 raise TypeError(f"dtype {data.dtype} cannot be converted to datetime64[ns]") elif isinstance(data.dtype, PeriodDtype): @@ -2391,7 +2390,7 @@ def _validate_tz_from_dtype( raise ValueError("Cannot pass both a timezone-aware dtype and tz=None") tz = dtz - if tz is not None and is_datetime64_dtype(dtype): + if tz is not None and lib.is_np_dtype(dtype, "M"): # We also need to check for the case where the user passed a # tz-naive dtype (i.e. datetime64[ns]) if tz is not None and not timezones.tz_compare(tz, dtz): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 2f8ee75fcebc3..d7e413ccec293 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -493,7 +493,7 @@ def __mul__(self, other) -> Self: if not hasattr(other, "dtype"): # list, tuple other = np.array(other) - if len(other) != len(self) and not is_timedelta64_dtype(other.dtype): + if len(other) != len(self) and not lib.is_np_dtype(other.dtype, "m"): # Exclude timedelta64 here so we correctly raise TypeError # for that instead of ValueError raise ValueError("Cannot multiply with unequal lengths") @@ -594,7 +594,7 @@ def __truediv__(self, other): other = self._cast_divlike_op(other) if ( - is_timedelta64_dtype(other.dtype) + lib.is_np_dtype(other.dtype, "m") or is_integer_dtype(other.dtype) or is_float_dtype(other.dtype) ): @@ -622,7 +622,7 @@ def __rtruediv__(self, other): return self._scalar_divlike_op(other, op) other = self._cast_divlike_op(other) - if is_timedelta64_dtype(other.dtype): + if lib.is_np_dtype(other.dtype, "m"): return self._vector_divlike_op(other, op) elif is_object_dtype(other.dtype): @@ -643,7 +643,7 @@ def __floordiv__(self, other): other = self._cast_divlike_op(other) if ( - is_timedelta64_dtype(other.dtype) + lib.is_np_dtype(other.dtype, "m") or is_integer_dtype(other.dtype) or is_float_dtype(other.dtype) ): @@ -671,7 +671,7 @@ def __rfloordiv__(self, other): return self._scalar_divlike_op(other, op) other = self._cast_divlike_op(other) - if is_timedelta64_dtype(other.dtype): + if lib.is_np_dtype(other.dtype, "m"): return self._vector_divlike_op(other, op) elif is_object_dtype(other.dtype): @@ -949,7 +949,7 @@ def sequence_to_td64ns( data[mask] = iNaT copy = False - elif is_timedelta64_dtype(data.dtype): + elif lib.is_np_dtype(data.dtype, "m"): data_unit = get_unit_from_dtype(data.dtype) if not is_supported_unit(data_unit): # cast to closest supported unit, i.e. 
s or ns diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index f865968328286..a69559493c386 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -18,12 +18,10 @@ from pandas.errors import IntCastingNaNError from pandas.core.dtypes.common import ( - is_datetime64_dtype, is_dtype_equal, is_integer_dtype, is_object_dtype, is_string_dtype, - is_timedelta64_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -108,14 +106,14 @@ def _astype_nansafe( # if we have a datetime/timedelta array of objects # then coerce to datetime64[ns] and use DatetimeArray.astype - if is_datetime64_dtype(dtype): + if lib.is_np_dtype(dtype, "M"): from pandas import to_datetime dti = to_datetime(arr.ravel()) dta = dti._data.reshape(arr.shape) return dta.astype(dtype, copy=False)._ndarray - elif is_timedelta64_dtype(dtype): + elif lib.is_np_dtype(dtype, "m"): from pandas.core.construction import ensure_wrapped_if_datetimelike # bc we know arr.dtype == object, this is equivalent to diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7fc6fd7fff9b5..3f035f7207488 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -52,7 +52,6 @@ is_bool_dtype, is_complex, is_complex_dtype, - is_datetime64_dtype, is_extension_array_dtype, is_float, is_float_dtype, @@ -63,7 +62,6 @@ is_scalar, is_signed_integer_dtype, is_string_dtype, - is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype as pandas_dtype_func, ) @@ -1203,7 +1201,7 @@ def maybe_cast_to_datetime( # _ensure_nanosecond_dtype raises TypeError _ensure_nanosecond_dtype(dtype) - if is_timedelta64_dtype(dtype): + if lib.is_np_dtype(dtype, "m"): res = TimedeltaArray._from_sequence(value, dtype=dtype) return res else: @@ -1407,9 +1405,9 @@ def find_common_type(types): return np.dtype("object") # take lowest unit - if all(is_datetime64_dtype(t) for t in types): + if all(lib.is_np_dtype(t, "M") for t in types): return np.dtype(max(types)) - if all(is_timedelta64_dtype(t) for t in types): + if all(lib.is_np_dtype(t, "m") for t in types): return np.dtype(max(types)) # don't mix bool / int or float or complex diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3da4f96444215..0c14c76ab539f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -120,7 +120,6 @@ is_numeric_dtype, is_re_compilable, is_scalar, - is_timedelta64_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -7758,7 +7757,7 @@ def interpolate( is_numeric_or_datetime = ( is_numeric_dtype(index.dtype) or is_datetime64_any_dtype(index.dtype) - or is_timedelta64_dtype(index.dtype) + or lib.is_np_dtype(index.dtype, "m") ) if method not in methods and not is_numeric_or_datetime: raise ValueError( diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index bf4da90efc17c..4f529b71c867f 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -11,13 +11,12 @@ import numpy as np +from pandas._libs import lib from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( - is_datetime64_dtype, is_integer_dtype, is_list_like, - is_timedelta64_dtype, ) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -70,13 +69,13 @@ def __init__(self, data: Series, orig) -> None: def _get_values(self): data = self._parent - if is_datetime64_dtype(data.dtype): + if lib.is_np_dtype(data.dtype, "M"): return DatetimeIndex(data, copy=False, name=self.name) elif isinstance(data.dtype, DatetimeTZDtype): 
return DatetimeIndex(data, copy=False, name=self.name) - elif is_timedelta64_dtype(data.dtype): + elif lib.is_np_dtype(data.dtype, "m"): return TimedeltaIndex(data, copy=False, name=self.name) elif isinstance(data.dtype, PeriodDtype): @@ -593,11 +592,11 @@ def __new__(cls, data: Series): if isinstance(data.dtype, ArrowDtype) and data.dtype.kind == "M": return ArrowTemporalProperties(data, orig) - if is_datetime64_dtype(data.dtype): + if lib.is_np_dtype(data.dtype, "M"): return DatetimeProperties(data, orig) elif isinstance(data.dtype, DatetimeTZDtype): return DatetimeProperties(data, orig) - elif is_timedelta64_dtype(data.dtype): + elif lib.is_np_dtype(data.dtype, "m"): return TimedeltaProperties(data, orig) elif isinstance(data.dtype, PeriodDtype): return PeriodProperties(data, orig) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index c0775a17d76d1..f3752efc206ad 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -34,7 +34,6 @@ is_integer, is_scalar, is_signed_integer_dtype, - is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCTimedeltaIndex @@ -978,7 +977,7 @@ def _arith_method(self, other, op): # GH#19333 is_integer evaluated True on timedelta64, # so we need to catch these explicitly return super()._arith_method(other, op) - elif is_timedelta64_dtype(other): + elif lib.is_np_dtype(getattr(other, "dtype", None), "m"): # Must be an np.ndarray; GH#22390 return super()._arith_method(other, op) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 41409bb05a41a..0a9f40bf45a96 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -18,7 +18,6 @@ from pandas.core.dtypes.common import ( is_dtype_equal, is_scalar, - is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCSeries @@ -199,7 +198,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? 
""" - return is_timedelta64_dtype(dtype) # aka self._data._is_recognized_dtype + return lib.is_np_dtype(dtype, "m") # aka self._data._is_recognized_dtype # ------------------------------------------------------------------- # Indexing Methods diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index 45cf038ebc19e..2fcb0de6b5451 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -19,6 +19,7 @@ import numpy as np +from pandas._libs import lib from pandas._libs.tslibs import Timestamp from pandas._typing import ( DtypeObj, @@ -33,7 +34,6 @@ is_datetime64_any_dtype, is_extension_array_dtype, is_numeric_dtype, - is_timedelta64_dtype, ) from pandas.core.arrays.arrow.dtype import ArrowDtype @@ -363,7 +363,7 @@ def select_describe_func( return describe_numeric_1d elif is_datetime64_any_dtype(data.dtype): return describe_timestamp_1d - elif is_timedelta64_dtype(data.dtype): + elif lib.is_np_dtype(data.dtype, "m"): return describe_numeric_1d else: return describe_categorical_1d diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index bb5c943c6318e..f3f2f41491e87 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -15,6 +15,7 @@ from pandas._libs import ( Timedelta, Timestamp, + lib, ) from pandas._libs.lib import infer_dtype @@ -28,7 +29,6 @@ is_list_like, is_numeric_dtype, is_scalar, - is_timedelta64_dtype, ) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -487,10 +487,10 @@ def _coerce_to_type(x): if isinstance(x.dtype, DatetimeTZDtype): dtype = x.dtype - elif is_datetime64_dtype(x.dtype): + elif lib.is_np_dtype(x.dtype, "M"): x = to_datetime(x).astype("datetime64[ns]", copy=False) dtype = np.dtype("datetime64[ns]") - elif is_timedelta64_dtype(x.dtype): + elif lib.is_np_dtype(x.dtype, "m"): x = to_timedelta(x) dtype = np.dtype("timedelta64[ns]") elif is_bool_dtype(x.dtype): @@ -525,7 +525,7 @@ def _convert_bin_to_numeric_type(bins, dtype: DtypeObj | None): ValueError if bins are not of a compat dtype to dtype """ bins_dtype = infer_dtype(bins, skipna=False) - if is_timedelta64_dtype(dtype): + if lib.is_np_dtype(dtype, "m"): if bins_dtype in ["timedelta", "timedelta64"]: bins = to_timedelta(bins).view(np.int64) else: @@ -584,7 +584,7 @@ def _format_labels( elif is_datetime64_dtype(dtype): formatter = Timestamp adjust = lambda x: x - Timedelta("1ns") - elif is_timedelta64_dtype(dtype): + elif lib.is_np_dtype(dtype, "m"): formatter = Timedelta adjust = lambda x: x - Timedelta("1ns") else: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 4e2da746e0803..ae67b05047a98 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -52,15 +52,11 @@ from pandas.core.dtypes.common import ( is_complex_dtype, - is_datetime64_dtype, is_float, - is_float_dtype, is_integer, - is_integer_dtype, is_list_like, is_numeric_dtype, is_scalar, - is_timedelta64_dtype, ) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -1290,17 +1286,17 @@ def format_array( List[str] """ fmt_klass: type[GenericArrayFormatter] - if is_datetime64_dtype(values.dtype): + if lib.is_np_dtype(values.dtype, "M"): fmt_klass = Datetime64Formatter elif isinstance(values.dtype, DatetimeTZDtype): fmt_klass = Datetime64TZFormatter - elif is_timedelta64_dtype(values.dtype): + elif lib.is_np_dtype(values.dtype, "m"): fmt_klass = Timedelta64Formatter elif isinstance(values.dtype, ExtensionDtype): fmt_klass = ExtensionArrayFormatter - elif is_float_dtype(values.dtype) or 
is_complex_dtype(values.dtype): + elif lib.is_np_dtype(values.dtype, "fc"): fmt_klass = FloatArrayFormatter - elif is_integer_dtype(values.dtype): + elif lib.is_np_dtype(values.dtype, "iu"): fmt_klass = IntArrayFormatter else: fmt_klass = GenericArrayFormatter diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 4448bfbe977d5..7decab539da34 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -12,6 +12,7 @@ ) import warnings +from pandas._libs import lib from pandas._libs.json import loads from pandas._libs.tslibs import timezones from pandas.util._exceptions import find_stack_level @@ -19,11 +20,9 @@ from pandas.core.dtypes.base import _registry as registry from pandas.core.dtypes.common import ( is_bool_dtype, - is_datetime64_dtype, is_integer_dtype, is_numeric_dtype, is_string_dtype, - is_timedelta64_dtype, ) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -84,9 +83,9 @@ def as_json_table_type(x: DtypeObj) -> str: return "boolean" elif is_numeric_dtype(x): return "number" - elif is_datetime64_dtype(x) or isinstance(x, (DatetimeTZDtype, PeriodDtype)): + elif lib.is_np_dtype(x, "M") or isinstance(x, (DatetimeTZDtype, PeriodDtype)): return "datetime" - elif is_timedelta64_dtype(x): + elif lib.is_np_dtype(x, "m"): return "duration" elif isinstance(x, ExtensionDtype): return "any" diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index fcaf4d984a4c6..da0ca940791ba 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -61,7 +61,6 @@ is_list_like, is_object_dtype, is_string_dtype, - is_timedelta64_dtype, needs_i8_conversion, ) from pandas.core.dtypes.dtypes import ( @@ -2380,7 +2379,7 @@ def _get_atom(cls, values: ArrayLike) -> Col: atom = cls.get_atom_data(shape, kind=codes.dtype.name) elif is_datetime64_dtype(dtype) or isinstance(dtype, DatetimeTZDtype): atom = cls.get_atom_datetime64(shape) - elif is_timedelta64_dtype(dtype): + elif lib.is_np_dtype(dtype, "m"): atom = cls.get_atom_timedelta64(shape) elif is_complex_dtype(dtype): atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0]) @@ -3100,7 +3099,7 @@ def write_array( # attribute "tz" node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr] node._v_attrs.value_type = "datetime64" - elif is_timedelta64_dtype(value.dtype): + elif lib.is_np_dtype(value.dtype, "m"): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" elif empty_array: diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 8cd5b8adb27a5..48ab0f1be8c4a 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -190,7 +190,6 @@ def test_as_json_table_type_bool_dtypes(self, bool_dtype): @pytest.mark.parametrize( "date_dtype", [ - np.datetime64, np.dtype(" Date: Wed, 12 Apr 2023 16:41:32 -0700 Subject: [PATCH 07/14] CLN: Use #pragma once instead of include guards (#52635) Use #pragma once --- pandas/_libs/pd_parser.h | 4 +--- pandas/_libs/src/headers/portable.h | 5 +---- pandas/_libs/src/inline_helper.h | 5 +---- pandas/_libs/src/parser/io.h | 5 +---- pandas/_libs/src/parser/tokenizer.h | 5 +---- pandas/_libs/src/skiplist.h | 5 +---- pandas/_libs/src/ujson/lib/ultrajson.h | 5 +---- pandas/_libs/src/ujson/python/version.h | 5 +---- pandas/_libs/tslibs/src/datetime/date_conversions.h | 5 +---- pandas/_libs/tslibs/src/datetime/np_datetime.h | 6 +----- 
pandas/_libs/tslibs/src/datetime/np_datetime_strings.h | 4 +--- pandas/_libs/tslibs/src/datetime/pd_datetime.h | 4 +--- 12 files changed, 12 insertions(+), 46 deletions(-) diff --git a/pandas/_libs/pd_parser.h b/pandas/_libs/pd_parser.h index acdc08bbad484..72254090c0056 100644 --- a/pandas/_libs/pd_parser.h +++ b/pandas/_libs/pd_parser.h @@ -6,8 +6,7 @@ All rights reserved. Distributed under the terms of the BSD Simplified License. */ -#ifndef PANDAS__LIBS_PD_PARSER_H_ -#define PANDAS__LIBS_PD_PARSER_H_ +#pragma once #ifdef __cplusplus extern "C" { @@ -110,4 +109,3 @@ static PandasParser_CAPI *PandasParserAPI = NULL; #ifdef __cplusplus } #endif -#endif // PANDAS__LIBS_PD_PARSER_H_ diff --git a/pandas/_libs/src/headers/portable.h b/pandas/_libs/src/headers/portable.h index 91b4702d32452..a34f833b7fd6b 100644 --- a/pandas/_libs/src/headers/portable.h +++ b/pandas/_libs/src/headers/portable.h @@ -1,5 +1,4 @@ -#ifndef _PANDAS_PORTABLE_H_ -#define _PANDAS_PORTABLE_H_ +#pragma once #include @@ -14,5 +13,3 @@ #define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5)) #define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c)) #define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c)) - -#endif diff --git a/pandas/_libs/src/inline_helper.h b/pandas/_libs/src/inline_helper.h index 40fd45762ffe4..c77da0e52b9d3 100644 --- a/pandas/_libs/src/inline_helper.h +++ b/pandas/_libs/src/inline_helper.h @@ -7,8 +7,7 @@ Distributed under the terms of the BSD Simplified License. The full license is in the LICENSE file, distributed with this software. */ -#ifndef PANDAS__LIBS_SRC_INLINE_HELPER_H_ -#define PANDAS__LIBS_SRC_INLINE_HELPER_H_ +#pragma once #ifndef PANDAS_INLINE #if defined(__clang__) @@ -23,5 +22,3 @@ The full license is in the LICENSE file, distributed with this software. #define PANDAS_INLINE #endif // __GNUC__ #endif // PANDAS_INLINE - -#endif // PANDAS__LIBS_SRC_INLINE_HELPER_H_ diff --git a/pandas/_libs/src/parser/io.h b/pandas/_libs/src/parser/io.h index f0e8b01855304..9032eb6759358 100644 --- a/pandas/_libs/src/parser/io.h +++ b/pandas/_libs/src/parser/io.h @@ -7,8 +7,7 @@ Distributed under the terms of the BSD Simplified License. The full license is in the LICENSE file, distributed with this software. 
*/ -#ifndef PANDAS__LIBS_SRC_PARSER_IO_H_ -#define PANDAS__LIBS_SRC_PARSER_IO_H_ +#pragma once #define PY_SSIZE_T_CLEAN #include @@ -30,5 +29,3 @@ int del_rd_source(void *src); void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors); - -#endif // PANDAS__LIBS_SRC_PARSER_IO_H_ diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index eea9bfd4828d6..7e8c3d102ac63 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -9,8 +9,7 @@ See LICENSE for the license */ -#ifndef PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_ -#define PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_ +#pragma once #define PY_SSIZE_T_CLEAN #include @@ -232,5 +231,3 @@ double precise_xstrtod(const char *p, char **q, char decimal, double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); int to_boolean(const char *item, uint8_t *val); - -#endif // PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_ diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h index 5d0b144a1fe61..d94099da5890e 100644 --- a/pandas/_libs/src/skiplist.h +++ b/pandas/_libs/src/skiplist.h @@ -13,8 +13,7 @@ Port of Wes McKinney's Cython version of Raymond Hettinger's original pure Python recipe (https://rhettinger.wordpress.com/2010/02/06/lost-knowledge/) */ -#ifndef PANDAS__LIBS_SRC_SKIPLIST_H_ -#define PANDAS__LIBS_SRC_SKIPLIST_H_ +#pragma once #include #include @@ -296,5 +295,3 @@ PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) { --(skp->size); return 1; } - -#endif // PANDAS__LIBS_SRC_SKIPLIST_H_ diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index aab4246150abd..d359cf27ff7e2 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -49,8 +49,7 @@ tree doesn't have cyclic references. */ -#ifndef PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ -#define PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ +#pragma once #include #include @@ -313,5 +312,3 @@ typedef struct __JSONObjectDecoder { EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t); - -#endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ diff --git a/pandas/_libs/src/ujson/python/version.h b/pandas/_libs/src/ujson/python/version.h index 15c55309d6270..97232dd821387 100644 --- a/pandas/_libs/src/ujson/python/version.h +++ b/pandas/_libs/src/ujson/python/version.h @@ -35,9 +35,6 @@ Numeric decoder derived from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. */ -#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_ -#define PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_ +#pragma once #define UJSON_VERSION "1.33" - -#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_ diff --git a/pandas/_libs/tslibs/src/datetime/date_conversions.h b/pandas/_libs/tslibs/src/datetime/date_conversions.h index 45ba710dd42f2..8412b512b1471 100644 --- a/pandas/_libs/tslibs/src/datetime/date_conversions.h +++ b/pandas/_libs/tslibs/src/datetime/date_conversions.h @@ -5,8 +5,7 @@ Distributed under the terms of the BSD Simplified License. The full license is in the LICENSE file, distributed with this software. 
*/ -#ifndef PANDAS__LIBS_TSLIBS_SRC_DATETIME_DATE_CONVERSIONS_H_ -#define PANDAS__LIBS_TSLIBS_SRC_DATETIME_DATE_CONVERSIONS_H_ +#pragma once #define PY_SSIZE_T_CLEAN #include @@ -35,5 +34,3 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len); npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base); char *int64ToIsoDuration(int64_t value, size_t *len); - -#endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_DATE_CONVERSIONS_H_ diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h index 68f72683ab2e4..5c5b31ef9bb2f 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h @@ -14,8 +14,7 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt */ -#ifndef PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_H_ -#define PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_H_ +#pragma once #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION @@ -114,6 +113,3 @@ add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes); */ PyArray_DatetimeMetaData get_datetime_metadata_from_dtype( PyArray_Descr *dtype); - - -#endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_H_ diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index a635192d70809..1098637e798fe 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -19,8 +19,7 @@ This file implements string parsing and creation for NumPy datetime. */ -#ifndef PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ -#define PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ +#pragma once #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION @@ -108,4 +107,3 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, */ int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, size_t *outlen); -#endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ diff --git a/pandas/_libs/tslibs/src/datetime/pd_datetime.h b/pandas/_libs/tslibs/src/datetime/pd_datetime.h index e80e9bbeb9e6c..4e3baf4b47ed0 100644 --- a/pandas/_libs/tslibs/src/datetime/pd_datetime.h +++ b/pandas/_libs/tslibs/src/datetime/pd_datetime.h @@ -15,8 +15,7 @@ All rights reserved. See NUMPY_LICENSE.txt for the license. 
*/ -#ifndef PANDAS__LIBS_TSLIBS_SRC_DATETIME_PD_DATETIME_H_ -#define PANDAS__LIBS_TSLIBS_SRC_DATETIME_PD_DATETIME_H_ +#pragma once #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION @@ -112,4 +111,3 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL; #ifdef __cplusplus } #endif -#endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_PD_DATETIME_H_ From b66b937906fa4d4223ef43541fdfc861f9d16354 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 12 Apr 2023 17:06:40 -0700 Subject: [PATCH 08/14] Refactored custom datetime functions (#52634) refactored custom datetime functions --- .../tslibs/src/datetime/date_conversions.c | 63 ------- .../tslibs/src/datetime/date_conversions.h | 9 - .../_libs/tslibs/src/datetime/np_datetime.c | 88 ---------- .../_libs/tslibs/src/datetime/np_datetime.h | 3 - .../_libs/tslibs/src/datetime/pd_datetime.c | 155 ++++++++++++++++++ 5 files changed, 155 insertions(+), 163 deletions(-) diff --git a/pandas/_libs/tslibs/src/datetime/date_conversions.c b/pandas/_libs/tslibs/src/datetime/date_conversions.c index e2d583470fa51..190713d62d306 100644 --- a/pandas/_libs/tslibs/src/datetime/date_conversions.c +++ b/pandas/_libs/tslibs/src/datetime/date_conversions.c @@ -73,69 +73,6 @@ npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { return dt; } -/* Convert PyDatetime To ISO C-string. mutates len */ -char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, - size_t *len) { - npy_datetimestruct dts; - int ret; - - ret = convert_pydatetime_to_datetimestruct(obj, &dts); - if (ret != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "Could not convert PyDateTime to numpy datetime"); - } - return NULL; - } - - *len = (size_t)get_datetime_iso_8601_strlen(0, base); - char *result = PyObject_Malloc(*len); - // Check to see if PyDateTime has a timezone. - // Don't convert to UTC if it doesn't. - int is_tz_aware = 0; - if (PyObject_HasAttrString(obj, "tzinfo")) { - PyObject *offset = extract_utc_offset(obj); - if (offset == NULL) { - PyObject_Free(result); - return NULL; - } - is_tz_aware = offset != Py_None; - Py_DECREF(offset); - } - ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base); - - if (ret != 0) { - PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); - PyObject_Free(result); - return NULL; - } - - // Note that get_datetime_iso_8601_strlen just gives a generic size - // for ISO string conversion, not the actual size used - *len = strlen(result); - return result; -} - -npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) { - npy_datetimestruct dts; - int ret; - - ret = convert_pydatetime_to_datetimestruct(dt, &dts); - if (ret != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "Could not convert PyDateTime to numpy datetime"); - } - // TODO(username): is setting errMsg required? 
- // ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - // return NULL; - } - - npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); - return NpyDateTimeToEpoch(npy_dt, base); -} - /* Converts the int64_t representation of a duration to ISO; mutates len */ char *int64ToIsoDuration(int64_t value, size_t *len) { pandas_timedeltastruct tds; diff --git a/pandas/_libs/tslibs/src/datetime/date_conversions.h b/pandas/_libs/tslibs/src/datetime/date_conversions.h index 8412b512b1471..3f9dad918938e 100644 --- a/pandas/_libs/tslibs/src/datetime/date_conversions.h +++ b/pandas/_libs/tslibs/src/datetime/date_conversions.h @@ -24,13 +24,4 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len); // replace with scaleNanosecToUnit npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base); -// Converts a Python object representing a Date / Datetime to ISO format -// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z -// while base="ns" yields "2020-01-01T00:00:00.000000000Z" -// len is mutated to save the length of the returned string -char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len); - -// Convert a Python Date/Datetime to Unix epoch with resolution base -npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base); - char *int64ToIsoDuration(int64_t value, size_t *len); diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index 0b3a973cc9b6c..e4d9c5dcd63ea 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -299,94 +299,6 @@ PyObject *extract_utc_offset(PyObject *obj) { return tmp; } -/* - * - * Converts a Python datetime.datetime or datetime.date - * object into a NumPy npy_datetimestruct. Uses tzinfo (if present) - * to convert to UTC time. - * - * The following implementation just asks for attributes, and thus - * supports datetime duck typing. The tzinfo time zone conversion - * requires this style of access as well. - * - * Returns -1 on error, 0 on success, and 1 (with no error set) - * if obj doesn't have the needed date or datetime attributes. - */ -int convert_pydatetime_to_datetimestruct(PyObject *dtobj, - npy_datetimestruct *out) { - // Assumes that obj is a valid datetime object - PyObject *tmp; - PyObject *obj = (PyObject*)dtobj; - - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); - out->month = 1; - out->day = 1; - - out->year = PyLong_AsLong(PyObject_GetAttrString(obj, "year")); - out->month = PyLong_AsLong(PyObject_GetAttrString(obj, "month")); - out->day = PyLong_AsLong(PyObject_GetAttrString(obj, "day")); - - // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use - // PyDateTime_Check here, and less verbose attribute lookups. 
- - /* Check for time attributes (if not there, return success as a date) */ - if (!PyObject_HasAttrString(obj, "hour") || - !PyObject_HasAttrString(obj, "minute") || - !PyObject_HasAttrString(obj, "second") || - !PyObject_HasAttrString(obj, "microsecond")) { - return 0; - } - - out->hour = PyLong_AsLong(PyObject_GetAttrString(obj, "hour")); - out->min = PyLong_AsLong(PyObject_GetAttrString(obj, "minute")); - out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second")); - out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond")); - - if (PyObject_HasAttrString(obj, "tzinfo")) { - PyObject *offset = extract_utc_offset(obj); - /* Apply the time zone offset if datetime obj is tz-aware */ - if (offset != NULL) { - if (offset == Py_None) { - Py_DECREF(offset); - return 0; - } - PyObject *tmp_int; - int seconds_offset, minutes_offset; - /* - * The timedelta should have a function "total_seconds" - * which contains the value we want. - */ - tmp = PyObject_CallMethod(offset, "total_seconds", ""); - Py_DECREF(offset); - if (tmp == NULL) { - return -1; - } - tmp_int = PyNumber_Long(tmp); - if (tmp_int == NULL) { - Py_DECREF(tmp); - return -1; - } - seconds_offset = PyLong_AsLong(tmp_int); - if (seconds_offset == -1 && PyErr_Occurred()) { - Py_DECREF(tmp_int); - Py_DECREF(tmp); - return -1; - } - Py_DECREF(tmp_int); - Py_DECREF(tmp); - - /* Convert to a minutes offset and apply it */ - minutes_offset = seconds_offset / 60; - - add_minutes_to_datetimestruct(out, -minutes_offset); - } - } - - return 0; -} - - /* * Converts a datetime from a datetimestruct to a datetime based * on a metadata unit. The date is assumed to be valid. diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h index 5c5b31ef9bb2f..6b5135f559482 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h @@ -66,9 +66,6 @@ static const npy_datetimestruct _M_MAX_DTS = { PyObject *extract_utc_offset(PyObject *obj); -int convert_pydatetime_to_datetimestruct(PyObject *dtobj, - npy_datetimestruct *out); - npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, const npy_datetimestruct *dts); diff --git a/pandas/_libs/tslibs/src/datetime/pd_datetime.c b/pandas/_libs/tslibs/src/datetime/pd_datetime.c index 73f63706f2a88..98b6073d7a488 100644 --- a/pandas/_libs/tslibs/src/datetime/pd_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/pd_datetime.c @@ -28,6 +28,160 @@ static void pandas_datetime_destructor(PyObject *op) { PyMem_Free(ptr); } +/* + * + * Converts a Python datetime.datetime or datetime.date + * object into a NumPy npy_datetimestruct. Uses tzinfo (if present) + * to convert to UTC time. + * + * The following implementation just asks for attributes, and thus + * supports datetime duck typing. The tzinfo time zone conversion + * requires this style of access as well. + * + * Returns -1 on error, 0 on success, and 1 (with no error set) + * if obj doesn't have the needed date or datetime attributes. 
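+ *
+ * (Note: this helper was moved from np_datetime.c and made static in
+ * this patch, so it is now local to pd_datetime.c.)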
+ */ +static int convert_pydatetime_to_datetimestruct(PyObject *dtobj, + npy_datetimestruct *out) { + // Assumes that obj is a valid datetime object + PyObject *tmp; + PyObject *obj = (PyObject*)dtobj; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(npy_datetimestruct)); + out->month = 1; + out->day = 1; + + out->year = PyLong_AsLong(PyObject_GetAttrString(obj, "year")); + out->month = PyLong_AsLong(PyObject_GetAttrString(obj, "month")); + out->day = PyLong_AsLong(PyObject_GetAttrString(obj, "day")); + + // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use + // PyDateTime_Check here, and less verbose attribute lookups. + + /* Check for time attributes (if not there, return success as a date) */ + if (!PyObject_HasAttrString(obj, "hour") || + !PyObject_HasAttrString(obj, "minute") || + !PyObject_HasAttrString(obj, "second") || + !PyObject_HasAttrString(obj, "microsecond")) { + return 0; + } + + out->hour = PyLong_AsLong(PyObject_GetAttrString(obj, "hour")); + out->min = PyLong_AsLong(PyObject_GetAttrString(obj, "minute")); + out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second")); + out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond")); + + if (PyObject_HasAttrString(obj, "tzinfo")) { + PyObject *offset = extract_utc_offset(obj); + /* Apply the time zone offset if datetime obj is tz-aware */ + if (offset != NULL) { + if (offset == Py_None) { + Py_DECREF(offset); + return 0; + } + PyObject *tmp_int; + int seconds_offset, minutes_offset; + /* + * The timedelta should have a function "total_seconds" + * which contains the value we want. + */ + tmp = PyObject_CallMethod(offset, "total_seconds", ""); + Py_DECREF(offset); + if (tmp == NULL) { + return -1; + } + tmp_int = PyNumber_Long(tmp); + if (tmp_int == NULL) { + Py_DECREF(tmp); + return -1; + } + seconds_offset = PyLong_AsLong(tmp_int); + if (seconds_offset == -1 && PyErr_Occurred()) { + Py_DECREF(tmp_int); + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp_int); + Py_DECREF(tmp); + + /* Convert to a minutes offset and apply it */ + minutes_offset = seconds_offset / 60; + + add_minutes_to_datetimestruct(out, -minutes_offset); + } + } + + return 0; +} + +// Converts a Python object representing a Date / Datetime to ISO format +// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z +// while base="ns" yields "2020-01-01T00:00:00.000000000Z" +// len is mutated to save the length of the returned string +static char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, + size_t *len) { + npy_datetimestruct dts; + int ret; + + ret = convert_pydatetime_to_datetimestruct(obj, &dts); + if (ret != 0) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "Could not convert PyDateTime to numpy datetime"); + } + return NULL; + } + + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); + // Check to see if PyDateTime has a timezone. + // Don't convert to UTC if it doesn't. 
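+  // (extract_utc_offset returns Py_None for a naive datetime, in which
+  // case is_tz_aware below stays 0)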
+  int is_tz_aware = 0;
+  if (PyObject_HasAttrString(obj, "tzinfo")) {
+    PyObject *offset = extract_utc_offset(obj);
+    if (offset == NULL) {
+      PyObject_Free(result);
+      return NULL;
+    }
+    is_tz_aware = offset != Py_None;
+    Py_DECREF(offset);
+  }
+  ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base);
+
+  if (ret != 0) {
+    PyErr_SetString(PyExc_ValueError,
+                    "Could not convert datetime value to string");
+    PyObject_Free(result);
+    return NULL;
+  }
+
+  // Note that get_datetime_iso_8601_strlen just gives a generic size
+  // for ISO string conversion, not the actual size used
+  *len = strlen(result);
+  return result;
+}
+
+// Convert a Python Date/Datetime to Unix epoch with resolution base
+static npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) {
+  npy_datetimestruct dts;
+  int ret;
+
+  ret = convert_pydatetime_to_datetimestruct(dt, &dts);
+  if (ret != 0) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_ValueError,
+                      "Could not convert PyDateTime to numpy datetime");
+    }
+    // TODO(username): is setting errMsg required?
+    // ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
+    // return NULL;
+  }
+
+  npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
+  return NpyDateTimeToEpoch(npy_dt, base);
+}
+
 static int pandas_datetime_exec(PyObject *module) {
   PyDateTime_IMPORT;
   PandasDateTime_CAPI *capi = PyMem_Malloc(sizeof(PandasDateTime_CAPI));
@@ -94,5 +248,6 @@ static struct PyModuleDef pandas_datetimemodule = {
     .m_slots = pandas_datetime_slots};
 
 PyMODINIT_FUNC PyInit_pandas_datetime(void) {
+  PyDateTime_IMPORT;
   return PyModuleDef_Init(&pandas_datetimemodule);
 }
From 4b48e44aab6b21c0c7337b5950d95e8bda95cc65 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 12 Apr 2023 21:40:20 -0400
Subject: [PATCH 09/14] BLD: Add DLL hashes to RECORD (#52556)

---
 ci/fix_wheels.py  | 67 +++++++++++++++++++++++++----------------------
 ci/test_wheels.py | 12 +++++++++
 2 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/ci/fix_wheels.py b/ci/fix_wheels.py
index 525aacf572cd4..76b70fdde9ea0 100644
--- a/ci/fix_wheels.py
+++ b/ci/fix_wheels.py
@@ -1,5 +1,14 @@
+"""
+This file "repairs" our Windows wheels by copying the necessary DLLs for pandas to run
+on a barebones Windows installation into the wheel.
+
+NOTE: The paths for the DLLs are hard-coded to the location of the Visual Studio
+redistributables.
+"""
 import os
 import shutil
+import subprocess
+from subprocess import CalledProcessError
 import sys
 import zipfile
 
@@ -18,41 +27,35 @@
     raise ValueError(
         "User must pass the path to the wheel and the destination directory."
     )
-# Wheels are zip files
 if not os.path.isdir(dest_dir):
     print(f"Created directory {dest_dir}")
     os.mkdir(dest_dir)
-shutil.copy(wheel_path, dest_dir)  # Remember to delete if process fails
+
 wheel_name = os.path.basename(wheel_path)
 success = True
-exception = None
-repaired_wheel_path = os.path.join(dest_dir, wheel_name)
-with zipfile.ZipFile(repaired_wheel_path, "a") as zipf:
-    try:
-        # TODO: figure out how licensing works for the redistributables
-        base_redist_dir = (
-            f"C:/Program Files (x86)/Microsoft Visual Studio/2019/"
-            f"Enterprise/VC/Redist/MSVC/14.29.30133/{PYTHON_ARCH}/"
-            f"Microsoft.VC142.CRT/"
-        )
-        zipf.write(
-            os.path.join(base_redist_dir, "msvcp140.dll"),
-            "pandas/_libs/window/msvcp140.dll",
-        )
-        zipf.write(
-            os.path.join(base_redist_dir, "concrt140.dll"),
-            "pandas/_libs/window/concrt140.dll",
-        )
-        if not is_32:
-            zipf.write(
-                os.path.join(base_redist_dir, "vcruntime140_1.dll"),
-                "pandas/_libs/window/vcruntime140_1.dll",
-            )
-    except Exception as e:
-        success = False
-        exception = e
-if not success:
-    os.remove(repaired_wheel_path)
-    raise exception
-print(f"Successfully repaired wheel was written to {repaired_wheel_path}")
+try:
+    # Use the wheel CLI for zipping up the wheel since the CLI will
+    # take care of rebuilding the hashes found in the record file
+    tmp_dir = os.path.join(dest_dir, "tmp")
+    with zipfile.ZipFile(wheel_path, "r") as f:
+        # Extracting all the members of the zip
+        # into a specific location.
+        f.extractall(path=tmp_dir)
+    base_redist_dir = (
+        f"C:/Program Files (x86)/Microsoft Visual Studio/2019/"
+        f"Enterprise/VC/Redist/MSVC/14.29.30133/{PYTHON_ARCH}/"
+        f"Microsoft.VC142.CRT/"
+    )
+    required_dlls = ["msvcp140.dll", "concrt140.dll"]
+    if not is_32:
+        required_dlls += ["vcruntime140_1.dll"]
+    dest_dll_dir = os.path.join(tmp_dir, "pandas/_libs/window")
+    for dll in required_dlls:
+        src = os.path.join(base_redist_dir, dll)
+        shutil.copy(src, dest_dll_dir)
+    subprocess.run(["wheel", "pack", tmp_dir, "-d", dest_dir], check=True)
+except CalledProcessError:
+    print("Failed to add DLLs to wheel.")
+    sys.exit(1)
+print("Successfully repaired wheel")
diff --git a/ci/test_wheels.py b/ci/test_wheels.py
index f861c1cbedcad..75675d7e4ffc3 100644
--- a/ci/test_wheels.py
+++ b/ci/test_wheels.py
@@ -2,6 +2,7 @@
 import os
 import shutil
 import subprocess
+from subprocess import CalledProcessError
 import sys
 
 if os.name == "nt":
@@ -15,6 +16,17 @@
         wheel_path = None
     print(f"IS_32_BIT is {is_32_bit}")
     print(f"Path to built wheel is {wheel_path}")
+
+    print("Verifying file hashes in wheel RECORD file")
+    try:
+        tmp_dir = "tmp"
+        subprocess.run(["wheel", "unpack", wheel_path, "-d", tmp_dir], check=True)
+    except CalledProcessError:
+        print("wheel RECORD file hash verification failed.")
+        sys.exit(1)
+    finally:
+        shutil.rmtree(tmp_dir)
+
     if is_32_bit:
         sys.exit(0)  # No way to test Windows 32-bit(no docker image)
     if wheel_path is None:
From f066167ee36ecfac4d76227e113a1b332ec4c326 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 13 Apr 2023 03:18:15 -0700
Subject: [PATCH 10/14] CI: Remove ArrayManager job (#52637)

---
 .github/workflows/ubuntu.yml |  7 +------
 ci/run_tests.sh              | 15 ---------------
 2 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
index 97ca346142ec1..ee765d8c01e60 100644
--- a/.github/workflows/ubuntu.yml
+++ b/.github/workflows/ubuntu.yml
@@ -60,10 +60,6 @@ jobs:
           env_file: actions-310.yaml
          pattern: "not slow and not 
network and not single_cpu" pandas_copy_on_write: "1" - - name: "Data Manager" - env_file: actions-38.yaml - pattern: "not slow and not network and not single_cpu" - pandas_data_manager: "array" - name: "Pypy" env_file: actions-pypy-38.yaml pattern: "not slow and not network and not single_cpu" @@ -86,7 +82,6 @@ jobs: EXTRA_APT: ${{ matrix.extra_apt || '' }} LANG: ${{ matrix.lang || '' }} LC_ALL: ${{ matrix.lc_all || '' }} - PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }} PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} PANDAS_CI: ${{ matrix.pandas_ci || '1' }} TEST_ARGS: ${{ matrix.test_args || '' }} @@ -97,7 +92,7 @@ jobs: COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }} cancel-in-progress: true services: diff --git a/ci/run_tests.sh b/ci/run_tests.sh index e6de5caf955fc..3e79e5f60cba6 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -32,18 +32,3 @@ fi echo $PYTEST_CMD sh -c "$PYTEST_CMD" - -if [[ "$PANDAS_DATA_MANAGER" != "array" && "$PYTEST_TARGET" == "pandas" ]]; then - # The ArrayManager tests should have already been run by PYTEST_CMD if PANDAS_DATA_MANAGER was already set to array - # If we're targeting specific files, e.g. test_downstream.py, don't run. - PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas" - - if [[ "$PATTERN" ]]; then - PYTEST_AM_CMD="$PYTEST_AM_CMD -m \"$PATTERN and arraymanager\"" - else - PYTEST_AM_CMD="$PYTEST_AM_CMD -m \"arraymanager\"" - fi - - echo $PYTEST_AM_CMD - sh -c "$PYTEST_AM_CMD" -fi From 0836dac57c8e9bfebd8f9be1a9992c688071178e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Apr 2023 03:19:15 -0700 Subject: [PATCH 11/14] DOC: Remove notes to old Python/package versions (#52640) --- doc/source/development/extending.rst | 2 +- doc/source/getting_started/install.rst | 8 -------- doc/source/user_guide/advanced.rst | 2 +- doc/source/user_guide/io.rst | 9 ++------- doc/source/user_guide/merging.rst | 6 ------ doc/source/user_guide/missing_data.rst | 5 ----- doc/source/user_guide/text.rst | 17 +---------------- doc/source/user_guide/visualization.rst | 2 +- pandas/core/arrays/interval.py | 3 --- pandas/core/config_init.py | 2 +- pandas/core/dtypes/concat.py | 2 -- pandas/core/frame.py | 23 +++++++---------------- pandas/core/generic.py | 6 ++---- pandas/core/indexes/interval.py | 1 - pandas/core/resample.py | 4 +--- pandas/core/reshape/merge.py | 4 ---- pandas/core/series.py | 6 +++--- pandas/io/common.py | 2 +- pandas/io/gbq.py | 6 ------ pandas/io/pytables.py | 2 +- pandas/tests/io/formats/test_to_csv.py | 2 +- pandas/tests/series/test_constructors.py | 3 +-- 22 files changed, 24 insertions(+), 93 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 1d52a5595472b..b829cfced6962 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -450,7 +450,7 @@ Below is an example to define two original properties, "internal_cache" as a tem Plotting backends ----------------- -Starting in 0.25 
pandas can be extended with third-party plotting backends. The
+pandas can be extended with third-party plotting backends. The
 main idea is letting users select a plotting backend different than the provided
 one based on Matplotlib. For example:
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index e82cf8ff93bbc..9aa868dab30a6 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -149,14 +149,6 @@ to install pandas with the optional dependencies to read Excel files.
 The full list of extras that can be installed can be found in the
 :ref:`dependency section.`
 
-Installing with ActivePython
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Installation instructions for
-`ActivePython `__ can be found
-`here `__. Versions
-2.7, 3.5 and 3.6 include pandas.
-
 Installing using your Linux distribution's package manager.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
index 68024fbd05727..d76c7e2bf3b03 100644
--- a/doc/source/user_guide/advanced.rst
+++ b/doc/source/user_guide/advanced.rst
@@ -918,7 +918,7 @@ If you select a label *contained* within an interval, this will also select the
    df.loc[2.5]
    df.loc[[2.5, 3.5]]
 
-Selecting using an ``Interval`` will only return exact matches (starting from pandas 0.25.0).
+Selecting using an ``Interval`` will only return exact matches.
 
 .. ipython:: python
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 101932a23ca6a..dd6ea6eccc85c 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -3999,7 +3999,7 @@ any pickled pandas object (or any other pickled object) from file:
 
 .. warning::
 
-  :func:`read_pickle` is only guaranteed backwards compatible back to pandas version 0.20.3
+  :func:`read_pickle` is only guaranteed backwards compatible for a few minor releases.
 
 .. _io.pickle.compression:
 
@@ -5922,11 +5922,6 @@ And then issue the following queries:
 Google BigQuery
 ---------------
 
-.. warning::
-
-   Starting in 0.20.0, pandas has split off Google BigQuery support into the
-   separate package ``pandas-gbq``. You can ``pip install pandas-gbq`` to get it.
-
 The ``pandas-gbq`` package provides functionality to read/write from Google BigQuery.
 
 pandas integrates with this external package. if ``pandas-gbq`` is installed, you can
@@ -6114,7 +6109,7 @@ SAS formats
 -----------
 
 The top-level function :func:`read_sas` can read (but not write) SAS
-XPORT (.xpt) and (since *v0.18.0*) SAS7BDAT (.sas7bdat) format files.
+XPORT (.xpt) and SAS7BDAT (.sas7bdat) format files.
 
 SAS files only contain two value types: ASCII text and floating point
 values (usually 8 bytes but sometimes truncated). For xport files,
diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
index ce4b3d1e8c7f3..cf8d7a05bf6e7 100644
--- a/doc/source/user_guide/merging.rst
+++ b/doc/source/user_guide/merging.rst
@@ -510,12 +510,6 @@ all standard database join operations between ``DataFrame`` or named ``Series``
   dataset.
 * "many_to_many" or "m:m": allowed, but does not result in checks.
 
-.. note::
-
-   Support for specifying index levels as the ``on``, ``left_on``, and
-   ``right_on`` parameters was added in version 0.23.0.
-   Support for merging named ``Series`` objects was added in version 0.24.0.
-
 The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` or named ``Series``
 and ``right`` is a subclass of ``DataFrame``, the return type will still be ``DataFrame``.
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index 467c343f4ad1a..4d645cd75ac76 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -182,11 +182,6 @@ account for missing data. For example:
 Sum/prod of empties/nans
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. warning::
-
-   This behavior is now standard as of v0.22.0 and is consistent with the default in ``numpy``; previously sum/prod of all-NA or empty Series/DataFrames would return NaN.
-   See :ref:`v0.22.0 whatsnew ` for more.
-
 The sum of an empty or all-NA Series or column of a DataFrame is 0.
 
 .. ipython:: python
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index f188c08b7bb94..4e0b18c73ee29 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -206,8 +206,7 @@ and replacing any remaining whitespaces with underscores:
 
 .. warning::
 
-   Before v.0.25.0, the ``.str``-accessor did only the most rudimentary type checks. Starting with
-   v.0.25.0, the type of the Series is inferred and the allowed types (i.e. strings) are enforced more rigorously.
+   The type of the Series is inferred and the allowed types (i.e. strings) are enforced more rigorously.
 
    Generally speaking, the ``.str`` accessor is intended to work only on strings. With very few
    exceptions, other uses are not supported, and may be disabled at a later point.
@@ -423,11 +422,6 @@ the ``join``-keyword.
    s.str.cat(u)
    s.str.cat(u, join="left")
 
-.. warning::
-
-   If the ``join`` keyword is not passed, the method :meth:`~Series.str.cat` will currently fall back to the behavior before version 0.23.0 (i.e. no alignment),
-   but a ``FutureWarning`` will be raised if any of the involved indexes differ, since this default will change to ``join='left'`` in a future version.
-
 The usual options are available for ``join`` (one of ``'left', 'outer', 'inner', 'right'``).
 In particular, alignment also means that the different lengths do not need to coincide anymore.
@@ -503,15 +497,6 @@ Extracting substrings
 Extract first match in each subject (extract)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. warning::
-
-   Before version 0.23, argument ``expand`` of the ``extract`` method defaulted to
-   ``False``. When ``expand=False``, ``expand`` returns a ``Series``, ``Index``, or
-   ``DataFrame``, depending on the subject and regular expression
-   pattern. When ``expand=True``, it always returns a ``DataFrame``,
-   which is more consistent and less confusing from the perspective of a user.
-   ``expand=True`` has been the default since version 0.23.0.
-
 The ``extract`` method accepts a `regular expression
 `__ with at least one
 capture group.
diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst
index 844be80abd1ff..ae8de4d5386b1 100644
--- a/doc/source/user_guide/visualization.rst
+++ b/doc/source/user_guide/visualization.rst
@@ -1794,7 +1794,7 @@ when plotting a large number of points.
 Plotting backends
 -----------------
 
-Starting in version 0.25, pandas can be extended with third-party plotting backends. The
+pandas can be extended with third-party plotting backends. The
 main idea is letting users select a plotting backend different than the
 provided one based on Matplotlib.
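
A minimal sketch of what the backend selection described above looks like in
practice, assuming the optional third-party "plotly" backend package is
installed (the frame and column names here are illustrative):

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3], "y": [3, 1, 2]})

    # Route every subsequent .plot() call through the registered
    # "plotly" backend instead of the Matplotlib default...
    pd.set_option("plotting.backend", "plotly")
    fig = df.plot(x="x", y="y")

    # ...or opt in for a single call, leaving the global option untouched.
    fig = df.plot(x="x", y="y", backend="plotly")

Either spelling resolves the backend through the same lookup, so any package
that registers itself under the ``pandas_plotting_backends`` entry point group
can be selected this way.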
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1d233e0ebde1a..ea35a86095e15 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -124,8 +124,6 @@ ] = """ %(summary)s -.. versionadded:: %(versionadded)s - Parameters ---------- data : array-like (1-dimensional) @@ -187,7 +185,6 @@ % { "klass": "IntervalArray", "summary": "Pandas array for interval data that are closed on the same side.", - "versionadded": "0.24.0", "name": "", "extra_attributes": "", "extra_methods": "", diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index d3bdcee7a7341..5f1aa3a1e9535 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -275,7 +275,7 @@ def use_numba_cb(key) -> None: pc_large_repr_doc = """ : 'truncate'/'info' For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can - show a truncated table (the default from 0.13), or switch to the view from + show a truncated table, or switch to the view from df.info() (the behaviour in earlier versions of pandas). """ diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b55c8cd31c110..24fe1887002c9 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -240,8 +240,6 @@ def union_categoricals( ... TypeError: to union ordered Categoricals, all categories must be the same - New in version 0.20.0 - Ordered categoricals with different categories or orderings can be combined by using the `ignore_ordered=True` argument. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dfee04a784630..0e8f2b0044c66 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -379,12 +379,6 @@ merge_asof : Merge on nearest keys. DataFrame.join : Similar method using indices. -Notes ------ -Support for specifying index levels as the `on`, `left_on`, and -`right_on` parameters was added in version 0.23.0 -Support for merging named Series objects was added in version 0.24.0 - Examples -------- >>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], @@ -1501,7 +1495,7 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: This method computes the matrix product between the DataFrame and the values of an other Series, DataFrame or a numpy array. - It can also be called using ``self @ other`` in Python >= 3.5. + It can also be called using ``self @ other``. Parameters ---------- @@ -1619,13 +1613,13 @@ def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: """ - Matrix multiplication using binary `@` operator in Python>=3.5. + Matrix multiplication using binary `@` operator. """ return self.dot(other) def __rmatmul__(self, other) -> DataFrame: """ - Matrix multiplication using binary `@` operator in Python>=3.5. + Matrix multiplication using binary `@` operator. """ try: return self.T.dot(np.transpose(other)).T @@ -2700,8 +2694,8 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: it will be used as Root Directory path when writing a partitioned dataset. **kwargs : Additional keywords passed to :func:`pyarrow.feather.write_feather`. - Starting with pyarrow 0.17, this includes the `compression`, - `compression_level`, `chunksize` and `version` keywords. + This includes the `compression`, `compression_level`, `chunksize` + and `version` keywords. .. 
versionadded:: 1.1.0 @@ -4631,8 +4625,8 @@ def select_dtypes(self, include=None, exclude=None) -> Self: * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or ``'timedelta64'`` * To select Pandas categorical dtypes, use ``'category'`` - * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in - 0.20.0) or ``'datetime64[ns, tz]'`` + * To select Pandas datetimetz dtypes, use ``'datetimetz'`` + or ``'datetime64[ns, tz]'`` Examples -------- @@ -9983,9 +9977,6 @@ def join( Parameters `on`, `lsuffix`, and `rsuffix` are not supported when passing a list of `DataFrame` objects. - Support for specifying index levels as the `on` parameter was added - in version 0.23.0. - Examples -------- >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0c14c76ab539f..800aaf47e1631 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2606,7 +2606,7 @@ def to_hdf( A value of 0 or None disables compression. complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' Specifies the compression library to be used. - As of v0.20.2 these additional compressors for Blosc are supported + These additional compressors for Blosc are supported (default if no compressor specified: 'blosc:blosclz'): {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd'}. @@ -7537,9 +7537,7 @@ def interpolate( 'cubicspline': Wrappers around the SciPy interpolation methods of similar names. See `Notes`. * 'from_derivatives': Refers to - `scipy.interpolate.BPoly.from_derivatives` which - replaces 'piecewise_polynomial' interpolation method in - scipy 0.18. + `scipy.interpolate.BPoly.from_derivatives`. axis : {{0 or 'index', 1 or 'columns', None}}, default None Axis to interpolate along. For `Series` this parameter is unused diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 965c0ba9be1e3..8cf5151a8f0b5 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -154,7 +154,6 @@ def _new_IntervalIndex(cls, d): "klass": "IntervalIndex", "summary": "Immutable index of intervals that are closed on the same side.", "name": _index_doc_kwargs["name"], - "versionadded": "0.20.0", "extra_attributes": "is_overlapping\nvalues\n", "extra_methods": "", "examples": textwrap.dedent( diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0b9ebb1117821..50978275eb5e5 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -886,9 +886,7 @@ def interpolate( 'cubicspline': Wrappers around the SciPy interpolation methods of similar names. See `Notes`. * 'from_derivatives': Refers to - `scipy.interpolate.BPoly.from_derivatives` which - replaces 'piecewise_polynomial' interpolation method in - scipy 0.18. + `scipy.interpolate.BPoly.from_derivatives`. axis : {{0 or 'index', 1 or 'columns', None}}, default None Axis to interpolate along. For `Series` this parameter is unused diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0281a0a9f562e..03773a77de0ae 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -389,10 +389,6 @@ def merge_asof( - A "nearest" search selects the row in the right DataFrame whose 'on' key is closest in absolute distance to the left's key. - The default is "backward" and is compatible in versions below 0.20.0. - The direction parameter was added in version 0.20.0 and introduces - "forward" and "nearest". 
- Optionally match on equivalent keys with 'by' before searching with 'on'. Parameters diff --git a/pandas/core/series.py b/pandas/core/series.py index e11eda33b2e34..a9d63c5d03bf8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2891,7 +2891,7 @@ def dot(self, other: AnyArrayLike) -> Series | np.ndarray: one, or the Series and each columns of a DataFrame, or the Series and each columns of an array. - It can also be called using `self @ other` in Python >= 3.5. + It can also be called using `self @ other`. Parameters ---------- @@ -2963,13 +2963,13 @@ def dot(self, other: AnyArrayLike) -> Series | np.ndarray: def __matmul__(self, other): """ - Matrix multiplication using binary `@` operator in Python>=3.5. + Matrix multiplication using binary `@` operator. """ return self.dot(other) def __rmatmul__(self, other): """ - Matrix multiplication using binary `@` operator in Python>=3.5. + Matrix multiplication using binary `@` operator. """ return self.dot(np.transpose(other)) diff --git a/pandas/io/common.py b/pandas/io/common.py index 13185603c7bac..02de416e5ce37 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -243,7 +243,7 @@ def stringify_path( Notes ----- - Objects supporting the fspath protocol (python 3.6+) are coerced + Objects supporting the fspath protocol are coerced according to its __fspath__ method. Any other object is passed through unchanged, which includes bytes, diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index d6c73664ab6f2..286d2b187c700 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -134,8 +134,6 @@ def read_gbq( If set, limit the maximum number of rows to fetch from the query results. - *New in version 0.12.0 of pandas-gbq*. - .. versionadded:: 1.1.0 progress_bar_type : Optional, str If set, use the `tqdm `__ library to @@ -156,10 +154,6 @@ def read_gbq( Use the :func:`tqdm.tqdm_gui` function to display a progress bar as a graphical dialog box. - Note that this feature requires version 0.12.0 or later of the - ``pandas-gbq`` package. And it requires the ``tqdm`` package. Slightly - different than ``pandas-gbq``, here the default is ``None``. - Returns ------- df: DataFrame diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index da0ca940791ba..85000d49cdac6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -515,7 +515,7 @@ class HDFStore: A value of 0 or None disables compression. complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' Specifies the compression library to be used. - As of v0.20.2 these additional compressors for Blosc are supported + These additional compressors for Blosc are supported (default if no compressor specified: 'blosc:blosclz'): {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd'}. diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 4e47e4197c710..81dc79d3111b8 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -508,7 +508,7 @@ def test_to_csv_stdout_file(self, capsys): reason=( "Especially in Windows, file stream should not be passed" "to csv writer without newline='' option." 
- "(https://docs.python.org/3.6/library/csv.html#csv.writer)" + "(https://docs.python.org/3/library/csv.html#csv.writer)" ), ) def test_to_csv_write_to_open_file(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 8e883f9cec8ea..0a8341476dc56 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1346,8 +1346,7 @@ def test_constructor_dict_list_value_explicit_dtype(self): def test_constructor_dict_order(self): # GH19018 - # initialization ordering: by insertion order if python>= 3.6, else - # order by value + # initialization ordering: by insertion order d = {"b": 1, "a": 0, "c": 2} result = Series(d) expected = Series([1, 0, 2], index=list("bac")) From 42e2378adeca4980f1578a2bb034badbc02f86af Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Apr 2023 09:47:32 -0700 Subject: [PATCH 12/14] STYLE sort whatsnew entries alphabeticaly, allow for trailing full stops (#52598) * allow for trailing full stops in sort-whatsnew-entries hook * sort alphabetically instead --------- Co-authored-by: MarcoGorelli <> --- doc/source/whatsnew/v2.0.1.rst | 28 ++++--- doc/source/whatsnew/v2.1.0.rst | 98 ++++++++++++------------ scripts/sort_whatsnew_note.py | 7 +- scripts/tests/test_sort_whatsnew_note.py | 4 +- 4 files changed, 73 insertions(+), 64 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 974d866a4b091..b082758ebd5b1 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -14,10 +14,10 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression for subclassed Series when constructing from a dictionary (:issue:`52445`) -- Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) +- Fixed regression in :meth:`DataFrame.pivot` changing :class:`Index` name of input object (:issue:`52629`) - Fixed regression in :meth:`DataFrame.sort_values` not resetting index when :class:`DataFrame` is already sorted and ``ignore_index=True`` (:issue:`52553`) - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`) -- Fixed regression in :meth:`DataFrame.pivot` changing :class:`Index` name of input object (:issue:`52629`) +- Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) .. --------------------------------------------------------------------------- .. 
_whatsnew_201.bug_fixes: @@ -25,25 +25,33 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) -- Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) -- Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) -- Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) -- Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) +- Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) +- Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) -- Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) -- Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`) -- Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) +- Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`) +- Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) +- Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) - Bug in :func:`to_numeric` with ``errors='coerce'`` and ``dtype_backend='pyarrow'`` with :class:`ArrowDtype` data (:issue:`52588`) +- Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) +- Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) +- Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) +- Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) +- Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) +- Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) +- Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) +- Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) +- Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) +- Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) .. 
--------------------------------------------------------------------------- .. _whatsnew_201.other: Other ~~~~~ -- Implemented :meth:`Series.str.split` and :meth:`Series.str.rsplit` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`52401`) - :class:`DataFrame` created from empty dicts had :attr:`~DataFrame.columns` of dtype ``object``. It is now a :class:`RangeIndex` (:issue:`52404`) - :class:`Series` created from empty dicts had :attr:`~Series.index` of dtype ``object``. It is now a :class:`RangeIndex` (:issue:`52404`) +- Implemented :meth:`Series.str.split` and :meth:`Series.str.rsplit` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`52401`) .. --------------------------------------------------------------------------- .. _whatsnew_201.contributors: diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 9b5cba1e1ee05..0772cadf6e737 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -74,20 +74,20 @@ Other enhancements - :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) -- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide ` (:issue:`48347`) +- :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`) +- :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`MultiIndex.sort_values` now supports ``na_position`` (:issue:`51612`) - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) - :meth:`arrays.DatetimeArray.map`, :meth:`arrays.TimedeltaArray.map` and :meth:`arrays.PeriodArray.map` can now take a ``na_action`` argument (:issue:`51644`) -- Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`) -- Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) -- :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`) -- Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`) -- Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`) -- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) -- :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). 
- Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) +- Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`) - Adding ``engine_kwargs`` parameter to :meth:`DataFrame.read_excel` (:issue:`52214`) +- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide ` (:issue:`48347`) +- Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`) +- Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`) +- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`) +- Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) - .. --------------------------------------------------------------------------- @@ -200,38 +200,38 @@ Other API changes Deprecations ~~~~~~~~~~~~ +- Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`) +- Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`) - Deprecated :meth:`.DataFrameGroupBy.apply` and methods on the objects returned by :meth:`.DataFrameGroupBy.resample` operating on the grouping column(s); select the columns to operate on after groupby to either explicitly include or exclude the groupings and avoid the ``FutureWarning`` (:issue:`7155`) -- Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`) -- Deprecated :meth:`DataFrame._data` and :meth:`Series._data`, use public APIs instead (:issue:`33333`) - Deprecated :meth:`.Groupby.all` and :meth:`.GroupBy.any` with datetime64 or :class:`PeriodDtype` values, matching the :class:`Series` and :class:`DataFrame` deprecations (:issue:`34479`) -- Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) -- Deprecated the behavior of :func:`concat` with both ``len(keys) != len(objs)``, in a future version this will raise instead of truncating to the shorter of the two sequences (:issue:`43485`) -- Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby`; this will default to ``True`` in a future version (:issue:`43999`) -- Deprecated explicit support for subclassing :class:`Index` (:issue:`45289`) +- Deprecated :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) +- Deprecated :meth:`DataFrame._data` and :meth:`Series._data`, use public APIs instead (:issue:`33333`) - Deprecated :meth:`DataFrameGroupBy.dtypes`, check ``dtypes`` on the underlying object instead (:issue:`51045`) +- Deprecated ``axis=1`` in :meth:`DataFrame.ewm`, :meth:`DataFrame.rolling`, :meth:`DataFrame.expanding`, transpose before calling the method instead (:issue:`51778`) - Deprecated ``axis=1`` in :meth:`DataFrame.groupby` and in :class:`Grouper` constructor, do 
``frame.T.groupby(...)`` instead (:issue:`51203`) -- Deprecated :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) -- Deprecated passing a :class:`DataFrame` to :meth:`DataFrame.from_records`, use :meth:`DataFrame.set_index` or :meth:`DataFrame.drop` instead (:issue:`51353`) - Deprecated accepting slices in :meth:`DataFrame.take`, call ``obj[slicer]`` or pass a sequence of integers instead (:issue:`51539`) -- Deprecated ``axis=1`` in :meth:`DataFrame.ewm`, :meth:`DataFrame.rolling`, :meth:`DataFrame.expanding`, transpose before calling the method instead (:issue:`51778`) +- Deprecated explicit support for subclassing :class:`Index` (:issue:`45289`) +- Deprecated passing a :class:`DataFrame` to :meth:`DataFrame.from_records`, use :meth:`DataFrame.set_index` or :meth:`DataFrame.drop` instead (:issue:`51353`) +- Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`) - Deprecated the ``axis`` keyword in :meth:`DataFrame.ewm`, :meth:`Series.ewm`, :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, :meth:`Series.expanding` (:issue:`51778`) - Deprecated the ``axis`` keyword in :meth:`DataFrame.resample`, :meth:`Series.resample` (:issue:`51778`) -- Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`) -- Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`) +- Deprecated the behavior of :func:`concat` with both ``len(keys) != len(objs)``, in a future version this will raise instead of truncating to the shorter of the two sequences (:issue:`43485`) +- Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby`; this will default to ``True`` in a future version (:issue:`43999`) +- Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) - Deprecated the 'axis' keyword in :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.fillna`, :meth:`.GroupBy.take`, :meth:`.GroupBy.skew`, :meth:`.GroupBy.rank`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.cummin`, :meth:`.GroupBy.pct_change`, :meth:`GroupBy.diff`, :meth:`.GroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`) -- Deprecated the "fastpath" keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`) +- Deprecated :func:`is_categorical_dtype`, use ``isinstance(obj.dtype, pd.CategoricalDtype)`` instead (:issue:`52527`) +- Deprecated :func:`is_datetime64tz_dtype`, check ``isinstance(dtype, pd.DatetimeTZDtype)`` instead (:issue:`52607`) +- Deprecated :func:`is_int64_dtype`, check ``dtype == np.dtype(np.int64)`` instead (:issue:`52564`) +- Deprecated :func:`is_interval_dtype`, check ``isinstance(dtype, pd.IntervalDtype)`` instead (:issue:`52607`) +- Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) +- Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, 
pass ``dtype`` instead (:issue:`52462`) - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`) -- Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`) - Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`) -- Deprecated the methods :meth:`Series.bool` and :meth:`DataFrame.bool` (:issue:`51749`) -- Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) - Deprecated making :meth:`Series.apply` return a :class:`DataFrame` when the passed-in callable returns a :class:`Series` object. In the future this will return a :class:`Series` whose values are themselves :class:`Series`. This pattern was very slow and it's recommended to use alternative methods to archive the same goal (:issue:`52116`) - Deprecated parameter ``convert_type`` in :meth:`Series.apply` (:issue:`52140`) -- Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) -- Deprecated :func:`is_categorical_dtype`, use ``isinstance(obj.dtype, pd.CategoricalDtype)`` instead (:issue:`52527`) -- Deprecated :func:`is_int64_dtype`, check ``dtype == np.dtype(np.int64)`` instead (:issue:`52564`) -- Deprecated :func:`is_interval_dtype`, check ``isinstance(dtype, pd.IntervalDtype)`` instead (:issue:`52607`) -- Deprecated :func:`is_datetime64tz_dtype`, check ``isinstance(dtype, pd.DatetimeTZDtype)`` instead (:issue:`52607`) +- Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`) +- Deprecated the "fastpath" keyword in :class:`Categorical` constructor, use :meth:`Categorical.from_codes` instead (:issue:`20110`) +- Deprecated the methods :meth:`Series.bool` and :meth:`DataFrame.bool` (:issue:`51749`) - Deprecated unused "closed" and "normalize" keywords in the :class:`DatetimeIndex` constructor (:issue:`52628`) - Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`) - @@ -241,26 +241,26 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`) +- Performance improvement in :func:`read_orc` when reading a remote URI file path. (:issue:`51609`) +- Performance improvement in :func:`read_parquet` and :meth:`DataFrame.to_parquet` when reading a remote file with ``engine="pyarrow"`` (:issue:`51609`) - Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`) - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`) - Performance improvement in :meth:`DataFrame.where` when ``cond`` is backed by an extension dtype (:issue:`51574`) -- Performance improvement in :func:`read_orc` when reading a remote URI file path. 
(:issue:`51609`) -- Performance improvement in :func:`read_parquet` and :meth:`DataFrame.to_parquet` when reading a remote file with ``engine="pyarrow"`` (:issue:`51609`) +- Performance improvement in :meth:`MultiIndex.set_levels` and :meth:`MultiIndex.set_codes` when ``verify_integrity=True`` (:issue:`51873`) - Performance improvement in :meth:`MultiIndex.sortlevel` when ``ascending`` is a list (:issue:`51612`) -- Performance improvement in :meth:`~arrays.ArrowExtensionArray.isna` when array has zero nulls or is all nulls (:issue:`51630`) +- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.fillna` when array does not contain nulls (:issue:`51635`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.isna` when array has zero nulls or is all nulls (:issue:`51630`) - Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`) - Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`) -- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`) -- Performance improvement in :meth:`MultiIndex.set_levels` and :meth:`MultiIndex.set_codes` when ``verify_integrity=True`` (:issue:`51873`) -- Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`) - Performance improvement in :func:`concat` (:issue:`52291`, :issue:`52290`) - Performance improvement in :class:`Series` reductions (:issue:`52341`) -- Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) +- Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) +- Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) -- Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`) - .. --------------------------------------------------------------------------- @@ -276,16 +276,16 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`) - :meth:`DatetimeIndex.map` with ``na_action="ignore"`` now works as expected. 
(:issue:`51644`) +- Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`) - Bug in :meth:`arrays.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) - Timedelta ^^^^^^^^^ -- Bug in :meth:`Timedelta.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsTimedelta`` (:issue:`51494`) -- Bug in :class:`TimedeltaIndex` division or multiplication leading to ``.freq`` of "0 Days" instead of ``None`` (:issue:`51575`) - :meth:`TimedeltaIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`) +- Bug in :class:`TimedeltaIndex` division or multiplication leading to ``.freq`` of "0 Days" instead of ``None`` (:issue:`51575`) +- Bug in :meth:`Timedelta.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsTimedelta`` (:issue:`51494`) - Bug in :meth:`arrays.TimedeltaArray.map` and :meth:`TimedeltaIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) - @@ -296,8 +296,8 @@ Timezones Numeric ^^^^^^^ -- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) - Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`) +- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) - Conversion @@ -333,19 +333,19 @@ MultiIndex I/O ^^^ -- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`) - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) +- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) - Period ^^^^^^ -- Bug in :class:`PeriodDtype` constructor failing to raise ``TypeError`` when no argument is passed or when ``None`` is passed (:issue:`27388`) - :meth:`PeriodIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`) +- Bug in :class:`PeriodDtype` constructor failing to raise ``TypeError`` when no argument is passed or when ``None`` is passed (:issue:`27388`) - Bug in :class:`PeriodDtype` constructor raising ``ValueError`` instead of ``TypeError`` when an invalid type is passed (:issue:`51790`) -- Bug in :meth:`arrays.PeriodArray.map` and :meth:`PeriodIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) - Bug in :func:`read_csv` not processing empty strings as a null value, with ``engine="pyarrow"`` (:issue:`52087`) - Bug in :func:`read_csv` returning ``object`` dtype columns instead of ``float64`` dtype columns with ``engine="pyarrow"`` for columns that are all null with ``engine="pyarrow"`` (:issue:`52087`) +- Bug in :meth:`arrays.PeriodArray.map` and :meth:`PeriodIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) - Bug in incorrectly allowing construction of :class:`Period` or :class:`PeriodDtype` with :class:`CustomBusinessDay` freq; use :class:`BusinessDay` instead (:issue:`52534`) - @@ -356,22 +356,22 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- 
Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` in incorrectly allowing non-fixed ``freq`` when resampling on a :class:`TimedeltaIndex` (:issue:`51896`) - Bug in :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmax` return wrong dtype when used on empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`) - Bug in weighted rolling aggregations when specifying ``min_periods=0`` (:issue:`51449`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` in incorrectly allowing non-fixed ``freq`` when resampling on a :class:`TimedeltaIndex` (:issue:`51896`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby`, where, when the index of the grouped :class:`Series` or :class:`DataFrame` was a :class:`DatetimeIndex`, :class:`TimedeltaIndex` or :class:`PeriodIndex`, and the ``groupby`` method was given a function as its first argument, the function operated on the whole index rather than each element of the index. (:issue:`51979`) +- Bug in :meth:`DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`) - Bug in :meth:`GroupBy.groups` with a datetime key in conjunction with another key produced incorrect number of group keys (:issue:`51158`) - Bug in :meth:`GroupBy.var` failing to raise ``TypeError`` when called with datetime64 or :class:`PeriodDtype` values (:issue:`52128`) -- Bug in :meth:`DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`) - Reshaping ^^^^^^^^^ -- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`) - Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when dist-like argument passed in (:issue:`51099`) +- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`) - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`) - @@ -383,8 +383,8 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ -- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) +- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. 
:class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) - Styler @@ -395,10 +395,10 @@ Styler Other ^^^^^ - Bug in :func:`assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`) -- Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) - Bug in :func:`assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`) -- Bug in :meth:`Series.map` when giving a callable to an empty series, the returned series had ``object`` dtype. It now keeps the original dtype (:issue:`52384`) - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`) +- Bug in :meth:`Series.map` when giving a callable to an empty series, the returned series had ``object`` dtype. It now keeps the original dtype (:issue:`52384`) +- Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) - .. ***DO NOT USE THIS SECTION*** diff --git a/scripts/sort_whatsnew_note.py b/scripts/sort_whatsnew_note.py index e4ab44984b0d8..ae1d3346a5827 100644 --- a/scripts/sort_whatsnew_note.py +++ b/scripts/sort_whatsnew_note.py @@ -30,7 +30,9 @@ import sys from typing import Sequence -pattern = re.compile(r"\(:issue:`(\d+)`\)\n$") +# Check line starts with `-` and ends with e.g. `(:issue:`12345`)`, +# possibly with a trailing full stop. +pattern = re.compile(r"-.*\(:issue:`(\d+)`\)\.?$") def sort_whatsnew_note(content: str) -> int: @@ -41,8 +43,7 @@ def sort_whatsnew_note(content: str) -> int: if line.startswith("- ") and pattern.search(line) is not None: block.append(line) else: - key = lambda x: int(pattern.search(x).group(1)) - block = sorted(block, key=key) + block = sorted(block) new_lines.extend(block) new_lines.append(line) block = [] diff --git a/scripts/tests/test_sort_whatsnew_note.py b/scripts/tests/test_sort_whatsnew_note.py index 6e40f6814c402..95ba74bbe4030 100644 --- a/scripts/tests/test_sort_whatsnew_note.py +++ b/scripts/tests/test_sort_whatsnew_note.py @@ -10,8 +10,8 @@ def test_sort_whatsnew_note(): "\n" "Timedelta\n" "^^^^^^^^^\n" - "- Bug in :class:`TimedeltaIndex` (:issue:`51575`)\n" "- Bug in :meth:`Timedelta.round` (:issue:`51494`)\n" + "- Bug in :class:`TimedeltaIndex` (:issue:`51575`)\n" "\n" ) expected = ( @@ -22,8 +22,8 @@ def test_sort_whatsnew_note(): "\n" "Timedelta\n" "^^^^^^^^^\n" - "- Bug in :meth:`Timedelta.round` (:issue:`51494`)\n" "- Bug in :class:`TimedeltaIndex` (:issue:`51575`)\n" + "- Bug in :meth:`Timedelta.round` (:issue:`51494`)\n" "\n" ) result = sort_whatsnew_note(content) From 0752367acdf12a25d4bfad7769b935a17fbc7899 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Apr 2023 09:44:19 -0700 Subject: [PATCH 13/14] Fix redundant entries --- doc/source/whatsnew/v2.0.1.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index b082758ebd5b1..e70f243a8914f 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -31,17 +31,13 @@ Bug fixes - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and 
``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`) - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`) - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) -- Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) - Bug in :func:`to_numeric` with ``errors='coerce'`` and ``dtype_backend='pyarrow'`` with :class:`ArrowDtype` data (:issue:`52588`) - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) -- Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) -- Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) -- Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) - Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) .. 
--------------------------------------------------------------------------- From 4380bd9251adf9363743ea3a22afda1fb316fca6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Apr 2023 09:49:08 -0700 Subject: [PATCH 14/14] remove redundant entries --- doc/source/whatsnew/v2.0.1.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index e70f243a8914f..c64f7a46d3058 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -27,14 +27,10 @@ Bug fixes - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) -- Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) -- Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`) - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`) - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) - Bug in :func:`to_numeric` with ``errors='coerce'`` and ``dtype_backend='pyarrow'`` with :class:`ArrowDtype` data (:issue:`52588`) - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) -- Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) -- Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`)
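
Taken together, the last three patches lean on a simple property: once the
whatsnew bullets are sorted alphabetically, byte-identical entries end up
adjacent, which makes redundant lines easy to spot and delete. A minimal
sketch of that idea (the entries and variable names are illustrative; only
``pattern`` mirrors the regex from scripts/sort_whatsnew_note.py):

    import re

    # A whatsnew bullet ending in an issue reference, optionally followed
    # by a trailing full stop, as matched by the pre-commit hook.
    pattern = re.compile(r"-.*\(:issue:`(\d+)`\)\.?$")

    notes = [
        "- Bug in :meth:`DataFrame.max` casting resolutions to nanoseconds (:issue:`52524`)",
        "- Bug in :attr:`Series.dt.days` overflowing ``int32`` (:issue:`52391`)",
        "- Bug in :meth:`DataFrame.max` casting resolutions to nanoseconds (:issue:`52524`)",
    ]

    # Alphabetical order (rather than the hook's previous sort by issue
    # number) keeps identical entries side by side, like the duplicates
    # deleted in the two patches above.
    block = sorted(line for line in notes if pattern.search(line))
    assert block[1] == block[2]  # the duplicate pair is now adjacent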