From af49f1716ca6bc8d1b8263bb977653bf076e493f Mon Sep 17 00:00:00 2001
From: Brock
Date: Sat, 5 Mar 2022 17:58:17 -0800
Subject: [PATCH] CLN: Assorted cleanups

---
 pandas/_libs/groupby.pyx                    | 72 +++++++++++++++------
 pandas/_libs/tslibs/conversion.pyx          | 12 +---
 pandas/_libs/tslibs/tzconversion.pyx        | 55 ++++------------
 pandas/_libs/tslibs/vectorized.pyi          |  2 +-
 pandas/_libs/tslibs/vectorized.pyx          | 13 ++--
 pandas/core/arrays/categorical.py           | 13 +---
 pandas/core/dtypes/cast.py                  | 12 ++--
 pandas/core/generic.py                      |  6 +-
 pandas/core/groupby/ops.py                  | 15 +++--
 pandas/core/series.py                       |  3 +-
 pandas/tests/extension/base/setitem.py      |  1 -
 pandas/tests/groupby/test_groupby.py        |  4 +-
 pandas/tests/plotting/test_datetimelike.py  |  4 +-
 pandas/tests/series/methods/test_replace.py |  7 +-
 14 files changed, 103 insertions(+), 116 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index f751d57c186b0..12fe78a0f8a18 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -30,8 +30,8 @@ from numpy.math cimport NAN
 
 cnp.import_array()
 
+from pandas._libs cimport util
 from pandas._libs.algos cimport kth_smallest_c
-from pandas._libs.util cimport get_nat
 
 from pandas._libs.algos import (
     ensure_platform_int,
@@ -49,7 +49,7 @@ from pandas._libs.dtypes cimport (
 
 from pandas._libs.missing cimport checknull
 
-cdef int64_t NPY_NAT = get_nat()
+cdef int64_t NPY_NAT = util.get_nat()
 _int64_max = np.iinfo(np.int64).max
 
 cdef float64_t NaN = np.NaN
@@ -248,13 +248,7 @@ def group_cumsum(
     accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
     compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
 
-    if numeric_t == float32_t or numeric_t == float64_t:
-        na_val = NaN
-    elif numeric_t is int64_t and is_datetimelike:
-        na_val = NPY_NAT
-    else:
-        # Will not be used, but define to avoid unitialized warning.
-        na_val = 0
+    na_val = _get_na_val(<numeric_t>0, is_datetimelike)
 
     with nogil:
         for i in range(N):
@@ -995,6 +989,47 @@ cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil:
     return False
 
 
+cdef numeric_t _get_min_or_max(numeric_t val, bint compute_max):
+    """
+    Find either the min or the max supported by numeric_t; 'val' is a placeholder
+    to effectively make numeric_t an argument.
+    """
+    if numeric_t is int64_t:
+        return -_int64_max if compute_max else util.INT64_MAX
+    elif numeric_t is int32_t:
+        return util.INT32_MIN if compute_max else util.INT32_MAX
+    elif numeric_t is int16_t:
+        return util.INT16_MIN if compute_max else util.INT16_MAX
+    elif numeric_t is int8_t:
+        return util.INT8_MIN if compute_max else util.INT8_MAX
+
+    elif numeric_t is uint64_t:
+        return 0 if compute_max else util.UINT64_MAX
+    elif numeric_t is uint32_t:
+        return 0 if compute_max else util.UINT32_MAX
+    elif numeric_t is uint16_t:
+        return 0 if compute_max else util.UINT16_MAX
+    elif numeric_t is uint8_t:
+        return 0 if compute_max else util.UINT8_MAX
+
+    else:
+        return -np.inf if compute_max else np.inf
+
+
+cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike):
+    cdef:
+        numeric_t na_val
+
+    if numeric_t == float32_t or numeric_t == float64_t:
+        na_val = NaN
+    elif numeric_t is int64_t and is_datetimelike:
+        na_val = NPY_NAT
+    else:
+        # Will not be used, but define to avoid unitialized warning.
+        na_val = 0
+    return na_val
+
+
 # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can
 # use `const iu_64_floating_obj_t[:, :] values`
 @cython.wraparound(False)
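The two helpers added above centralize the per-dtype sentinel selection that group_cumsum, group_min_max, and group_cummin_max previously each duplicated. A rough pure-Python sketch of what _get_min_or_max computes (illustrative only, not the patch's code: the Cython version dispatches on the fused type at compile time, and seeds int64 with -_int64_max rather than the true minimum so the sentinel cannot collide with NPY_NAT):

    import numpy as np

    def get_min_or_max(dtype: np.dtype, compute_max: bool):
        # Seed for a running min/max: the identity element, i.e. a value
        # that any real observation will replace.
        if np.issubdtype(dtype, np.floating):
            return -np.inf if compute_max else np.inf
        info = np.iinfo(dtype)
        return info.min if compute_max else info.max

    assert get_min_or_max(np.dtype("uint8"), compute_max=True) == 0
    assert get_min_or_max(np.dtype("int16"), compute_max=False) == 32767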
@@ -1359,16 +1394,17 @@ cdef group_min_max(
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
 
     group_min_or_max = np.empty_like(out)
+    group_min_or_max[:] = _get_min_or_max(<iu_64_floating_t>0, compute_max)
+
     if iu_64_floating_t is int64_t:
-        group_min_or_max[:] = -_int64_max if compute_max else _int64_max
+        # TODO: only if is_datetimelike?
         nan_val = NPY_NAT
     elif iu_64_floating_t is uint64_t:
         # NB: We do not define nan_val because there is no such thing
         # for uint64_t. We carefully avoid having to reference it in this
         # case.
-        group_min_or_max[:] = 0 if compute_max else np.iinfo(np.uint64).max
+        pass
     else:
-        group_min_or_max[:] = -np.inf if compute_max else np.inf
         nan_val = NAN
 
     N, K = (<object>values).shape
@@ -1527,26 +1563,20 @@ cdef group_cummin_max(
         bint isna_entry
 
     accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype)
-    if iu_64_floating_t is int64_t:
-        accum[:] = -_int64_max if compute_max else _int64_max
-    elif iu_64_floating_t is uint64_t:
-        accum[:] = 0 if compute_max else np.iinfo(np.uint64).max
-    else:
-        accum[:] = -np.inf if compute_max else np.inf
+    accum[:] = _get_min_or_max(<iu_64_floating_t>0, compute_max)
+
+    na_val = _get_na_val(<iu_64_floating_t>0, is_datetimelike)
 
     if uses_mask:
         na_possible = True
         # Will never be used, just to avoid uninitialized warning
         na_val = 0
     elif iu_64_floating_t is float64_t or iu_64_floating_t is float32_t:
-        na_val = NaN
         na_possible = True
     elif is_datetimelike:
-        na_val = NPY_NAT
         na_possible = True
     else:
         # Will never be used, just to avoid uninitialized warning
-        na_val = 0
         na_possible = False
 
     if na_possible:
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index a6dc8cc16b229..e30a91ae3e10a 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -757,7 +757,7 @@ cdef inline bint _infer_tsobject_fold(
     _TSObject obj,
     const int64_t[:] trans,
     const int64_t[:] deltas,
-    int32_t pos,
+    intp_t pos,
 ):
     """
     Infer _TSObject fold property from value by assuming 0 and then setting
@@ -770,7 +770,7 @@ cdef inline bint _infer_tsobject_fold(
         ndarray of offset transition points in nanoseconds since epoch.
     deltas : int64_t[:]
         array of offsets corresponding to transition points in trans.
-    pos : int32_t
+    pos : intp_t
         Position of the last transition point before taking fold into account.
 
     Returns
@@ -828,13 +828,7 @@ cpdef inline datetime localize_pydatetime(datetime dt, tzinfo tz):
         return dt
     elif isinstance(dt, ABCTimestamp):
         return dt.tz_localize(tz)
-    elif is_utc(tz):
-        return _localize_pydatetime(dt, tz)
-    try:
-        # datetime.replace with pytz may be incorrect result
-        return tz.localize(dt)
-    except AttributeError:
-        return dt.replace(tzinfo=tz)
+    return _localize_pydatetime(dt, tz)
 
 
 # ----------------------------------------------------------------------
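The localize_pydatetime cleanup above delegates the pytz special case to the existing _localize_pydatetime helper rather than inlining it. A short sketch of the pitfall the removed "datetime.replace with pytz may be incorrect result" comment refers to (assumes pytz is installed; the zone is only an example):

    from datetime import datetime

    import pytz

    tz = pytz.timezone("US/Eastern")
    dt = datetime(2022, 3, 5, 12, 0)

    # datetime.replace attaches the zone's first historical offset (LMT),
    # while tz.localize resolves the correct offset for this wall time.
    print(dt.replace(tzinfo=tz).utcoffset())  # -1 day, 19:04:00 (LMT, wrong)
    print(tz.localize(dt).utcoffset())        # -1 day, 19:00:00 (EST)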
diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx
index d28b851d0fbc1..4dbfabad5dc84 100644
--- a/pandas/_libs/tslibs/tzconversion.pyx
+++ b/pandas/_libs/tslibs/tzconversion.pyx
@@ -513,12 +513,13 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz):
 
 # OSError may be thrown by tzlocal on windows at or close to 1970-01-01
 # see https://github.com/pandas-dev/pandas/pull/37591#issuecomment-720628241
-cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz,
-                                                   bint to_utc,
-                                                   bint *fold=NULL) except? -1:
+cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True,
+                                     bint* fold=NULL) except? -1:
     """
-    Calculate offset in nanoseconds needed to convert the i8 representation of
-    a datetime from a tzlocal timezone to UTC, or vice-versa.
+    Convert the i8 representation of a datetime from a tzlocal timezone to
+    UTC, or vice-versa.
+
+    Private, not intended for use outside of tslibs.conversion
 
     Parameters
     ----------
@@ -529,10 +530,11 @@ cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz,
     fold : bint*, default NULL
         pointer to fold: whether datetime ends up in a fold or not
         after adjustment
+        Only passed with to_utc=False.
 
     Returns
     -------
-    delta : int64_t
+    result : int64_t
 
     Notes
     -----
@@ -553,45 +555,12 @@ cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz,
         dt = dt.replace(tzinfo=tzutc())
         dt = dt.astimezone(tz)
 
-    if fold is not NULL:
-        fold[0] = dt.fold
+        if fold is not NULL:
+            # NB: fold is only passed with to_utc=False
+            fold[0] = dt.fold
 
     td = tz.utcoffset(dt)
-    return int(td.total_seconds() * 1_000_000_000)
-
-
-# OSError may be thrown by tzlocal on windows at or close to 1970-01-01
-# see https://github.com/pandas-dev/pandas/pull/37591#issuecomment-720628241
-cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True,
-                                     bint* fold=NULL) except? -1:
-    """
-    Convert the i8 representation of a datetime from a tzlocal timezone to
-    UTC, or vice-versa.
-
-    Private, not intended for use outside of tslibs.conversion
-
-    Parameters
-    ----------
-    val : int64_t
-    tz : tzinfo
-    to_utc : bint
-        True if converting tzlocal _to_ UTC, False if going the other direction
-    fold : bint*
-        pointer to fold: whether datetime ends up in a fold or not
-        after adjustment
-
-    Returns
-    -------
-    result : int64_t
-
-    Notes
-    -----
-    Sets fold by pointer
-    """
-    cdef:
-        int64_t delta
-
-    delta = _tzlocal_get_offset_components(val, tz, to_utc, fold)
+    delta = int(td.total_seconds() * 1_000_000_000)
 
     if to_utc:
         return val - delta
diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi
index e9a39a6a75a39..c138050c9c17f 100644
--- a/pandas/_libs/tslibs/vectorized.pyi
+++ b/pandas/_libs/tslibs/vectorized.pyi
@@ -30,7 +30,7 @@ def get_resolution(
 def ints_to_pydatetime(
     arr: npt.NDArray[np.int64],  # const int64_t[:}]
     tz: tzinfo | None = ...,
-    freq: str | BaseOffset | None = ...,
+    freq: BaseOffset | None = ...,
     fold: bool = ...,
     box: str = ...,
 ) -> npt.NDArray[np.object_]: ...
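With the annotation narrowed from str | BaseOffset | None to BaseOffset | None, string aliases must be converted before they reach ints_to_pydatetime; the isinstance(freq, str) fallback removed below is the counterpart of this change. A caller-side sketch using the public conversion helper:

    from pandas.tseries.frequencies import to_offset

    freq = to_offset("D")  # a Day offset instance, not the plain string "D"
    print(type(freq))      # <class 'pandas._libs.tslibs.offsets.Day'>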
diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx
index 63cf7d2ce23ee..17720de33ab33 100644
--- a/pandas/_libs/tslibs/vectorized.pyx
+++ b/pandas/_libs/tslibs/vectorized.pyx
@@ -27,7 +27,7 @@ from .np_datetime cimport (
     dt64_to_dtstruct,
     npy_datetimestruct,
 )
-from .offsets cimport to_offset
+from .offsets cimport BaseOffset
 from .period cimport get_period_ordinal
 from .timestamps cimport create_timestamp_from_ts
 from .timezones cimport (
@@ -87,7 +87,7 @@ cdef inline object create_time_from_ts(
 def ints_to_pydatetime(
     const int64_t[:] stamps,
     tzinfo tz=None,
-    object freq=None,
+    BaseOffset freq=None,
     bint fold=False,
     str box="datetime"
 ) -> np.ndarray:
@@ -99,7 +99,7 @@ def ints_to_pydatetime(
     stamps : array of i8
     tz : str, optional
         convert to this timezone
-    freq : str/Offset, optional
+    freq : BaseOffset, optional
         freq to convert
     fold : bint, default is 0
         Due to daylight saving time, one wall clock time can occur twice
@@ -138,9 +138,6 @@ def ints_to_pydatetime(
         func_create = create_date_from_ts
     elif box == "timestamp":
         func_create = create_timestamp_from_ts
-
-        if isinstance(freq, str):
-            freq = to_offset(freq)
     elif box == "time":
         func_create = create_time_from_ts
     elif box == "datetime":
@@ -311,7 +308,6 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t
         pos = trans.searchsorted(stamps, side="right") - 1
 
     for i in range(n):
-        # TODO: reinstate nogil for use_utc case?
         if stamps[i] == NPY_NAT:
             result[i] = NPY_NAT
             continue
@@ -393,7 +389,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool:
 @cython.boundscheck(False)
 def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz):
     cdef:
-        Py_ssize_t n = len(stamps)
+        Py_ssize_t i, n = len(stamps)
         int64_t[:] result = np.empty(n, dtype=np.int64)
         ndarray[int64_t] trans
         int64_t[:] deltas
@@ -416,7 +412,6 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz):
         pos = trans.searchsorted(stamps, side="right") - 1
 
     for i in range(n):
-        # TODO: reinstate nogil for use_utc case?
         if stamps[i] == NPY_NAT:
             result[i] = NPY_NAT
             continue
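Both loops touched above rely on the same transition-lookup idiom: searchsorted finds, for each stamp, the last timezone transition at or before it, and the matching delta is applied. A toy NumPy illustration of the pattern (values made up for demonstration):

    import numpy as np

    trans = np.array([0, 100, 200])   # transition points, i8 since epoch
    deltas = np.array([10, 20, 30])   # offset in effect from each transition
    stamps = np.array([50, 150, 250])

    # Index of the last transition <= each stamp, then apply its offset.
    pos = trans.searchsorted(stamps, side="right") - 1
    print(stamps + deltas[pos])       # [ 60 170 280]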
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 709ddd049b07b..016a19e474066 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -49,10 +49,7 @@
 from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import validate_bool_kwarg
 
-from pandas.core.dtypes.cast import (
-    coerce_indexer_dtype,
-    maybe_cast_to_extension_array,
-)
+from pandas.core.dtypes.cast import coerce_indexer_dtype
 from pandas.core.dtypes.common import (
     ensure_int64,
     ensure_platform_int,
@@ -64,7 +61,6 @@
     is_hashable,
     is_integer_dtype,
     is_list_like,
-    is_object_dtype,
     is_scalar,
     is_timedelta64_dtype,
     needs_i8_conversion,
@@ -2758,13 +2754,6 @@ def _get_codes_for_values(values, categories: Index) -> np.ndarray:
         codes = _get_codes_for_values(flat, categories)
         return codes.reshape(values.shape)
 
-    if isinstance(categories.dtype, ExtensionDtype) and is_object_dtype(values):
-        # Support inferring the correct extension dtype from an array of
-        # scalar objects. e.g.
-        # Categorical(array[Period, Period], categories=PeriodIndex(...))
-        cls = categories.dtype.construct_array_type()
-        values = maybe_cast_to_extension_array(cls, values)
-
     codes = categories.get_indexer_for(values)
     return coerce_indexer_dtype(codes, categories)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 9f25dd90caefd..929dfa5c12078 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1268,9 +1268,9 @@ def maybe_cast_to_datetime(
 
     if is_timedelta64_dtype(dtype):
         # TODO: _from_sequence would raise ValueError in cases where
-        # ensure_nanosecond_dtype raises TypeError
+        # _ensure_nanosecond_dtype raises TypeError
         dtype = cast(np.dtype, dtype)
-        dtype = ensure_nanosecond_dtype(dtype)
+        dtype = _ensure_nanosecond_dtype(dtype)
         res = TimedeltaArray._from_sequence(value, dtype=dtype)
         return res
 
@@ -1281,7 +1281,7 @@ def maybe_cast_to_datetime(
     vdtype = getattr(value, "dtype", None)
 
     if is_datetime64 or is_datetime64tz:
-        dtype = ensure_nanosecond_dtype(dtype)
+        dtype = _ensure_nanosecond_dtype(dtype)
 
         value = np.array(value, copy=False)
 
@@ -1399,14 +1399,14 @@ def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarra
     return values
 
 
-def ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
+def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
     """
     Convert dtypes with granularity less than nanosecond to nanosecond
 
-    >>> ensure_nanosecond_dtype(np.dtype("M8[s]"))
+    >>> _ensure_nanosecond_dtype(np.dtype("M8[s]"))
     dtype('<M8[ns]')
 
-    >>> ensure_nanosecond_dtype(np.dtype("m8[ps]"))
+    >>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
     Traceback (most recent call last):
         ...
     TypeError: cannot convert timedeltalike to dtype [timedelta64[ps]]
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 3dcc41b0b68c9..3a8081a234a3b 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -114,16 +114,17 @@
 )
 
 from pandas.core import (
+    algorithms as algos,
     arraylike,
+    common as com,
     indexing,
     missing,
     nanops,
+    sample,
 )
-import pandas.core.algorithms as algos
 from pandas.core.array_algos.replace import should_use_regex
 from pandas.core.arrays import ExtensionArray
 from pandas.core.base import PandasObject
-import pandas.core.common as com
 from pandas.core.construction import (
     create_series_with_explicit_dtype,
     extract_array,
@@ -148,7 +149,6 @@
 from pandas.core.missing import find_valid_index
 from pandas.core.ops import align_method_FRAME
 from pandas.core.reshape.concat import concat
-import pandas.core.sample as sample
 from pandas.core.shared_docs import _shared_docs
 from pandas.core.sorting import get_indexer_indexer
 from pandas.core.window import (
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index e4e42e7a1178e..6e6ef14a25941 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -331,9 +331,9 @@ def _ea_wrap_cython_operation(
             **kwargs,
         )
 
-        if self.how in ["rank"]:
-            # i.e. how in WrappedCythonOp.cast_blocklist, since
-            # other cast_blocklist methods dont go through cython_operation
+        if self.how in self.cast_blocklist:
+            # i.e. how in ["rank"], since other cast_blocklist methods dont go
+            # through cython_operation
             return res_values
 
         return self._reconstruct_ea_result(values, res_values)
@@ -571,7 +571,14 @@ def _call_cython_op(
                 **kwargs,
             )
         else:
-            func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
+            func(
+                out=result,
+                values=values,
+                labels=comp_ids,
+                ngroups=ngroups,
+                is_datetimelike=is_datetimelike,
+                **kwargs,
+            )
 
         if self.kind == "aggregate":
             # i.e. counts is defined. Locations where count<min_count
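Spelling out the keyword names in the _call_cython_op dispatch above means a future change to a kernel's parameter order either still binds correctly or fails loudly instead of silently misbinding. A self-contained sketch with a toy stand-in (not the real kernel signature):

    def group_op(out, values, labels, ngroups, is_datetimelike=False, **kwargs):
        return ngroups, is_datetimelike

    # Keyword call: immune to positional reordering; a renamed or removed
    # parameter raises TypeError rather than passing the wrong array through.
    print(group_op(out=[], values=[], labels=[], ngroups=2, is_datetimelike=True))
    # (2, True)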
diff --git a/pandas/core/series.py b/pandas/core/series.py
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ ... @@ def _reset_cacher(self) -> None:
         Reset the cacher.
         """
         if hasattr(self, "_cacher"):
-            # should only get here with self.ndim == 1
             del self._cacher
 
     def _set_as_cached(self, item, cacher) -> None:
diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py
index 0c7fc61e2a7d9..c2db54d832195 100644
--- a/pandas/tests/extension/base/setitem.py
+++ b/pandas/tests/extension/base/setitem.py
@@ -365,7 +365,6 @@ def test_setitem_frame_2d_values(self, data):
         # https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410
         using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager)
 
-        df = pd.DataFrame({"A": data})
         blk_data = df._mgr.arrays[0]
         orig = df.copy()
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 3fa7d3646a0bb..7bf63bb3c2cac 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2469,7 +2469,7 @@ def test_groupby_numerical_stability_cumsum():
 
 
 def test_groupby_cumsum_skipna_false():
-    # don't propagate np.nan above the diagonal
+    # GH#46216 don't propagate np.nan above the diagonal
     arr = np.random.randn(5, 5)
     df = DataFrame(arr)
     for i in range(5):
@@ -2485,7 +2485,7 @@ def test_groupby_cumsum_skipna_false():
 
 
 def test_groupby_cumsum_timedelta64():
-    # don't ignore is_datetimelike in libgroupby.group_cumsum
+    # GH#46216 don't ignore is_datetimelike in libgroupby.group_cumsum
     dti = date_range("2016-01-01", periods=5)
     ser = Series(dti) - dti[0]
     ser[2] = pd.NaT
diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py
index 9b08cd0637751..cb428daac84ba 100644
--- a/pandas/tests/plotting/test_datetimelike.py
+++ b/pandas/tests/plotting/test_datetimelike.py
@@ -558,7 +558,7 @@ def test_gaps(self):
         # non-ts
         idx = [0, 1, 2, 5, 7, 9, 12, 15, 20]
         ser = Series(np.random.randn(len(idx)), idx)
-        ser[2:5] = np.nan
+        ser.iloc[2:5] = np.nan
         _, ax = self.plt.subplots()
         ser.plot(ax=ax)
         lines = ax.get_lines()
@@ -573,7 +573,7 @@ def test_gaps(self):
 
     def test_gap_upsample(self):
         low = tm.makeTimeSeries()
-        low[5:25] = np.nan
+        low.iloc[5:25] = np.nan
         _, ax = self.plt.subplots()
         low.plot(ax=ax)
 
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
index 7db93eb27d82b..7177648df27ce 100644
--- a/pandas/tests/series/methods/test_replace.py
+++ b/pandas/tests/series/methods/test_replace.py
@@ -659,7 +659,12 @@ def test_replace_nullable_numeric(self):
         assert ints.replace(1, 9).dtype == ints.dtype
         assert ints.replace({1: 9.0}).dtype == ints.dtype
         assert ints.replace(1, 9.0).dtype == ints.dtype
-        # FIXME: ints.replace({1: 9.5}) raises bc of incorrect _can_hold_element
+
+        # nullable (for now) raises instead of casting
+        with pytest.raises(TypeError, match="Invalid value"):
+            ints.replace({1: 9.5})
+        with pytest.raises(TypeError, match="Invalid value"):
+            ints.replace(1, 9.5)
 
     @pytest.mark.parametrize("regex", [False, True])
     def test_replace_regex_dtype_series(self, regex):
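For reference, the behavior pinned down by the new test_replace_nullable_numeric assertions, runnable against a pandas build containing this patch (version-dependent: the "for now" note above signals this may later cast instead of raising):

    import pandas as pd

    ints = pd.Series([1, 2, 3], dtype="Int64")
    print(ints.replace(1, 9).dtype)  # Int64 -- 9 fits, the dtype is preserved

    try:
        ints.replace(1, 9.5)         # 9.5 cannot be held losslessly
    except TypeError as err:
        print(err)                   # message matches "Invalid value"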