Skip to content

CLN: Assorted cleanups #46243

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 51 additions & 21 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ from numpy.math cimport NAN

cnp.import_array()

from pandas._libs cimport util
from pandas._libs.algos cimport kth_smallest_c
from pandas._libs.util cimport get_nat

from pandas._libs.algos import (
ensure_platform_int,
Expand All @@ -49,7 +49,7 @@ from pandas._libs.dtypes cimport (
from pandas._libs.missing cimport checknull


cdef int64_t NPY_NAT = get_nat()
cdef int64_t NPY_NAT = util.get_nat()
_int64_max = np.iinfo(np.int64).max

cdef float64_t NaN = <float64_t>np.NaN
Expand Down Expand Up @@ -248,13 +248,7 @@ def group_cumsum(
accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)

if numeric_t == float32_t or numeric_t == float64_t:
na_val = NaN
elif numeric_t is int64_t and is_datetimelike:
na_val = NPY_NAT
else:
# Will not be used, but define to avoid uninitialized warning.
na_val = 0
na_val = _get_na_val(<numeric_t>0, is_datetimelike)

with nogil:
for i in range(N):
Expand Down Expand Up @@ -995,6 +989,47 @@ cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil:
return False


cdef numeric_t _get_min_or_max(numeric_t val, bint compute_max):
    """
    Find either the min or the max supported by numeric_t; 'val' is a placeholder
    to effectively make numeric_t an argument.

    Returns the identity element for the requested reduction: the smallest
    representable value when computing a max, the largest when computing a min.
    """
    # NB: these branches are resolved at Cython compile time, once per
    #  fused-type specialization; only the matching branch is emitted.
    if numeric_t is int64_t:
        # -_int64_max (i.e. INT64_MIN + 1) rather than INT64_MIN — presumably to
        #  keep the sentinel distinct from NPY_NAT; TODO(review): confirm intent.
        return -_int64_max if compute_max else util.INT64_MAX
    elif numeric_t is int32_t:
        return util.INT32_MIN if compute_max else util.INT32_MAX
    elif numeric_t is int16_t:
        return util.INT16_MIN if compute_max else util.INT16_MAX
    elif numeric_t is int8_t:
        return util.INT8_MIN if compute_max else util.INT8_MAX

    elif numeric_t is uint64_t:
        return 0 if compute_max else util.UINT64_MAX
    elif numeric_t is uint32_t:
        return 0 if compute_max else util.UINT32_MAX
    elif numeric_t is uint16_t:
        return 0 if compute_max else util.UINT16_MAX
    elif numeric_t is uint8_t:
        return 0 if compute_max else util.UINT8_MAX

    else:
        # float32_t / float64_t: +/- infinity are the reduction identities.
        return -np.inf if compute_max else np.inf


cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike):
    """
    Find the NA value for this numeric_t specialization; 'val' is a placeholder
    to effectively make numeric_t an argument.

    Returns NaN for float dtypes, NPY_NAT for datetime-like int64, and a
    dummy 0 otherwise (callers are expected not to use it in that case).
    """
    cdef:
        numeric_t na_val

    if numeric_t == float32_t or numeric_t == float64_t:
        na_val = NaN
    elif numeric_t is int64_t and is_datetimelike:
        # NPY_NAT is the i8 sentinel used for NaT in datetime-like data.
        na_val = NPY_NAT
    else:
        # Will not be used, but define to avoid uninitialized warning.
        na_val = 0
    return na_val


# TODO(cython3): GH#31710 use memoryviews once cython 0.30 is released so we can
# use `const iu_64_floating_obj_t[:, :] values`
@cython.wraparound(False)
Expand Down Expand Up @@ -1359,16 +1394,17 @@ cdef group_min_max(
nobs = np.zeros((<object>out).shape, dtype=np.int64)

group_min_or_max = np.empty_like(out)
group_min_or_max[:] = _get_min_or_max(<iu_64_floating_t>0, compute_max)

if iu_64_floating_t is int64_t:
group_min_or_max[:] = -_int64_max if compute_max else _int64_max
# TODO: only if is_datetimelike?
nan_val = NPY_NAT
elif iu_64_floating_t is uint64_t:
# NB: We do not define nan_val because there is no such thing
# for uint64_t. We carefully avoid having to reference it in this
# case.
group_min_or_max[:] = 0 if compute_max else np.iinfo(np.uint64).max
pass
else:
group_min_or_max[:] = -np.inf if compute_max else np.inf
nan_val = NAN

N, K = (<object>values).shape
Expand Down Expand Up @@ -1527,26 +1563,20 @@ cdef group_cummin_max(
bint isna_entry

accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype)
if iu_64_floating_t is int64_t:
accum[:] = -_int64_max if compute_max else _int64_max
elif iu_64_floating_t is uint64_t:
accum[:] = 0 if compute_max else np.iinfo(np.uint64).max
else:
accum[:] = -np.inf if compute_max else np.inf
accum[:] = _get_min_or_max(<iu_64_floating_t>0, compute_max)

na_val = _get_na_val(<iu_64_floating_t>0, is_datetimelike)

if uses_mask:
na_possible = True
# Will never be used, just to avoid uninitialized warning
na_val = 0
elif iu_64_floating_t is float64_t or iu_64_floating_t is float32_t:
na_val = NaN
na_possible = True
elif is_datetimelike:
na_val = NPY_NAT
na_possible = True
else:
# Will never be used, just to avoid uninitialized warning
na_val = 0
na_possible = False

if na_possible:
Expand Down
12 changes: 3 additions & 9 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,7 @@ cdef inline bint _infer_tsobject_fold(
_TSObject obj,
const int64_t[:] trans,
const int64_t[:] deltas,
int32_t pos,
intp_t pos,
):
"""
Infer _TSObject fold property from value by assuming 0 and then setting
Expand All @@ -770,7 +770,7 @@ cdef inline bint _infer_tsobject_fold(
ndarray of offset transition points in nanoseconds since epoch.
deltas : int64_t[:]
array of offsets corresponding to transition points in trans.
pos : int32_t
pos : intp_t
Position of the last transition point before taking fold into account.

Returns
Expand Down Expand Up @@ -828,13 +828,7 @@ cpdef inline datetime localize_pydatetime(datetime dt, tzinfo tz):
return dt
elif isinstance(dt, ABCTimestamp):
return dt.tz_localize(tz)
elif is_utc(tz):
return _localize_pydatetime(dt, tz)
try:
# datetime.replace with pytz may be incorrect result
return tz.localize(dt)
except AttributeError:
return dt.replace(tzinfo=tz)
return _localize_pydatetime(dt, tz)


# ----------------------------------------------------------------------
Expand Down
55 changes: 12 additions & 43 deletions pandas/_libs/tslibs/tzconversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -513,12 +513,13 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz):

# OSError may be thrown by tzlocal on windows at or close to 1970-01-01
# see https://github.com/pandas-dev/pandas/pull/37591#issuecomment-720628241
cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz,
bint to_utc,
bint *fold=NULL) except? -1:
cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True,
bint* fold=NULL) except? -1:
"""
Calculate offset in nanoseconds needed to convert the i8 representation of
a datetime from a tzlocal timezone to UTC, or vice-versa.
Convert the i8 representation of a datetime from a tzlocal timezone to
UTC, or vice-versa.

Private, not intended for use outside of tslibs.conversion

Parameters
----------
Expand All @@ -529,10 +530,11 @@ cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz,
fold : bint*, default NULL
pointer to fold: whether datetime ends up in a fold or not
after adjustment
Only passed with to_utc=False.

Returns
-------
delta : int64_t
result : int64_t

Notes
-----
Expand All @@ -553,45 +555,12 @@ cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz,
dt = dt.replace(tzinfo=tzutc())
dt = dt.astimezone(tz)

if fold is not NULL:
fold[0] = dt.fold
if fold is not NULL:
# NB: fold is only passed with to_utc=False
fold[0] = dt.fold

td = tz.utcoffset(dt)
return int(td.total_seconds() * 1_000_000_000)


# OSError may be thrown by tzlocal on windows at or close to 1970-01-01
# see https://github.com/pandas-dev/pandas/pull/37591#issuecomment-720628241
cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True,
bint* fold=NULL) except? -1:
"""
Convert the i8 representation of a datetime from a tzlocal timezone to
UTC, or vice-versa.

Private, not intended for use outside of tslibs.conversion

Parameters
----------
val : int64_t
tz : tzinfo
to_utc : bint
True if converting tzlocal _to_ UTC, False if going the other direction
fold : bint*
pointer to fold: whether datetime ends up in a fold or not
after adjustment

Returns
-------
result : int64_t

Notes
-----
Sets fold by pointer
"""
cdef:
int64_t delta

delta = _tzlocal_get_offset_components(val, tz, to_utc, fold)
delta = int(td.total_seconds() * 1_000_000_000)

if to_utc:
return val - delta
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/vectorized.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def get_resolution(
def ints_to_pydatetime(
arr: npt.NDArray[np.int64],  # const int64_t[:]
tz: tzinfo | None = ...,
freq: str | BaseOffset | None = ...,
freq: BaseOffset | None = ...,
fold: bool = ...,
box: str = ...,
) -> npt.NDArray[np.object_]: ...
13 changes: 4 additions & 9 deletions pandas/_libs/tslibs/vectorized.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ from .np_datetime cimport (
dt64_to_dtstruct,
npy_datetimestruct,
)
from .offsets cimport to_offset
from .offsets cimport BaseOffset
from .period cimport get_period_ordinal
from .timestamps cimport create_timestamp_from_ts
from .timezones cimport (
Expand Down Expand Up @@ -87,7 +87,7 @@ cdef inline object create_time_from_ts(
def ints_to_pydatetime(
const int64_t[:] stamps,
tzinfo tz=None,
object freq=None,
BaseOffset freq=None,
bint fold=False,
str box="datetime"
) -> np.ndarray:
Expand All @@ -99,7 +99,7 @@ def ints_to_pydatetime(
stamps : array of i8
tz : str, optional
convert to this timezone
freq : str/Offset, optional
freq : BaseOffset, optional
freq to convert
fold : bint, default is 0
Due to daylight saving time, one wall clock time can occur twice
Expand Down Expand Up @@ -138,9 +138,6 @@ def ints_to_pydatetime(
func_create = create_date_from_ts
elif box == "timestamp":
func_create = create_timestamp_from_ts

if isinstance(freq, str):
freq = to_offset(freq)
elif box == "time":
func_create = create_time_from_ts
elif box == "datetime":
Expand Down Expand Up @@ -311,7 +308,6 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t
pos = trans.searchsorted(stamps, side="right") - 1

for i in range(n):
# TODO: reinstate nogil for use_utc case?
if stamps[i] == NPY_NAT:
result[i] = NPY_NAT
continue
Expand Down Expand Up @@ -393,7 +389,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool:
@cython.boundscheck(False)
def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz):
cdef:
Py_ssize_t n = len(stamps)
Py_ssize_t i, n = len(stamps)
int64_t[:] result = np.empty(n, dtype=np.int64)
ndarray[int64_t] trans
int64_t[:] deltas
Expand All @@ -416,7 +412,6 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz):
pos = trans.searchsorted(stamps, side="right") - 1

for i in range(n):
# TODO: reinstate nogil for use_utc case?
if stamps[i] == NPY_NAT:
result[i] = NPY_NAT
continue
Expand Down
13 changes: 1 addition & 12 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,7 @@
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
coerce_indexer_dtype,
maybe_cast_to_extension_array,
)
from pandas.core.dtypes.cast import coerce_indexer_dtype
from pandas.core.dtypes.common import (
ensure_int64,
ensure_platform_int,
Expand All @@ -64,7 +61,6 @@
is_hashable,
is_integer_dtype,
is_list_like,
is_object_dtype,
is_scalar,
is_timedelta64_dtype,
needs_i8_conversion,
Expand Down Expand Up @@ -2758,13 +2754,6 @@ def _get_codes_for_values(values, categories: Index) -> np.ndarray:
codes = _get_codes_for_values(flat, categories)
return codes.reshape(values.shape)

if isinstance(categories.dtype, ExtensionDtype) and is_object_dtype(values):
# Support inferring the correct extension dtype from an array of
# scalar objects. e.g.
# Categorical(array[Period, Period], categories=PeriodIndex(...))
cls = categories.dtype.construct_array_type()
values = maybe_cast_to_extension_array(cls, values)

codes = categories.get_indexer_for(values)
return coerce_indexer_dtype(codes, categories)

Expand Down
12 changes: 6 additions & 6 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1268,9 +1268,9 @@ def maybe_cast_to_datetime(

if is_timedelta64_dtype(dtype):
# TODO: _from_sequence would raise ValueError in cases where
# ensure_nanosecond_dtype raises TypeError
# _ensure_nanosecond_dtype raises TypeError
dtype = cast(np.dtype, dtype)
dtype = ensure_nanosecond_dtype(dtype)
dtype = _ensure_nanosecond_dtype(dtype)
res = TimedeltaArray._from_sequence(value, dtype=dtype)
return res

Expand All @@ -1281,7 +1281,7 @@ def maybe_cast_to_datetime(
vdtype = getattr(value, "dtype", None)

if is_datetime64 or is_datetime64tz:
dtype = ensure_nanosecond_dtype(dtype)
dtype = _ensure_nanosecond_dtype(dtype)

value = np.array(value, copy=False)

Expand Down Expand Up @@ -1399,14 +1399,14 @@ def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarra
return values


def ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
"""
Convert dtypes with granularity less than nanosecond to nanosecond

>>> ensure_nanosecond_dtype(np.dtype("M8[s]"))
>>> _ensure_nanosecond_dtype(np.dtype("M8[s]"))
dtype('<M8[ns]')

>>> ensure_nanosecond_dtype(np.dtype("m8[ps]"))
>>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
Traceback (most recent call last):
...
TypeError: cannot convert timedeltalike to dtype [timedelta64[ps]]
Expand Down
Loading