Skip to content

CLN: Assorted cleanups #46243

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 51 additions & 21 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ from numpy.math cimport NAN

cnp.import_array()

from pandas._libs cimport util
from pandas._libs.algos cimport kth_smallest_c
from pandas._libs.util cimport get_nat

from pandas._libs.algos import (
ensure_platform_int,
Expand All @@ -49,7 +49,7 @@ from pandas._libs.dtypes cimport (
from pandas._libs.missing cimport checknull


cdef int64_t NPY_NAT = get_nat()
cdef int64_t NPY_NAT = util.get_nat()
_int64_max = np.iinfo(np.int64).max

cdef float64_t NaN = <float64_t>np.NaN
Expand Down Expand Up @@ -248,13 +248,7 @@ def group_cumsum(
accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)

if numeric_t == float32_t or numeric_t == float64_t:
na_val = NaN
elif numeric_t is int64_t and is_datetimelike:
na_val = NPY_NAT
else:
# Will not be used, but define to avoid uninitialized warning.
na_val = 0
na_val = _get_na_val(<numeric_t>0, is_datetimelike)

with nogil:
for i in range(N):
Expand Down Expand Up @@ -995,6 +989,47 @@ cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil:
return False


cdef numeric_t _get_min_or_max(numeric_t val, bint compute_max):
    """
    Find either the min or the max supported by numeric_t; 'val' is a placeholder
    to effectively make numeric_t an argument.

    Returns the identity element for the requested reduction: the smallest
    representable value when computing a max, the largest when computing a min.
    """
    # NB: these branches are resolved at Cython compile time, once per
    #  fused-type specialization; only the matching branch is emitted.
    if numeric_t is int64_t:
        # -_int64_max (i.e. INT64_MIN + 1) rather than INT64_MIN — presumably to
        #  keep the sentinel distinct from NPY_NAT; TODO(review): confirm intent.
        return -_int64_max if compute_max else util.INT64_MAX
    elif numeric_t is int32_t:
        return util.INT32_MIN if compute_max else util.INT32_MAX
    elif numeric_t is int16_t:
        return util.INT16_MIN if compute_max else util.INT16_MAX
    elif numeric_t is int8_t:
        return util.INT8_MIN if compute_max else util.INT8_MAX

    elif numeric_t is uint64_t:
        return 0 if compute_max else util.UINT64_MAX
    elif numeric_t is uint32_t:
        return 0 if compute_max else util.UINT32_MAX
    elif numeric_t is uint16_t:
        return 0 if compute_max else util.UINT16_MAX
    elif numeric_t is uint8_t:
        return 0 if compute_max else util.UINT8_MAX

    else:
        # float32_t / float64_t: +/- infinity are the reduction identities.
        return -np.inf if compute_max else np.inf


cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike):
    """
    Find the NA value for this numeric_t specialization; 'val' is a placeholder
    to effectively make numeric_t an argument.

    Returns NaN for float dtypes, NPY_NAT for datetime-like int64, and a
    dummy 0 otherwise (callers are expected not to use it in that case).
    """
    cdef:
        numeric_t na_val

    if numeric_t == float32_t or numeric_t == float64_t:
        na_val = NaN
    elif numeric_t is int64_t and is_datetimelike:
        # NPY_NAT is the i8 sentinel used for NaT in datetime-like data.
        na_val = NPY_NAT
    else:
        # Will not be used, but define to avoid uninitialized warning.
        na_val = 0
    return na_val


# TODO(cython3): GH#31710 use memoryviews once cython 0.30 is released so we can
# use `const iu_64_floating_obj_t[:, :] values`
@cython.wraparound(False)
Expand Down Expand Up @@ -1359,16 +1394,17 @@ cdef group_min_max(
nobs = np.zeros((<object>out).shape, dtype=np.int64)

group_min_or_max = np.empty_like(out)
group_min_or_max[:] = _get_min_or_max(<iu_64_floating_t>0, compute_max)

if iu_64_floating_t is int64_t:
group_min_or_max[:] = -_int64_max if compute_max else _int64_max
# TODO: only if is_datetimelike?
nan_val = NPY_NAT
elif iu_64_floating_t is uint64_t:
# NB: We do not define nan_val because there is no such thing
# for uint64_t. We carefully avoid having to reference it in this
# case.
group_min_or_max[:] = 0 if compute_max else np.iinfo(np.uint64).max
pass
else:
group_min_or_max[:] = -np.inf if compute_max else np.inf
nan_val = NAN

N, K = (<object>values).shape
Expand Down Expand Up @@ -1527,26 +1563,20 @@ cdef group_cummin_max(
bint isna_entry

accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype)
if iu_64_floating_t is int64_t:
accum[:] = -_int64_max if compute_max else _int64_max
elif iu_64_floating_t is uint64_t:
accum[:] = 0 if compute_max else np.iinfo(np.uint64).max
else:
accum[:] = -np.inf if compute_max else np.inf
accum[:] = _get_min_or_max(<iu_64_floating_t>0, compute_max)

na_val = _get_na_val(<iu_64_floating_t>0, is_datetimelike)

if uses_mask:
na_possible = True
# Will never be used, just to avoid uninitialized warning
na_val = 0
elif iu_64_floating_t is float64_t or iu_64_floating_t is float32_t:
na_val = NaN
na_possible = True
elif is_datetimelike:
na_val = NPY_NAT
na_possible = True
else:
# Will never be used, just to avoid uninitialized warning
na_val = 0
na_possible = False

if na_possible:
Expand Down
12 changes: 3 additions & 9 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,7 @@ cdef inline bint _infer_tsobject_fold(
_TSObject obj,
const int64_t[:] trans,
const int64_t[:] deltas,
int32_t pos,
intp_t pos,
):
"""
Infer _TSObject fold property from value by assuming 0 and then setting
Expand All @@ -770,7 +770,7 @@ cdef inline bint _infer_tsobject_fold(
ndarray of offset transition points in nanoseconds since epoch.
deltas : int64_t[:]
array of offsets corresponding to transition points in trans.
pos : int32_t
pos : intp_t
Position of the last transition point before taking fold into account.

Returns
Expand Down Expand Up @@ -828,13 +828,7 @@ cpdef inline datetime localize_pydatetime(datetime dt, tzinfo tz):
return dt
elif isinstance(dt, ABCTimestamp):
return dt.tz_localize(tz)
elif is_utc(tz):
return _localize_pydatetime(dt, tz)
try:
# datetime.replace with pytz may be incorrect result
return tz.localize(dt)
except AttributeError:
return dt.replace(tzinfo=tz)
return _localize_pydatetime(dt, tz)


# ----------------------------------------------------------------------
Expand Down
55 changes: 12 additions & 43 deletions pandas/_libs/tslibs/tzconversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -513,12 +513,13 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz):

# OSError may be thrown by tzlocal on windows at or close to 1970-01-01
# see https://github.com/pandas-dev/pandas/pull/37591#issuecomment-720628241
cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz,
bint to_utc,
bint *fold=NULL) except? -1:
cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True,
bint* fold=NULL) except? -1:
"""
Calculate offset in nanoseconds needed to convert the i8 representation of
a datetime from a tzlocal timezone to UTC, or vice-versa.
Convert the i8 representation of a datetime from a tzlocal timezone to
UTC, or vice-versa.

Private, not intended for use outside of tslibs.conversion

Parameters
----------
Expand All @@ -529,10 +530,11 @@ cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz,
fold : bint*, default NULL
pointer to fold: whether datetime ends up in a fold or not
after adjustment
Only passed with to_utc=False.

Returns
-------
delta : int64_t
result : int64_t

Notes
-----
Expand All @@ -553,45 +555,12 @@ cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz,
dt = dt.replace(tzinfo=tzutc())
dt = dt.astimezone(tz)

if fold is not NULL:
fold[0] = dt.fold
if fold is not NULL:
# NB: fold is only passed with to_utc=False
fold[0] = dt.fold

td = tz.utcoffset(dt)
return int(td.total_seconds() * 1_000_000_000)


# OSError may be thrown by tzlocal on windows at or close to 1970-01-01
# see https://github.com/pandas-dev/pandas/pull/37591#issuecomment-720628241
cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True,
bint* fold=NULL) except? -1:
"""
Convert the i8 representation of a datetime from a tzlocal timezone to
UTC, or vice-versa.

Private, not intended for use outside of tslibs.conversion

Parameters
----------
val : int64_t
tz : tzinfo
to_utc : bint
True if converting tzlocal _to_ UTC, False if going the other direction
fold : bint*
pointer to fold: whether datetime ends up in a fold or not
after adjustment

Returns
-------
result : int64_t

Notes
-----
Sets fold by pointer
"""
cdef:
int64_t delta

delta = _tzlocal_get_offset_components(val, tz, to_utc, fold)
delta = int(td.total_seconds() * 1_000_000_000)

if to_utc:
return val - delta
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/vectorized.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def get_resolution(
def ints_to_pydatetime(
arr: npt.NDArray[np.int64],  # const int64_t[:]
tz: tzinfo | None = ...,
freq: str | BaseOffset | None = ...,
freq: BaseOffset | None = ...,
fold: bool = ...,
box: str = ...,
) -> npt.NDArray[np.object_]: ...
13 changes: 4 additions & 9 deletions pandas/_libs/tslibs/vectorized.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ from .np_datetime cimport (
dt64_to_dtstruct,
npy_datetimestruct,
)
from .offsets cimport to_offset
from .offsets cimport BaseOffset
from .period cimport get_period_ordinal
from .timestamps cimport create_timestamp_from_ts
from .timezones cimport (
Expand Down Expand Up @@ -87,7 +87,7 @@ cdef inline object create_time_from_ts(
def ints_to_pydatetime(
const int64_t[:] stamps,
tzinfo tz=None,
object freq=None,
BaseOffset freq=None,
bint fold=False,
str box="datetime"
) -> np.ndarray:
Expand All @@ -99,7 +99,7 @@ def ints_to_pydatetime(
stamps : array of i8
tz : str, optional
convert to this timezone
freq : str/Offset, optional
freq : BaseOffset, optional
freq to convert
fold : bint, default is 0
Due to daylight saving time, one wall clock time can occur twice
Expand Down Expand Up @@ -138,9 +138,6 @@ def ints_to_pydatetime(
func_create = create_date_from_ts
elif box == "timestamp":
func_create = create_timestamp_from_ts

if isinstance(freq, str):
freq = to_offset(freq)
elif box == "time":
func_create = create_time_from_ts
elif box == "datetime":
Expand Down Expand Up @@ -311,7 +308,6 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t
pos = trans.searchsorted(stamps, side="right") - 1

for i in range(n):
# TODO: reinstate nogil for use_utc case?
if stamps[i] == NPY_NAT:
result[i] = NPY_NAT
continue
Expand Down Expand Up @@ -393,7 +389,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool:
@cython.boundscheck(False)
def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz):
cdef:
Py_ssize_t n = len(stamps)
Py_ssize_t i, n = len(stamps)
int64_t[:] result = np.empty(n, dtype=np.int64)
ndarray[int64_t] trans
int64_t[:] deltas
Expand All @@ -416,7 +412,6 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz):
pos = trans.searchsorted(stamps, side="right") - 1

for i in range(n):
# TODO: reinstate nogil for use_utc case?
if stamps[i] == NPY_NAT:
result[i] = NPY_NAT
continue
Expand Down
13 changes: 1 addition & 12 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,7 @@
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
coerce_indexer_dtype,
maybe_cast_to_extension_array,
)
from pandas.core.dtypes.cast import coerce_indexer_dtype
from pandas.core.dtypes.common import (
ensure_int64,
ensure_platform_int,
Expand All @@ -64,7 +61,6 @@
is_hashable,
is_integer_dtype,
is_list_like,
is_object_dtype,
is_scalar,
is_timedelta64_dtype,
needs_i8_conversion,
Expand Down Expand Up @@ -2758,13 +2754,6 @@ def _get_codes_for_values(values, categories: Index) -> np.ndarray:
codes = _get_codes_for_values(flat, categories)
return codes.reshape(values.shape)

if isinstance(categories.dtype, ExtensionDtype) and is_object_dtype(values):
# Support inferring the correct extension dtype from an array of
# scalar objects. e.g.
# Categorical(array[Period, Period], categories=PeriodIndex(...))
cls = categories.dtype.construct_array_type()
values = maybe_cast_to_extension_array(cls, values)

codes = categories.get_indexer_for(values)
return coerce_indexer_dtype(codes, categories)

Expand Down
12 changes: 6 additions & 6 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1268,9 +1268,9 @@ def maybe_cast_to_datetime(

if is_timedelta64_dtype(dtype):
# TODO: _from_sequence would raise ValueError in cases where
# ensure_nanosecond_dtype raises TypeError
# _ensure_nanosecond_dtype raises TypeError
dtype = cast(np.dtype, dtype)
dtype = ensure_nanosecond_dtype(dtype)
dtype = _ensure_nanosecond_dtype(dtype)
res = TimedeltaArray._from_sequence(value, dtype=dtype)
return res

Expand All @@ -1281,7 +1281,7 @@ def maybe_cast_to_datetime(
vdtype = getattr(value, "dtype", None)

if is_datetime64 or is_datetime64tz:
dtype = ensure_nanosecond_dtype(dtype)
dtype = _ensure_nanosecond_dtype(dtype)

value = np.array(value, copy=False)

Expand Down Expand Up @@ -1399,14 +1399,14 @@ def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarra
return values


def ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
"""
Convert dtypes with granularity less than nanosecond to nanosecond

>>> ensure_nanosecond_dtype(np.dtype("M8[s]"))
>>> _ensure_nanosecond_dtype(np.dtype("M8[s]"))
dtype('<M8[ns]')

>>> ensure_nanosecond_dtype(np.dtype("m8[ps]"))
>>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
Traceback (most recent call last):
...
TypeError: cannot convert timedeltalike to dtype [timedelta64[ps]]
Expand Down
Loading