Skip to content

CLN: some code cleanups in pandas/_libs/ #31808

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Feb 22, 2020
31 changes: 24 additions & 7 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,30 @@ from libc.math cimport fabs, sqrt

import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray,
NPY_INT64, NPY_INT32, NPY_INT16, NPY_INT8,
NPY_UINT64, NPY_UINT32, NPY_UINT16, NPY_UINT8,
NPY_FLOAT32, NPY_FLOAT64,
NPY_OBJECT,
int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
uint32_t, uint64_t, float32_t, float64_t)
from numpy cimport (
NPY_FLOAT32,
NPY_FLOAT64,
NPY_INT8,
NPY_INT16,
NPY_INT32,
NPY_INT64,
NPY_OBJECT,
NPY_UINT8,
NPY_UINT16,
NPY_UINT32,
NPY_UINT64,
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
ndarray,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)
cnp.import_array()


Expand Down
20 changes: 16 additions & 4 deletions pandas/_libs/join.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,25 @@ from cython import Py_ssize_t

import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray,
int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
uint32_t, uint64_t, float32_t, float64_t)
from numpy cimport (
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
ndarray,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)
cnp.import_array()

from pandas._libs.algos import (
groupsort_indexer, ensure_platform_int, take_1d_int64_int64
ensure_platform_int,
groupsort_indexer,
take_1d_int64_int64,
)


Expand Down
43 changes: 32 additions & 11 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,33 @@ from cpython.iterator cimport PyIter_Check
from cpython.sequence cimport PySequence_Check
from cpython.number cimport PyNumber_Check

from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
PyTime_Check, PyDelta_Check,
PyDateTime_IMPORT)
from cpython.datetime cimport (
PyDateTime_Check,
PyDate_Check,
PyTime_Check,
PyDelta_Check,
PyDateTime_IMPORT,
)
PyDateTime_IMPORT

import numpy as np
cimport numpy as cnp
from numpy cimport (ndarray, PyArray_Check, PyArray_GETITEM,
PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew,
flatiter, NPY_OBJECT,
int64_t, float32_t, float64_t,
uint8_t, uint64_t, complex128_t)
from numpy cimport (
NPY_OBJECT,
PyArray_Check,
PyArray_GETITEM,
PyArray_ITER_DATA,
PyArray_ITER_NEXT,
PyArray_IterNew,
complex128_t,
flatiter,
float32_t,
float64_t,
int64_t,
ndarray,
uint8_t,
uint64_t,
)
cnp.import_array()

cdef extern from "numpy/arrayobject.h":
Expand Down Expand Up @@ -60,7 +75,12 @@ from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare

from pandas._libs.missing cimport (
checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period, C_NA
checknull,
isnaobj,
is_null_datetime64,
is_null_timedelta64,
is_null_period,
C_NA,
)


Expand Down Expand Up @@ -246,7 +266,7 @@ def item_from_zerodim(val: object) -> object:

@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple(list arrays, sort: bool=True):
def fast_unique_multiple(list arrays, sort: bool = True):
"""
Generate a list of unique values from a list of arrays.

Expand Down Expand Up @@ -277,6 +297,7 @@ def fast_unique_multiple(list arrays, sort: bool=True):
if val not in table:
table[val] = stub
uniques.append(val)

if sort is None:
try:
uniques.sort()
Expand All @@ -289,7 +310,7 @@ def fast_unique_multiple(list arrays, sort: bool=True):

@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple_list(lists: list, sort: bool=True) -> list:
def fast_unique_multiple_list(lists: list, sort: bool = True) -> list:
cdef:
list buf
Py_ssize_t k = len(lists)
Expand Down
16 changes: 14 additions & 2 deletions pandas/_libs/reshape.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,20 @@
import cython
from cython import Py_ssize_t

from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
uint32_t, uint64_t, float32_t, float64_t, ndarray)
from numpy cimport (
float32_t,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
ndarray,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)

cimport numpy as cnp
import numpy as np
from pandas._libs.lib cimport c_is_list_like
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/sparse.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ cdef class BlockIndex(SparseIndex):
ylen = y.blengths

# block may be split, but can't exceed original len / 2 + 1
max_len = int(min(self.length, y.length) / 2) + 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there no cases where int() rounds up?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. int() always truncates toward zero, so it never rounds up; rounding up requires math.ceil or a "hack" like:

number // divider + (number % divider > 0)

For example:

number = 42.01
divider = 8

number / divider # 5.25125
number // divider # 5.0
int(number / divider) # 5
number // divider + (number % divider > 0) # 6.0

max_len = min(self.length, y.length) // 2 + 1
out_bloc = np.empty(max_len, dtype=np.int32)
out_blen = np.empty(max_len, dtype=np.int32)

Expand Down Expand Up @@ -672,7 +672,7 @@ cdef class BlockUnion(BlockMerge):
ystart = self.ystart
yend = self.yend

max_len = int(min(self.x.length, self.y.length) / 2) + 1
max_len = min(self.x.length, self.y.length) // 2 + 1
out_bloc = np.empty(max_len, dtype=np.int32)
out_blen = np.empty(max_len, dtype=np.int32)

Expand Down
60 changes: 34 additions & 26 deletions pandas/_libs/tslibs/period.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from datetime import datetime

from cpython.object cimport (
PyObject_RichCompareBool,
Py_EQ, Py_NE)
from cpython.object cimport PyObject_RichCompareBool, Py_EQ, Py_NE

from numpy cimport int64_t, import_array, ndarray
import numpy as np
Expand All @@ -14,15 +12,25 @@ from libc.string cimport strlen, memset

import cython

from cpython.datetime cimport (PyDateTime_Check, PyDelta_Check, PyDate_Check,
PyDateTime_IMPORT)
from cpython.datetime cimport (
PyDate_Check,
PyDateTime_Check,
PyDateTime_IMPORT,
PyDelta_Check,
)
# import datetime C API
PyDateTime_IMPORT

from pandas._libs.tslibs.np_datetime cimport (
npy_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct,
pandas_datetime_to_datetimestruct, check_dts_bounds,
NPY_DATETIMEUNIT, NPY_FR_D, NPY_FR_us)
npy_datetimestruct,
dtstruct_to_dt64,
dt64_to_dtstruct,
pandas_datetime_to_datetimestruct,
check_dts_bounds,
NPY_DATETIMEUNIT,
NPY_FR_D,
NPY_FR_us,
)

cdef extern from "src/datetime/np_datetime.h":
int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr,
Expand All @@ -37,12 +45,15 @@ from pandas._libs.tslibs.timedeltas import Timedelta
from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds

cimport pandas._libs.tslibs.ccalendar as ccalendar
from pandas._libs.tslibs.ccalendar cimport (
dayofweek, get_day_of_year, is_leapyear)
from pandas._libs.tslibs.ccalendar cimport dayofweek, get_day_of_year, is_leapyear
from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
from pandas._libs.tslibs.frequencies cimport (
get_freq_code, get_base_alias, get_to_timestamp_base, get_freq_str,
get_rule_month)
get_base_alias,
get_freq_code,
get_freq_str,
get_rule_month,
get_to_timestamp_base,
)
from pandas._libs.tslibs.parsing import parse_time_string
from pandas._libs.tslibs.resolution import Resolution
from pandas._libs.tslibs.nattype import nat_strings
Expand All @@ -55,7 +66,7 @@ from pandas._libs.tslibs.tzconversion cimport tz_convert_utc_to_tzlocal

cdef:
enum:
INT32_MIN = -2147483648
INT32_MIN = -2_147_483_648


ctypedef struct asfreq_info:
Expand Down Expand Up @@ -179,8 +190,7 @@ cdef freq_conv_func get_asfreq_func(int from_freq, int to_freq) nogil:
return <freq_conv_func>asfreq_MtoB
elif from_group == FR_WK:
return <freq_conv_func>asfreq_WtoB
elif from_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC,
FR_MS, FR_US, FR_NS]:
elif from_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
return <freq_conv_func>asfreq_DTtoB
else:
return <freq_conv_func>nofunc
Expand Down Expand Up @@ -289,17 +299,15 @@ cdef int64_t DtoB(npy_datetimestruct *dts, int roll_back,
return DtoB_weekday(unix_date)


cdef inline int64_t upsample_daytime(int64_t ordinal,
asfreq_info *af_info) nogil:
if (af_info.is_end):
cdef inline int64_t upsample_daytime(int64_t ordinal, asfreq_info *af_info) nogil:
if af_info.is_end:
return (ordinal + 1) * af_info.intraday_conversion_factor - 1
else:
return ordinal * af_info.intraday_conversion_factor


cdef inline int64_t downsample_daytime(int64_t ordinal,
asfreq_info *af_info) nogil:
return ordinal // (af_info.intraday_conversion_factor)
cdef inline int64_t downsample_daytime(int64_t ordinal, asfreq_info *af_info) nogil:
return ordinal // af_info.intraday_conversion_factor


cdef inline int64_t transform_via_day(int64_t ordinal,
Expand Down Expand Up @@ -1464,24 +1472,24 @@ def extract_freq(ndarray[object] values):

cdef:
Py_ssize_t i, n = len(values)
object p
object value

for i in range(n):
p = values[i]
value = values[i]

try:
# now Timestamp / NaT has freq attr
if is_period_object(p):
return p.freq
if is_period_object(value):
return value.freq
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for avoiding a 1-character variable name

except AttributeError:
pass

raise ValueError('freq not specified and cannot be inferred')


# -----------------------------------------------------------------------
# period helpers


@cython.wraparound(False)
@cython.boundscheck(False)
cdef int64_t[:] localize_dt64arr_to_period(const int64_t[:] stamps,
Expand Down
14 changes: 5 additions & 9 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ cdef dict _parse_code_table = {'y': 0,
'u': 22}


def array_strptime(object[:] values, object fmt,
bint exact=True, errors='raise'):
def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise'):
"""
Calculates the datetime structs represented by the passed array of strings

Expand Down Expand Up @@ -78,12 +77,9 @@ def array_strptime(object[:] values, object fmt,
if fmt is not None:
if '%W' in fmt or '%U' in fmt:
if '%Y' not in fmt and '%y' not in fmt:
raise ValueError("Cannot use '%W' or '%U' without "
"day and year")
if ('%A' not in fmt and '%a' not in fmt and '%w' not
in fmt):
raise ValueError("Cannot use '%W' or '%U' without "
"day and year")
raise ValueError("Cannot use '%W' or '%U' without day and year")
if '%A' not in fmt and '%a' not in fmt and '%w' not in fmt:
raise ValueError("Cannot use '%W' or '%U' without day and year")
elif '%Z' in fmt and '%z' in fmt:
raise ValueError("Cannot parse both %Z and %z")

Expand Down Expand Up @@ -749,6 +745,6 @@ cdef parse_timezone_directive(str z):
microseconds = int(gmtoff_remainder + gmtoff_remainder_padding)

total_minutes = ((hours * 60) + minutes + (seconds // 60) +
(microseconds // 60000000))
(microseconds // 60_000_000))
total_minutes = -total_minutes if z.startswith("-") else total_minutes
return pytz.FixedOffset(total_minutes)
4 changes: 2 additions & 2 deletions pandas/_libs/tslibs/timezones.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ cdef int64_t[:] unbox_utcoffsets(object transinfo):
arr = np.empty(sz, dtype='i8')

for i in range(sz):
arr[i] = int(transinfo[i][0].total_seconds()) * 1000000000
arr[i] = int(transinfo[i][0].total_seconds()) * 1_000_000_000

return arr

Expand All @@ -217,7 +217,7 @@ cdef object get_dst_info(object tz):
if cache_key is None:
# e.g. pytz.FixedOffset, matplotlib.dates._UTC,
# psycopg2.tz.FixedOffsetTimezone
num = int(get_utcoffset(tz, None).total_seconds()) * 1000000000
num = int(get_utcoffset(tz, None).total_seconds()) * 1_000_000_000
return (np.array([NPY_NAT + 1], dtype=np.int64),
np.array([num], dtype=np.int64),
None)
Expand Down