Skip to content

standalone implementation of ccalendar #18540

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into from
Dec 7, 2017
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
4e942f5
implement ccalendar. only use it incrementally for profiling
jbrockmendel Nov 26, 2017
6cbb4be
Fix nameerror
jbrockmendel Nov 26, 2017
7f22c6d
fixup signature
jbrockmendel Nov 26, 2017
94557a7
next run with just is_leapyear changed
jbrockmendel Nov 26, 2017
ce8e3d5
then try with just get_days_in_month
jbrockmendel Nov 27, 2017
344252e
fixup typo
jbrockmendel Nov 27, 2017
a084fb3
use all of dayofweek, get_days_in_monht, is_leapyear in fields
jbrockmendel Nov 27, 2017
1b74668
try to avoid re-initializing sakamoto_arr
jbrockmendel Nov 27, 2017
bc4da9e
Merge branch 'master' of https://github.com/pandas-dev/pandas into ts…
jbrockmendel Nov 27, 2017
fdf280e
implement get_week_of_year
jbrockmendel Nov 28, 2017
f8cc945
punctuation
jbrockmendel Nov 28, 2017
0cfd9a4
Merge branch 'master' of https://github.com/pandas-dev/pandas into ts…
jbrockmendel Nov 28, 2017
a29950b
use get_week_of_year
jbrockmendel Nov 28, 2017
3d07a2a
whitesapce fixup
jbrockmendel Nov 28, 2017
b16f443
whitesapce fixup
jbrockmendel Nov 28, 2017
9ec475a
comment
jbrockmendel Nov 28, 2017
b221710
fixup typo
jbrockmendel Nov 28, 2017
d91ec05
fix 12-->13
jbrockmendel Nov 28, 2017
8f78992
flesh out docstring
jbrockmendel Nov 28, 2017
32be4f4
implement ccalendar fastpaths for timestamp properties
jbrockmendel Nov 28, 2017
3fda037
Merge branch 'master' of https://github.com/pandas-dev/pandas into ts…
jbrockmendel Nov 28, 2017
03a854d
Merge branch 'master' of https://github.com/pandas-dev/pandas into ts…
jbrockmendel Nov 29, 2017
d0f8b13
fixup comment typo
jbrockmendel Nov 29, 2017
505fa9e
Merge branch 'master' of https://github.com/pandas-dev/pandas into ts…
jbrockmendel Nov 29, 2017
0add03e
remove monthrange per reviewer request
jbrockmendel Nov 29, 2017
b66caa6
remove monthrange from pxd
jbrockmendel Nov 29, 2017
7cc0824
Merge branch 'master' of https://github.com/pandas-dev/pandas into ts…
jbrockmendel Dec 3, 2017
6b245cf
Merge branch 'master' of https://github.com/pandas-dev/pandas into ts…
jbrockmendel Dec 7, 2017
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions pandas/_libs/tslibs/ccalendar.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
# cython: profile=False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this only used in fields? no-where else?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only implemented in fields. Haven't updated others yet. A couple places in offsets, a few more fastpaths in Timestamp.


from cython cimport Py_ssize_t

from numpy cimport int64_t, int32_t


cpdef monthrange(int64_t year, Py_ssize_t month)

cdef int dayofweek(int y, int m, int m) nogil
cdef bint is_leapyear(int64_t year) nogil
cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil
cpdef int32_t get_week_of_year(int year, int month, int day) nogil
188 changes: 188 additions & 0 deletions pandas/_libs/tslibs/ccalendar.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# -*- coding: utf-8 -*-
# cython: profile=False
# cython: boundscheck=False
"""
Cython implementations of functions resembling the stdlib calendar module
"""

cimport cython
from cython cimport Py_ssize_t

import numpy as np
cimport numpy as np
from numpy cimport int64_t, int32_t
np.import_array()


# ----------------------------------------------------------------------
# Constants

# Slightly more performant cython lookups than a 2D table
# The first 12 entries correspond to month lengths for non-leap years.
# The remaining 12 entries give month lengths for leap years
cdef int32_t* days_per_month_array = [
31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31,
31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

cdef int* sakamoto_arr = [0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4]

# The first 12 entries give the month days elapsed as of the first of month N
# in non-leap years. The remaining 12 entries give the days elapsed in leap
# years.
cdef int32_t* _month_offset = [
0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365,
0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366]

# ----------------------------------------------------------------------


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef inline int32_t get_days_in_month(int year, Py_ssize_t month) nogil:
"""Return the number of days in the given month of the given year.

Parameters
----------
year : int
month : int

Returns
-------
days_in_month : int

Notes
-----
Assumes that the arguments are valid. Passing a month not between 1 and 12
risks a segfault.
"""
return days_per_month_array[12 * is_leapyear(year) + month - 1]


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef monthrange(int64_t year, Py_ssize_t month):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need this as a function that returns a tuple. its it simply better to split into 2, e.g. monthday and dayofweek? its 2 functions, but then these become all nogil

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those two separate functions also exist here. There's not much reason to retain monthrange, you're right.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are you changing this in this PR?

"""
Return tuple containing the weekday of the first day of the month and
the number of days in the month.

Parameters
----------
year : int
month : int

Returns
-------
weekday : int
days_in_month : int

Raises
------
ValueError if month is invalid
"""
cdef:
int32_t days

if month < 1 or month > 12:
raise ValueError("bad month number 0; must be 1-12")

days = get_days_in_month(year, month)
return (dayofweek(year, month, 1), days)


@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision
cdef int dayofweek(int y, int m, int d) nogil:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems you are duplicating things from np_datetime.c ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems you are duplicating things from np_datetime.c ?

@jorisvandenbossche think porting less than duplicating. The np_datetime.c version is never actually used in src/datetime (will remove old version in follow-up). As long as perf doesn't take a hit, the default should by python/cython right?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I don't understand your explanation. Previously it was imported it from np_datetime.c, and you reimplemented it here and changed that import. That seems like duplication rather than porting?

dayofweek is maybe not used in np_datetime.c itself, but is_leapyear is, so that one cannot just be removed there.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That seems like duplication rather than porting?

You're right. The intention is to remove the np_datetime.c version in a follow-up PR.

is_leapyear will be duplicated because it is used in np_datetime.c, as you noted. The idea is that np_datetime.c* is a heavy dependency (in practice tends to come with tslibs.np_datetime and np_datetime_strings.c) which can be replaced with a much lighter (and more pythonic) ccalendar dependency in some cases.

* The name overlap between np_datetime.c and tslibs.np_datetime is not my favorite.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rather include extra deps than duplicate code. mark this for a followup.

"""Find the day of week for the date described by the Y/M/D triple y, m, d
using Sakamoto's method, from wikipedia.

https://en.wikipedia.org/wiki/\
Determination_of_the_day_of_the_week#Sakamoto.27s_methods

Parameters
----------
y : int
m : int
d : int

Returns
-------
weekday : int

Notes
-----
Assumes that y, m, d, represents a valid date.
"""
cdef:
int day

y -= m < 3
day = (y + y / 4 - y / 100 + y / 400 + sakamoto_arr[m - 1] + d) % 7
# convert to python day
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a link to python dayofweek docs (e.g. its sunday 0, -6)

return (day + 6) % 7


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

on get_week_of_year you have boundscheck/wraparound but not here, do we need in either?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this comment about is_leapyear? If so, is_leapyear doesn't do any array lookups, so those decorators are irrelevant.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea

cdef bint is_leapyear(int64_t year) nogil:
"""Returns 1 if the given year is a leap year, 0 otherwise.

Parameters
----------
year : int

Returns
-------
is_leap : bool
"""
return ((year & 0x3) == 0 and # year % 4 == 0
((year % 100) != 0 or (year % 400) == 0))


@cython.wraparound(False)
@cython.boundscheck(False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you changing the boundscheck/wraparound in this PR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hadn't planned on it. Are they harmful?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prob not harmful but extra code and confusing. pls change.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well in get_week_of_year there are array lookups, so disabling these checks should be perf-relevant, shouldn't they?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where are the array lookups?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 174 lookup in month_offset

cpdef int32_t get_week_of_year(int year, int month, int day) nogil:
"""Return the ordinal week-of-year for the given day.

Parameters
----------
year : int
month : int
day : int

Returns
-------
week_of_year : int32_t

Notes
-----
Assumes the inputs describe a valid date.
"""
cdef:
bint isleap, isleap_prev
int32_t mo_off
int32_t doy, dow
int woy

isleap = is_leapyear(year)
isleap_prev = is_leapyear(year - 1)

mo_off = _month_offset[isleap * 13 + month - 1]

doy = mo_off + day
dow = dayofweek(year, month, day)

# estimate
woy = (doy - 1) - dow + 3
if woy >= 0:
woy = woy / 7 + 1

# verify
if woy < 0:
if (woy > -2) or (woy == -2 and isleap_prev):
woy = 53
else:
woy = 52
elif woy == 53:
if 31 - day + dow < 3:
woy = 1

return woy
36 changes: 6 additions & 30 deletions pandas/_libs/tslibs/fields.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ from numpy cimport ndarray, int64_t, int32_t, int8_t
np.import_array()


from ccalendar cimport (get_days_in_month, is_leapyear, dayofweek,
get_week_of_year)
from np_datetime cimport (pandas_datetimestruct, pandas_timedeltastruct,
dt64_to_dtstruct, td64_to_tdstruct,
days_per_month_table, is_leapyear, dayofweek)
dt64_to_dtstruct, td64_to_tdstruct)
from nattype cimport NPY_NAT


Expand Down Expand Up @@ -379,7 +380,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
ndarray[int32_t, ndim=2] _month_offset
int isleap, isleap_prev
pandas_datetimestruct dts
int mo_off, doy, dow, woy
int mo_off, doy, dow

_month_offset = np.array(
[[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ],
Expand Down Expand Up @@ -507,28 +508,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
continue

dt64_to_dtstruct(dtindex[i], &dts)
isleap = is_leapyear(dts.year)
isleap_prev = is_leapyear(dts.year - 1)
mo_off = _month_offset[isleap, dts.month - 1]
doy = mo_off + dts.day
dow = dayofweek(dts.year, dts.month, dts.day)

# estimate
woy = (doy - 1) - dow + 3
if woy >= 0:
woy = woy / 7 + 1

# verify
if woy < 0:
if (woy > -2) or (woy == -2 and isleap_prev):
woy = 53
else:
woy = 52
elif woy == 53:
if 31 - dts.day + dow < 3:
woy = 1

out[i] = woy
out[i] = get_week_of_year(dts.year, dts.month, dts.day)
return out

elif field == 'q':
Expand All @@ -551,7 +531,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
continue

dt64_to_dtstruct(dtindex[i], &dts)
out[i] = days_in_month(dts)
out[i] = get_days_in_month(dts.year, dts.month)
return out
elif field == 'is_leap_year':
return isleapyear_arr(get_date_field(dtindex, 'Y'))
Expand Down Expand Up @@ -676,10 +656,6 @@ def get_timedelta_field(ndarray[int64_t] tdindex, object field):
raise ValueError("Field %s not supported" % field)


cdef inline int days_in_month(pandas_datetimestruct dts) nogil:
return days_per_month_table[is_leapyear(dts.year)][dts.month - 1]


cpdef isleapyear_arr(ndarray years):
"""vectorized version of isleapyear; NaT evaluates as False"""
cdef:
Expand Down
6 changes: 5 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ class CheckSDist(sdist_class):
'pandas/_libs/skiplist.pyx',
'pandas/_libs/sparse.pyx',
'pandas/_libs/parsers.pyx',
'pandas/_libs/tslibs/ccalendar.pyx',
'pandas/_libs/tslibs/strptime.pyx',
'pandas/_libs/tslibs/np_datetime.pyx',
'pandas/_libs/tslibs/timedeltas.pyx',
Expand Down Expand Up @@ -562,6 +563,8 @@ def pxd(name):
'_libs/tslibs/nattype'],
'depends': tseries_depends,
'sources': np_datetime_sources},
'_libs.tslibs.ccalendar': {
'pyxfile': '_libs/tslibs/ccalendar'},
'_libs.tslibs.conversion': {
'pyxfile': '_libs/tslibs/conversion',
'pxdfiles': ['_libs/src/util',
Expand All @@ -572,7 +575,8 @@ def pxd(name):
'sources': np_datetime_sources},
'_libs.tslibs.fields': {
'pyxfile': '_libs/tslibs/fields',
'pxdfiles': ['_libs/tslibs/nattype'],
'pxdfiles': ['_libs/tslibs/ccalendar',
'_libs/tslibs/nattype'],
'depends': tseries_depends,
'sources': np_datetime_sources},
'_libs.tslibs.frequencies': {
Expand Down