Skip to content

Commit e0b81d4

Browse files
jbrockmendel authored and jreback committed
Move FrequencyInferer out of libresolution (pandas-dev#21992)
1 parent 2d0c961 commit e0b81d4

File tree

3 files changed

+304
-291
lines changed

3 files changed

+304
-291
lines changed

pandas/_libs/algos.pyx

+41
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,11 @@ cnp.import_array()
2222
cimport util
2323
from util cimport numeric, get_nat
2424

25+
from khash cimport (khiter_t,
26+
kh_destroy_int64, kh_put_int64,
27+
kh_init_int64, kh_int64_t,
28+
kh_resize_int64, kh_get_int64)
29+
2530
import missing
2631

2732
cdef float64_t FP_ERR = 1e-13
@@ -71,6 +76,42 @@ class NegInfinity(object):
7176
__ge__ = lambda self, other: isinstance(other, NegInfinity)
7277

7378

79+
cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
80+
"""
81+
Efficiently find the unique first-differences of the given array.
82+
83+
Parameters
84+
----------
85+
arr : ndarray[int64_t]
86+
87+
Returns
88+
-------
89+
result : ndarray[int64_t]
90+
result is sorted
91+
"""
92+
cdef:
93+
Py_ssize_t i, n = len(arr)
94+
int64_t val
95+
khiter_t k
96+
kh_int64_t *table
97+
int ret = 0
98+
list uniques = []
99+
100+
table = kh_init_int64()
101+
kh_resize_int64(table, 10)
102+
for i in range(n - 1):
103+
val = arr[i + 1] - arr[i]
104+
k = kh_get_int64(table, val)
105+
if k == table.n_buckets:
106+
kh_put_int64(table, val, &ret)
107+
uniques.append(val)
108+
kh_destroy_int64(table)
109+
110+
result = np.array(uniques, dtype=np.int64)
111+
result.sort()
112+
return result
113+
114+
74115
@cython.wraparound(False)
75116
@cython.boundscheck(False)
76117
def is_lexsorted(list list_of_arrays):

pandas/_libs/tslibs/resolution.pyx

+2-287
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22
# cython: profile=False
33

4+
cimport cython
45
from cython cimport Py_ssize_t
56

67
import numpy as np
@@ -10,23 +11,12 @@ cnp.import_array()
1011

1112
from util cimport is_string_object, get_nat
1213

13-
from pandas._libs.khash cimport (khiter_t,
14-
kh_destroy_int64, kh_put_int64,
15-
kh_init_int64, kh_int64_t,
16-
kh_resize_int64, kh_get_int64)
17-
1814
from np_datetime cimport npy_datetimestruct, dt64_to_dtstruct
1915
from frequencies cimport get_freq_code
2016
from timezones cimport (is_utc, is_tzlocal,
2117
maybe_get_tz, get_dst_info)
22-
from fields import build_field_sarray
23-
from conversion import tz_convert
2418
from conversion cimport tz_convert_utc_to_tzlocal
25-
from ccalendar import MONTH_ALIASES, int_to_weekday
2619
from ccalendar cimport get_days_in_month
27-
from timestamps import Timestamp
28-
29-
from pandas._libs.properties import cache_readonly
3020

3121
# ----------------------------------------------------------------------
3222
# Constants
@@ -41,13 +31,6 @@ cdef int RESO_MIN = 4
4131
cdef int RESO_HR = 5
4232
cdef int RESO_DAY = 6
4333

44-
_ONE_MICRO = <int64_t>1000L
45-
_ONE_MILLI = <int64_t>(_ONE_MICRO * 1000)
46-
_ONE_SECOND = <int64_t>(_ONE_MILLI * 1000)
47-
_ONE_MINUTE = <int64_t>(60 * _ONE_SECOND)
48-
_ONE_HOUR = <int64_t>(60 * _ONE_MINUTE)
49-
_ONE_DAY = <int64_t>(24 * _ONE_HOUR)
50-
5134
# ----------------------------------------------------------------------
5235

5336
cpdef resolution(ndarray[int64_t] stamps, tz=None):
@@ -331,31 +314,7 @@ class Resolution(object):
331314
# ----------------------------------------------------------------------
332315
# Frequency Inference
333316

334-
cdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
335-
cdef:
336-
Py_ssize_t i, n = len(arr)
337-
int64_t val
338-
khiter_t k
339-
kh_int64_t *table
340-
int ret = 0
341-
list uniques = []
342-
343-
table = kh_init_int64()
344-
kh_resize_int64(table, 10)
345-
for i in range(n - 1):
346-
val = arr[i + 1] - arr[i]
347-
k = kh_get_int64(table, val)
348-
if k == table.n_buckets:
349-
kh_put_int64(table, val, &ret)
350-
uniques.append(val)
351-
kh_destroy_int64(table)
352-
353-
result = np.array(uniques, dtype=np.int64)
354-
result.sort()
355-
return result
356-
357-
358-
cdef object month_position_check(fields, weekdays):
317+
def month_position_check(fields, weekdays):
359318
cdef:
360319
int32_t daysinmonth, y, m, d
361320
bint calendar_end = True
@@ -397,247 +356,3 @@ cdef object month_position_check(fields, weekdays):
397356
return 'bs'
398357
else:
399358
return None
400-
401-
402-
cdef inline bint _is_multiple(int64_t us, int64_t mult):
403-
return us % mult == 0
404-
405-
406-
cdef inline str _maybe_add_count(str base, int64_t count):
407-
if count != 1:
408-
return '{count}{base}'.format(count=count, base=base)
409-
else:
410-
return base
411-
412-
413-
cdef class _FrequencyInferer(object):
414-
"""
415-
Not sure if I can avoid the state machine here
416-
"""
417-
cdef public:
418-
object index
419-
object values
420-
bint warn
421-
bint is_monotonic
422-
dict _cache
423-
424-
def __init__(self, index, warn=True):
425-
self.index = index
426-
self.values = np.asarray(index).view('i8')
427-
428-
# This moves the values, which are implicitly in UTC, to the
429-
# the timezone so they are in local time
430-
if hasattr(index, 'tz'):
431-
if index.tz is not None:
432-
self.values = tz_convert(self.values, 'UTC', index.tz)
433-
434-
self.warn = warn
435-
436-
if len(index) < 3:
437-
raise ValueError('Need at least 3 dates to infer frequency')
438-
439-
self.is_monotonic = (self.index.is_monotonic_increasing or
440-
self.index.is_monotonic_decreasing)
441-
442-
@cache_readonly
443-
def deltas(self):
444-
return unique_deltas(self.values)
445-
446-
@cache_readonly
447-
def deltas_asi8(self):
448-
return unique_deltas(self.index.asi8)
449-
450-
@cache_readonly
451-
def is_unique(self):
452-
return len(self.deltas) == 1
453-
454-
@cache_readonly
455-
def is_unique_asi8(self):
456-
return len(self.deltas_asi8) == 1
457-
458-
def get_freq(self):
459-
if not self.is_monotonic or not self.index.is_unique:
460-
return None
461-
462-
delta = self.deltas[0]
463-
if _is_multiple(delta, _ONE_DAY):
464-
return self._infer_daily_rule()
465-
else:
466-
# Business hourly, maybe. 17: one day / 65: one weekend
467-
if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
468-
return 'BH'
469-
# Possibly intraday frequency. Here we use the
470-
# original .asi8 values as the modified values
471-
# will not work around DST transitions. See #8772
472-
elif not self.is_unique_asi8:
473-
return None
474-
delta = self.deltas_asi8[0]
475-
if _is_multiple(delta, _ONE_HOUR):
476-
# Hours
477-
return _maybe_add_count('H', delta / _ONE_HOUR)
478-
elif _is_multiple(delta, _ONE_MINUTE):
479-
# Minutes
480-
return _maybe_add_count('T', delta / _ONE_MINUTE)
481-
elif _is_multiple(delta, _ONE_SECOND):
482-
# Seconds
483-
return _maybe_add_count('S', delta / _ONE_SECOND)
484-
elif _is_multiple(delta, _ONE_MILLI):
485-
# Milliseconds
486-
return _maybe_add_count('L', delta / _ONE_MILLI)
487-
elif _is_multiple(delta, _ONE_MICRO):
488-
# Microseconds
489-
return _maybe_add_count('U', delta / _ONE_MICRO)
490-
else:
491-
# Nanoseconds
492-
return _maybe_add_count('N', delta)
493-
494-
@cache_readonly
495-
def day_deltas(self):
496-
return [x / _ONE_DAY for x in self.deltas]
497-
498-
@cache_readonly
499-
def hour_deltas(self):
500-
return [x / _ONE_HOUR for x in self.deltas]
501-
502-
@cache_readonly
503-
def fields(self):
504-
return build_field_sarray(self.values)
505-
506-
@cache_readonly
507-
def rep_stamp(self):
508-
return Timestamp(self.values[0])
509-
510-
cdef object month_position_check(self):
511-
return month_position_check(self.fields, self.index.dayofweek)
512-
513-
@cache_readonly
514-
def mdiffs(self):
515-
nmonths = self.fields['Y'] * 12 + self.fields['M']
516-
return unique_deltas(nmonths.astype('i8'))
517-
518-
@cache_readonly
519-
def ydiffs(self):
520-
return unique_deltas(self.fields['Y'].astype('i8'))
521-
522-
cdef _infer_daily_rule(self):
523-
annual_rule = self._get_annual_rule()
524-
if annual_rule:
525-
nyears = self.ydiffs[0]
526-
month = MONTH_ALIASES[self.rep_stamp.month]
527-
alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month)
528-
return _maybe_add_count(alias, nyears)
529-
530-
quarterly_rule = self._get_quarterly_rule()
531-
if quarterly_rule:
532-
nquarters = self.mdiffs[0] / 3
533-
mod_dict = {0: 12, 2: 11, 1: 10}
534-
month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]]
535-
alias = '{prefix}-{month}'.format(prefix=quarterly_rule,
536-
month=month)
537-
return _maybe_add_count(alias, nquarters)
538-
539-
monthly_rule = self._get_monthly_rule()
540-
if monthly_rule:
541-
return _maybe_add_count(monthly_rule, self.mdiffs[0])
542-
543-
if self.is_unique:
544-
days = self.deltas[0] / _ONE_DAY
545-
if days % 7 == 0:
546-
# Weekly
547-
day = int_to_weekday[self.rep_stamp.weekday()]
548-
return _maybe_add_count('W-{day}'.format(day=day), days / 7)
549-
else:
550-
return _maybe_add_count('D', days)
551-
552-
if self._is_business_daily():
553-
return 'B'
554-
555-
wom_rule = self._get_wom_rule()
556-
if wom_rule:
557-
return wom_rule
558-
559-
cdef _get_annual_rule(self):
560-
if len(self.ydiffs) > 1:
561-
return None
562-
563-
# lazy import to prevent circularity
564-
# TODO: Avoid non-cython dependency
565-
from pandas.core.algorithms import unique
566-
567-
if len(unique(self.fields['M'])) > 1:
568-
return None
569-
570-
pos_check = self.month_position_check()
571-
return {'cs': 'AS', 'bs': 'BAS',
572-
'ce': 'A', 'be': 'BA'}.get(pos_check)
573-
574-
cdef _get_quarterly_rule(self):
575-
if len(self.mdiffs) > 1:
576-
return None
577-
578-
if not self.mdiffs[0] % 3 == 0:
579-
return None
580-
581-
pos_check = self.month_position_check()
582-
return {'cs': 'QS', 'bs': 'BQS',
583-
'ce': 'Q', 'be': 'BQ'}.get(pos_check)
584-
585-
cdef _get_monthly_rule(self):
586-
if len(self.mdiffs) > 1:
587-
return None
588-
pos_check = self.month_position_check()
589-
return {'cs': 'MS', 'bs': 'BMS',
590-
'ce': 'M', 'be': 'BM'}.get(pos_check)
591-
592-
cdef bint _is_business_daily(self):
593-
# quick check: cannot be business daily
594-
if self.day_deltas != [1, 3]:
595-
return False
596-
597-
# probably business daily, but need to confirm
598-
first_weekday = self.index[0].weekday()
599-
shifts = np.diff(self.index.asi8)
600-
shifts = np.floor_divide(shifts, _ONE_DAY)
601-
weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
602-
return np.all(((weekdays == 0) & (shifts == 3)) |
603-
((weekdays > 0) & (weekdays <= 4) & (shifts == 1)))
604-
605-
cdef _get_wom_rule(self):
606-
# wdiffs = unique(np.diff(self.index.week))
607-
# We also need -47, -49, -48 to catch index spanning year boundary
608-
# if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
609-
# return None
610-
611-
# lazy import to prevent circularity
612-
# TODO: Avoid non-cython dependency
613-
from pandas.core.algorithms import unique
614-
615-
weekdays = unique(self.index.weekday)
616-
if len(weekdays) > 1:
617-
return None
618-
619-
week_of_months = unique((self.index.day - 1) // 7)
620-
# Only attempt to infer up to WOM-4. See #9425
621-
week_of_months = week_of_months[week_of_months < 4]
622-
if len(week_of_months) == 0 or len(week_of_months) > 1:
623-
return None
624-
625-
# get which week
626-
week = week_of_months[0] + 1
627-
wd = int_to_weekday[weekdays[0]]
628-
629-
return 'WOM-{week}{weekday}'.format(week=week, weekday=wd)
630-
631-
632-
cdef class _TimedeltaFrequencyInferer(_FrequencyInferer):
633-
634-
cdef _infer_daily_rule(self):
635-
if self.is_unique:
636-
days = self.deltas[0] / _ONE_DAY
637-
if days % 7 == 0:
638-
# Weekly
639-
wd = int_to_weekday[self.rep_stamp.weekday()]
640-
alias = 'W-{weekday}'.format(weekday=wd)
641-
return _maybe_add_count(alias, days / 7)
642-
else:
643-
return _maybe_add_count('D', days)

0 commit comments

Comments (0)