Skip to content

Commit da2b7ad

Browse files
committed
ENH: more frequency inference, unique_deltas function
1 parent 8d599a4 commit da2b7ad

File tree

8 files changed

+300
-11
lines changed

8 files changed

+300
-11
lines changed

pandas/src/datetime.pyx

+42-3
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ ctypedef enum time_res:
4141
# This serves as the box for datetime64
4242
class Timestamp(_Timestamp):
4343

44+
__slots__ = ['value', 'offset']
45+
4446
def __new__(cls, object ts_input, object offset=None, tz=None):
4547
if isinstance(ts_input, float):
4648
# to do, do we want to support this, ie with fractional seconds?
@@ -103,6 +105,37 @@ class Timestamp(_Timestamp):
103105

104106
return Period(self, freq=freq)
105107

108+
#----------------------------------------------------------------------
109+
# Frequency inference
110+
111+
def unique_deltas(ndarray[int64_t] arr):
    """
    Return the distinct differences between consecutive elements of ``arr``
    as a sorted int64 ndarray.

    A khash int64 table is used to remember which differences have already
    been seen, so each distinct value is collected exactly once.
    """
    cdef:
        Py_ssize_t pos, length = len(arr)
        int64_t delta
        khiter_t slot
        kh_int64_t *seen
        int ret = 0
        list found = []

    seen = kh_init_int64()
    kh_resize_int64(seen, 10)
    for pos in range(1, length):
        delta = arr[pos] - arr[pos - 1]
        slot = kh_get_int64(seen, delta)
        # a lookup that lands on n_buckets means the key is absent
        if slot != seen.n_buckets:
            continue
        kh_put_int64(seen, delta, &ret)
        found.append(delta)
    kh_destroy_int64(seen)

    result = np.array(found, dtype=np.int64)
    result.sort()
    return result
133+
134+
135+
cdef inline bint _is_multiple(int64_t us, int64_t mult):
    # True when ``us`` leaves no remainder after division by ``mult``
    cdef int64_t remainder = us % mult
    return remainder == 0
137+
138+
106139
def apply_offset(ndarray[object] values, object offset):
107140
cdef:
108141
Py_ssize_t i, n = len(values)
@@ -843,7 +876,7 @@ def tz_localize_array(ndarray[int64_t] vals, object tz):
843876
return vals
844877

845878
# Accessors
846-
# ------------------------------------------------------------------------------
879+
#----------------------------------------------------------------------
847880

848881
def build_field_sarray(ndarray[int64_t] dtindex):
849882
'''
@@ -966,8 +999,14 @@ def fast_field_accessor(ndarray[int64_t] dtindex, object field):
966999

9671000
raise ValueError("Field %s not supported" % field)
9681001

1002+
1003+
cdef inline int m8_weekday(int64_t val):
    # Build the timestamp object for ``val`` and report its day of week
    return ts_dayofweek(convert_to_tsobject(val))
1006+
1007+
9691008
# Some general helper functions
970-
# ------------------------------------------------------------------------------
1009+
#----------------------------------------------------------------------
9711010

9721011
def isleapyear(int64_t year):
9731012
return is_leapyear(year)
@@ -988,7 +1027,7 @@ cdef inline int64_t ts_dayofweek(_TSObject ts):
9881027
return dayofweek(ts.dtval.year, ts.dtval.month, ts.dtval.day)
9891028

9901029
# Period logic
991-
# ------------------------------------------------------------------------------
1030+
#----------------------------------------------------------------------
9921031

9931032
cdef int64_t apply_mult(int64_t period_ord, int64_t mult):
9941033
"""

pandas/src/khash.h

-1
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ int main() {
114114
#include <limits.h>
115115
#include <Python.h>
116116

117-
/* compipler specific configuration */
118117

119118
#if UINT_MAX == 0xffffffffu
120119
typedef unsigned int khint32_t;

pandas/src/ktypes.h

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#ifndef __KTYPES_H
2+
#define __KTYPES_H
3+
4+
/* compiler specific configuration */
5+
6+
#endif /* __KTYPES_H */

pandas/src/kvec.h

+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
/* The MIT License
2+
3+
Copyright (c) 2008, by Attractive Chaos <[email protected]>
4+
5+
Permission is hereby granted, free of charge, to any person obtaining
6+
a copy of this software and associated documentation files (the
7+
"Software"), to deal in the Software without restriction, including
8+
without limitation the rights to use, copy, modify, merge, publish,
9+
distribute, sublicense, and/or sell copies of the Software, and to
10+
permit persons to whom the Software is furnished to do so, subject to
11+
the following conditions:
12+
13+
The above copyright notice and this permission notice shall be
14+
included in all copies or substantial portions of the Software.
15+
16+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23+
SOFTWARE.
24+
*/
25+
26+
/*
27+
An example:
28+
29+
#include "kvec.h"
30+
int main() {
31+
kvec_t(int) array;
32+
kv_init(array);
33+
kv_push(int, array, 10); // append
34+
kv_a(int, array, 20) = 5; // dynamic
35+
kv_A(array, 20) = 4; // static
36+
kv_destroy(array);
37+
return 0;
38+
}
39+
*/
40+
41+
/*
42+
2008-09-22 (0.1.0):
43+
44+
* The initial version.
45+
46+
*/
47+
48+
#ifndef AC_KVEC_H
49+
#define AC_KVEC_H
50+
51+
#include <stdlib.h>
52+
#include <ktypes.h>
53+
54+
/* Round x up, in place, by smearing its highest set bit into all lower bits
   (covers shifts up to 16, i.e. 32-bit values) and then adding one. */
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))

/* Growable array of `type`: n = elements in use, m = allocated slots,
   a = heap storage (managed with realloc/free). */
#define kvec_t(type) struct { size_t n, m; type *a; }
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
#define kv_destroy(v) free((v).a)
/* Unchecked element access. */
#define kv_A(v, i) ((v).a[(i)])
/* Remove and yield the last element; the vector must not be empty. */
#define kv_pop(v) ((v).a[--(v).n])
#define kv_size(v) ((v).n)
#define kv_max(v) ((v).m)

/* Set the capacity to s slots; the realloc result is not checked. */
#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))

/* Copy the elements of v0 into v1, growing v1 first when needed.
   Uses memcpy, so the expansion site must include <string.h>.
   (No trailing line continuation after `while (0)`: it would splice the
   following source line into the macro.) */
#define kv_copy(type, v1, v0) do { \
		if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
		(v1).n = (v0).n; \
		memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
	} while (0)

/* Append x, doubling the capacity (starting at 2) when the vector is full. */
#define kv_push(type, v, x) do { \
		if ((v).n == (v).m) { \
			(v).m = (v).m? (v).m<<1 : 2; \
			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
		} \
		(v).a[(v).n++] = (x); \
	} while (0)

/* Reserve one more slot and yield a pointer to it. */
#define kv_pushp(type, v) (((v).n == (v).m)? \
		((v).m = ((v).m? (v).m<<1 : 2), \
		 (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
		: 0), ((v).a + ((v).n++))

/* Dynamic access to element i: grows the storage and/or the length so that
   index i is valid.  The length becomes i + 1 (not i) in both growth paths,
   so the element just addressed is counted by kv_size().
   Deliberately left without outer parentheses so that
   `kv_a(type, v, i) = x;` stays a valid assignment statement in C. */
#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \
		((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
		 (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
		: (v).n <= (size_t)(i)? (v).n = (i) + 1 \
		: 0), (v).a[(i)]

/* kv_push is a do/while statement, so it must not be wrapped in
   parentheses here. */
#define kv_int64_push(v, x) kv_push(int64_t, v, x)

/* Concrete int64_t vector with the same layout as kvec_t(int64_t). */
typedef struct {
	size_t n, m;
	int64_t *a;
} kv_int64_t;
97+
98+
#endif

pandas/src/sandbox.pyx

+8-1
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,13 @@ def group_add(ndarray[float64_t, ndim=2] out,
533533
from datetime cimport getAbsTime
534534

535535

536+
# cdef extern from "kvec.h":
537+
538+
# ctypedef struct kv_int64_t:
539+
# size_t n, m
540+
# int64_t *a
541+
542+
543+
536544
def get_abs_time(freq, dailyDate, originalDate):
537545
return getAbsTime(freq, dailyDate, originalDate)
538-

pandas/tseries/frequencies.py

+141-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import re
22

33
from pandas.tseries.offsets import DateOffset
4+
from pandas._tseries import Timestamp
45
import pandas.tseries.offsets as offsets
56

67

@@ -52,7 +53,7 @@ def _get_freq_str(base, mult):
5253
_unknown_freq = 'Unknown'
5354

5455

55-
#-------------------------------------------------------------------------------
56+
#----------------------------------------------------------------------
5657
# Offset names ("time rules") and related functions
5758

5859

@@ -615,3 +616,142 @@ def _period_str_to_code(freqstr):
615616
return _period_code_map[alias]
616617
except:
617618
raise "Could not interpret frequency %s" % freqstr
619+
620+
621+
def infer_freq(index):
    """
    Infer a frequency string from an array of integer timestamps.

    The distinct deltas between consecutive values are computed and the
    smallest one is matched against progressively finer units: days, hours,
    minutes, seconds, milliseconds and finally microseconds.

    Parameters
    ----------
    index : ndarray of int64
        Timestamps, in the same unit as ``_day_us`` (microseconds).

    Returns
    -------
    freq : str or None
        An alias such as ``'D'``, ``'2W-MON'`` or ``'15T'``.  None when the
        spacing is irregular, or when the values are duplicated / not
        increasing.

    Raises
    ------
    ValueError
        If fewer than 3 values are passed.
    """
    # Both helpers are defined in datetime.pyx, the same extension source
    # that provides Timestamp, which this module imports from pandas._tseries.
    from pandas._tseries import build_field_sarray, unique_deltas

    if len(index) < 3:
        raise ValueError('Need at least 3 dates to infer frequency')

    # unique_deltas returns a sorted array, so deltas[0] is the smallest step
    deltas = unique_deltas(index)
    smallest = deltas[0]
    is_unique = len(deltas) == 1

    if smallest <= 0:
        # Repeated or decreasing timestamps: there is no frequency to report
        # (a zero delta would otherwise be classified as weekly/daily).
        return None

    if _is_multiple(smallest, _day_us):
        rstamp = Timestamp(index[0])

        if is_unique:
            days = smallest // _day_us
            if days % 7 == 0:
                # Weekly: anchor on the weekday of the first timestamp.
                # NOTE(review): relies on Timestamp exposing
                # datetime.weekday() (Monday == 0), which matches the keys
                # of _weekday_rule_aliases -- confirm.
                alias = _weekday_rule_aliases[rstamp.weekday()]
                return _maybe_add_count('W-%s' % alias, days // 7)
            return _maybe_add_count('D', days)

        fields = build_field_sarray(index)
        day_list = [x // _day_us for x in deltas]

        annual_rule = _get_annual_rule(fields)
        if annual_rule:
            nyears = day_list[0] // 365
            month = _month_aliases[rstamp.month]
            return _maybe_add_count('%s-%s' % (annual_rule, month), nyears)

        # NOTE(review): _get_quarterly_rule is not defined in the visible
        # part of this module -- confirm it exists before relying on this
        # branch.
        quarterly_rule = _get_quarterly_rule(fields)
        if quarterly_rule:
            month = _month_aliases[rstamp.month]
            return '%s-%s' % (quarterly_rule, month)

        # TODO: quarterly-by-delta, monthly and business-daily inference are
        # not implemented yet (_is_quarterly_deltas and _is_monthly_deltas
        # are still stubs).
        return None

    # Sub-daily data is only recognised when perfectly regular
    if not is_unique:
        return None

    sub_daily_units = (
        (60 * 60 * 1000000, 'H'),  # hours
        (60 * 1000000, 'T'),       # minutes
        (1000000, 'S'),            # seconds
        (1000, 'L'),               # milliseconds
        (1, 'U'),                  # microseconds (always matches)
    )
    for unit_us, code in sub_daily_units:
        if _is_multiple(smallest, unit_us):
            return '%d%s' % (smallest // unit_us, code)
693+
694+
695+
import pandas.core.algorithms as algos
696+
697+
698+
def _get_annual_rule(fields):
    """
    Check whether date fields are consistent with an annual frequency.

    Work in progress: no rule name is produced yet, so the result is never
    truthy.

    Parameters
    ----------
    fields : structured array
        Must provide the 'Y', 'M' and 'D' fields.

    Returns
    -------
    False when the year-to-year differences are not all identical,
    otherwise None.
    """
    # infer_freq only imports unique_deltas into its own local scope, so it
    # has to be brought into scope here as well.  It is defined in
    # datetime.pyx, the extension source that also provides Timestamp
    # (imported by this module from pandas._tseries).
    from pandas._tseries import unique_deltas

    years = fields['Y']
    months = fields['M']
    days = fields['D']

    ydiffs = unique_deltas(years.astype('i8'))
    if len(ydiffs) > 1:
        return False

    if len(algos.unique(months)) == 1:
        # NOTE(review): _all_last_weekday is not defined in the visible part
        # of this module -- confirm it exists.
        if _all_last_weekday(years, months, days):
            # TODO: return the matching annual rule alias
            return None

    return None
710+
711+
712+
def _is_quarterly_deltas(day_list):
713+
pass
714+
715+
def _is_monthly_deltas(day_list):
716+
pass
717+
718+
def _is_business_years(index):
719+
pass
720+
721+
def _maybe_add_count(base, count):
722+
if count > 1:
723+
return '%d%s' % (count, base)
724+
else:
725+
return base
726+
727+
728+
729+
_weekday_rule_aliases = {
730+
0: 'MON',
731+
1: 'TUE',
732+
2: 'WED',
733+
3: 'THU',
734+
4: 'FRI',
735+
5: 'SAT',
736+
6: 'SUN'
737+
}
738+
739+
_month_aliases = {
740+
1: 'JAN',
741+
2: 'FEB',
742+
3: 'MAR',
743+
4: 'APR',
744+
5: 'MAY',
745+
6: 'JUN',
746+
7: 'JUL',
747+
8: 'AUG',
748+
9: 'SEP',
749+
10: 'OCT',
750+
11: 'NOV',
751+
12: 'DEC'
752+
}
753+
754+
def _is_multiple(us, mult):
755+
return us % mult == 0
756+
757+
# One day expressed in microseconds: 24 h * 60 min * 60 s * 1,000,000 us
_day_us = 24 * 60 * 60 * 1000000

pandas/tseries/index.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55

66
from pandas.core.index import Index, Int64Index
7+
from pandas.tseries.frequencies import infer_freq
78
from pandas.tseries.tools import parse_time_string
89
from pandas.util.decorators import cache_readonly
910
import pandas.core.common as com
@@ -841,8 +842,7 @@ def freq(self):
841842

842843
@cache_readonly
843844
def inferred_freq(self):
844-
import pandas._sandbox as sbx
845-
return sbx.infer_freq(self.asi8)
845+
return infer_freq(self.asi8)
846846

847847
@property
848848
def freqstr(self):

0 commit comments

Comments
 (0)