Skip to content

Commit fcde169

Browse files
committed
pivot hourly using a new result shape
1 parent a3d10ba commit fcde169

File tree

3 files changed

+262
-0
lines changed

3 files changed

+262
-0
lines changed

pandas/tseries/pivot.py

+207
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
import numpy as np
2+
3+
from pandas.core.frame import DataFrame
4+
import pandas.core.nanops as nanops
5+
from pandas.tseries.util import isleapyear
6+
from pandas.tseries.index import date_range
7+
8+
def pivot_annual_h(series, freq=None, dt_index=False):
    """
    Pivot an hourly series into one column per year, aligned by hour of year.

    The output has one column for every distinct year found in the index of
    ``series`` and one row for every hour of a leap year (366 * 24 = 8784).
    The first row corresponds to Jan. 1st, 00:00 and the last row to
    Dec. 31st, 23:00. Because every year is laid out on the leap-year grid,
    a given row always refers to the same calendar hour: the 24 rows of
    Feb. 29th are NaN for non-leap years, and all hours from Mar. 1st
    onwards are shifted down by 24 rows for those years.

    Values are placed according to their own timestamp, so years that are
    only partially covered by ``series`` simply leave the missing hours NaN.

    Parameters
    ----------
    series : TimeSeries
        Series with an hourly DatetimeIndex.
    freq : string or None, default None
        Frequency of the series. Upper-cased before use. When None,
        ``series.index.freq`` is used. Only ``'H'`` is implemented.
    dt_index : bool, default False
        If False the rows are labelled with the hour of year, 1..8784.
        If True the rows are labelled with the index returned by
        ``default_rng()``. CAUTION: the year of that index is only a
        placeholder and carries no meaning.

    Returns
    -------
    annual : DataFrame
        Rows are hours of a leap year, columns are years. Integer and
        boolean input is returned as float64 because NaN marks the
        missing hours.

    Raises
    ------
    NotImplementedError
        If the frequency is anything other than hourly. For daily and
        monthly data use ``pandas.tseries.util.pivot_annual``.
    """
    #TODO: reduce number of hardcoded values scattered all around.
    index = series.index
    years = nanops.unique1d(index.year)

    if freq is not None:
        freq = freq.upper()
    else:
        freq = index.freq

    if freq == 'D' or freq == 'M':
        raise NotImplementedError(
            "%s: use pandas.tseries.util.pivot_annual" % freq)
    if not freq == 'H':
        raise NotImplementedError(freq)

    hours_per_day = 24
    # Row position (0-based) of Feb. 29th, 00:00 on the leap-year grid:
    # 31 days of January plus 28 days of February.
    feb29_start = 59 * hours_per_day
    # Number of hours in a leap year.
    total_hoy_leap = year_length('H')

    # NaN cannot be stored in integer/boolean arrays, so promote those.
    dtype = series.dtype
    if dtype.kind in 'iub':
        dtype = np.float64

    values = np.empty((total_hoy_leap, len(years)), dtype=dtype)
    values.fill(np.nan)

    index_year = np.asarray(index.year)
    for col_no, year in enumerate(years):
        # select data for the respective year
        ser_sel = series[index_year == year]
        sel_index = ser_sel.index

        # 0-based hour of year derived from each timestamp itself, so that
        # incomplete years end up in the right rows.
        pos = ((np.asarray(sel_index.dayofyear) - 1) * hours_per_day
               + np.asarray(sel_index.hour))

        if not isleapyear(year):
            # leave the 24 rows of Feb. 29th empty
            pos[pos >= feb29_start] += hours_per_day

        values[pos, col_no] = ser_sel.values

    if dt_index:
        # assign a datetime index, CAUTION: the year is definitely wrong!
        row_index = default_rng()
    else:
        # hour of year, 1-based
        row_index = list(range(1, total_hoy_leap + 1))

    #TODO: use pivot_annual for D & M and minute in the same fashion
    return DataFrame(values, index=row_index, columns=years)
124+
125+
126+
### timeseries pivoting helper
127+
128+
def last_col2front(df, col_no=1):
    """Return ``df`` with its trailing columns moved to the front.

    Parameters
    ----------
    df : DataFrame
    col_no : int, default 1
        Number of trailing columns to move; they keep their relative order.

    Returns
    -------
    DataFrame
        ``df`` re-indexed with the reordered column list.
    """
    names = df.columns.tolist()
    trailing = names[-col_no:]
    leading = names[:-col_no]
    return df[trailing + leading]
139+
140+
141+
def extended_info(df, time_cols=True, aggreg=True, aggreg_func=None,
                  datetime_index=False):
    """Add row-wise aggregates and calendar columns to a timeseries pivot.

    Works on a copy; ``df`` itself is not modified.

    Parameters
    ----------
    df : DataFrame
        Pivot whose rows line up with the index returned by
        ``default_rng()`` (presumably the output of ``pivot_annual_h``;
        confirm against callers).
    time_cols : bool, default True
        Currently unused; the calendar columns are always added.
    aggreg : bool, default True
        If True, append the columns ``'mean'``, ``'sum'``, ``'min'``,
        ``'max'`` and ``'std'``, each computed across the original columns
        of ``df`` (axis 1).
    aggreg_func : object, default None
        Currently unused.
    datetime_index : bool, default False
        Currently unused.

    Returns
    -------
    DataFrame
        Copy of ``df`` with the columns ``'doy'``, ``'month'``, ``'day'``
        and ``'hour'`` (1..24) in front, followed by the original columns
        and, if requested, the aggregates.
    """
    df_extended = df.copy()
    # perform the following only on the data columns
    cols = df_extended.columns
    #TODO: make function be set by argument
    #TODO: Maybe use http://pandas.pydata.org/pandas-docs/dev/basics.html#summarizing-data-describe
    if aggreg:
        # Each aggregate gets a column named after the method that produced
        # it; the standard deviation previously overwrote 'max'.
        for stat in ('mean', 'sum', 'min', 'max', 'std'):
            df_extended[stat] = getattr(df_extended[cols], stat)(1)

    # add some metadata
    #TODO: add function to make index a datetime with the argument above using the rng below
    #TODO: convert the range to lower frequencies and reuse the function.
    rng = default_rng()
    df_extended['doy'] = rng.dayofyear
    df_extended['month'] = rng.month
    df_extended['day'] = rng.day
    # hours are reported 1-based (1..24)
    df_extended['hour'] = rng.hour + 1
    # the four calendar columns were appended last; move them to the front
    df_extended = last_col2front(df_extended, col_no=4)

    return df_extended
175+
176+
###Timeseries convenience / helper functions
177+
178+
179+
def year_length(freq, leap=True):
    """Return the number of periods of frequency ``freq`` in one year.

    Parameters
    ----------
    freq : string
        ``'H'`` for hourly or ``'D'`` for daily.
    leap : bool, default True
        Count for a leap year (366 days) if True, else for a regular
        year (365 days).

    Returns
    -------
    int

    Raises
    ------
    NotImplementedError
        If ``freq`` is not one of the supported frequencies.
    """
    if leap:
        days_of_year = 366
    else:
        days_of_year = 365

    if freq == 'H':
        return 24 * days_of_year
    if freq == 'D':
        return days_of_year

    # Fail with a clear message instead of an unbound local variable.
    raise NotImplementedError(freq)
194+
195+
def default_rng(freq='H', leap=True):
    """Create a default date range covering exactly one calendar year.

    Parameters
    ----------
    freq : string, default 'H'
        Frequency of the range; must be supported by ``year_length``.
    leap : bool, default True
        If True the range covers the leap year 2012, otherwise the
        regular year 2011.

    Returns
    -------
    DatetimeIndex
        Starts on Jan. 1st and has ``year_length(freq, leap=leap)`` periods.
    """
    if leap:
        start = '1/1/2012'
    else:
        start = '1/1/2011'

    # The period count must follow ``leap``; otherwise the non-leap range
    # would spill 24 hours into the following year.
    periods = year_length(freq, leap=leap)
    return date_range(start, periods=periods, freq=freq)

pandas/tseries/tests/test_util.py

+54
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,64 @@
77
import pandas.util.testing as tm
88

99
from pandas.tseries.util import pivot_annual, isleapyear
10+
from pandas.tseries import pivot
1011

1112
class TestPivotAnnual(unittest.TestCase):
1213
"""
1314
New pandas of scikits.timeseries pivot_annual
1415
"""
16+
def test_hourly(self):
    """Check pivot_annual_h on 18 complete hourly years (1994-2011)."""
    # 18 years of 8760 hours plus 24 extra hours for each of the four
    # leap years (1996, 2000, 2004, 2008)
    rng_hourly = date_range('1/1/1994', periods=(18 * 8760 + 4 * 24),
                            freq='H')
    data_hourly = np.random.randint(100, high=350, size=rng_hourly.size)
    data_hourly = data_hourly.astype('float64')
    ts_hourly = Series(data_hourly, index=rng_hourly)

    annual = pivot.pivot_annual_h(ts_hourly, dt_index=True)

    ### general
    def get_mdh(DatetimeIndex, index):
        # (month, day, hour) at the given position
        mdh_tuple = (DatetimeIndex.month[index], DatetimeIndex.day[index],
                     DatetimeIndex.hour[index])
        return mdh_tuple

    # the pivot index carries a placeholder year, so only month, day and
    # hour are compared
    assert get_mdh(ts_hourly.index, 1) == get_mdh(annual.index, 1)
    # is the last date of the ts identical with the date of the last row?
    assert get_mdh(ts_hourly.index, -1) == get_mdh(annual.index,
                                                   (annual.index.size - 1))
    # positional access, because the row labels are datetimes here
    # first value of the ts identical with the first row of the first col?
    assert ts_hourly[0] == annual.values[0, 0]
    # last value of the ts identical with the last row of the last col?
    assert ts_hourly[-1] == annual.values[-1, -1]

    ### index
    ## test if index has the right length (hours of a leap year)
    assert annual.index.size == 8784

    ### leap
    ## test leap offset
    # leap year: 1996 - are the values of the ts and of the pivot the same
    # on Feb. 29th? Only values are compared since the index years differ.
    ser96_leap = ts_hourly[(ts_hourly.index.year == 1996) &
                           (ts_hourly.index.month == 2) &
                           (ts_hourly.index.day == 29)]

    df96 = annual[1996]
    df96_leap = df96[(df96.index.month == 2) & (df96.index.day == 29)]
    np.testing.assert_array_equal(ser96_leap.values, df96_leap.values)

    # non-leap year: 1994 - are all values NaN for day 29.02?
    nan_arr = np.empty(24)
    nan_arr.fill(np.nan)
    df94 = annual[1994]
    df94_noleap = df94[(df94.index.month == 2) & (df94.index.day == 29)]
    np.testing.assert_equal(df94_noleap.values, nan_arr)
63+
### extended functionaliy
64+
65+
66+
67+
1568
def test_daily(self):
1669
rng = date_range('1/1/2000', '12/31/2004', freq='D')
1770
ts = Series(np.random.randn(len(rng)), index=rng)
@@ -33,6 +86,7 @@ def test_daily(self):
3386
leaps.index = leaps.index.year
3487
tm.assert_series_equal(annual[day].dropna(), leaps)
3588

89+
3690
def test_weekly(self):
3791
pass
3892

pandas/tseries/util.py

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from pandas.core.frame import DataFrame
44
import pandas.core.nanops as nanops
5+
from pandas.tseries.util import isleapyear
56

67
def pivot_annual(series, freq=None):
78
"""

0 commit comments

Comments
 (0)