From fcde169a02af35e0b46d588bbecf85715d12857d Mon Sep 17 00:00:00 2001
From: TimMi
Date: Thu, 1 Nov 2012 16:14:23 +0100
Subject: [PATCH 1/5] pivot hourly using a new result shape

---
 pandas/tseries/pivot.py           | 207 ++++++++++++++++++++++++++++++
 pandas/tseries/tests/test_util.py |  54 ++++++++
 pandas/tseries/util.py            |   1 +
 3 files changed, 262 insertions(+)
 create mode 100644 pandas/tseries/pivot.py

diff --git a/pandas/tseries/pivot.py b/pandas/tseries/pivot.py
new file mode 100644
index 0000000000000..50fd60b2a03ff
--- /dev/null
+++ b/pandas/tseries/pivot.py
@@ -0,0 +1,207 @@
+import numpy as np
+
+from pandas.core.frame import DataFrame
+import pandas.core.nanops as nanops
+from pandas.tseries.util import isleapyear
+from pandas.tseries.index import date_range
+
+def pivot_annual_h(series, freq=None, dt_index=False):
+    """
+    Group a series by years, taking leap years into account.
+
+    The output has as many rows as distinct years in the original series,
+    and as many columns as the length of a leap year in the units corresponding
+    to the original frequency (366 for daily frequency, 366*24 for hourly...).
+    The first column of the output corresponds to Jan. 1st, 00:00:00,
+    while the last column corresponds to Dec. 31st, 23:59:59.
+    Entries corresponding to Feb. 29th are masked for non-leap years.
+
+    For example, if the initial series has a daily frequency, the 59th column
+    of the output always corresponds to Feb. 28th, the 61st column to Mar. 1st,
+    and the 60th column is masked for non-leap years.
+    With an hourly initial frequency, the (59*24)th column of the output always
+    corresponds to Feb. 28th 23:00, the (61*24)th column to Mar. 1st, 00:00, and
+    the 24 columns between (59*24) and (61*24) are masked.
+
+    If the original frequency is less than daily, the output is equivalent to
+    ``series.convert('A', func=None)``.
+
+    Parameters
+    ----------
+    series : TimeSeries
+    freq : string or None, default None
+
+    Returns
+    -------
+    annual : DataFrame
+
+
+    """
+    #TODO: test like original pandas and the position of first and last value in arrays
+    #TODO: reduce number of hardcoded values scattered all around.
+    index = series.index
+    year = index.year
+    years = nanops.unique1d(year)
+
+    if freq is not None:
+        freq = freq.upper()
+    else:
+        freq = series.index.freq
+
+    if freq == 'H':
+
+        ##basics
+
+        #integer value of sum of all hours in a leap year
+        total_hoy_leap = (year_length(series.index.freqstr))
+
+        #list of all hours in a leap year
+        hoy_leap_list = range(1, (total_hoy_leap + 1 ))
+
+
+
+        values = np.empty((total_hoy_leap, len(years)), dtype=series.dtype)
+        values.fill(np.nan)
+
+        dummy_df = DataFrame(values, index=hoy_leap_list,
+                             columns=years)
+
+        ##get offset for leap hours
+
+        #see:
+        #http://stackoverflow.com/questions/2004364/increment-numpy-array-with-repeated-indices
+        #1994-02-28 23:00:00 -> index 1415
+        ind_z = np.array(range(0, 8760))
+        ind_i = np.array(range(1416,8760 ))
+
+        ind_t = ind_z.copy()
+        ind_t[ind_i]+=24
+
+        #TODO: beautify variable names
+        for year in years:
+
+            # select data for the respective year
+            ser_sel = series[ series.index.year == year]
+            info = (ser_sel).values
+
+
+
+            if isleapyear(year):
+                dummy_df[year] = info
+            else:
+                data = np.empty((total_hoy_leap), dtype=series.dtype)
+                data.fill(np.nan)
+
+                ser_sel = series[ series.index.year == year]
+                info = (ser_sel).values
+
+                data.put(ind_t, (series[ series.index.year == year]).values)
+
+                dummy_df[year] = data
+
+        res_df = dummy_df
+
+        #assign a datetime index, CAUTION: the year is definatly wrong!
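+        #(default_rng() below builds an hourly index for the dummy leap year
+        # 2012, so the month/day/hour positions line up with the leap-year
+        # layout while the year itself is only a placeholder)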
+        if dt_index:
+            rng = default_rng()
+            res_df = DataFrame(res_df.values, index=rng,
+                               columns=res_df.columns)
+
+        return res_df
+
+#TDOO: use pivot_annual for D & M and minute in the same fashion
+    if freq == 'D':
+        raise NotImplementedError(freq), "use pandas.tseries.util.pivot_annual"
+
+    if freq == 'M':
+        raise NotImplementedError(freq), "use pandas.tseries.util.pivot_annual"
+
+    else:
+        raise NotImplementedError(freq)
+
+
+    return res_df
+
+
+### timeseries pivoting helper
+
+def last_col2front(df, col_no=1):
+    """shifts the last column of a data frame to the front
+
+    increase col_no to shift more cols
+    """
+    cols = cols = df.columns.tolist()
+    #increase index value to 2+ if more columns are to be shifted
+    cols = cols[-col_no:] + cols[:-col_no]
+    df = df[cols]
+
+    return df
+
+
+def extended_info(df, time_cols=True, aggreg=True, aggreg_func=None,
+                  datetime_index=False):
+    """add extended information to a timeseries pivot
+    """
+
+    df_extended = df.copy()
+    #perform the following only on the data columns
+    cols = df_extended.columns
+    #TODO: add standard aggregation
+    #TODO: make function be set by argument
+    #TODO: is there no a SM describe function?
+    #TODO: Maybe use http://pandas.pydata.org/pandas-docs/dev/basics.html#summarizing-data-describe
+    if aggreg:
+
+        df_extended['mean'] = df_extended[cols].mean(1)
+        df_extended['sum'] = df_extended[cols].sum(1)
+        df_extended['min'] = df_extended[cols].min(1)
+        df_extended['max'] = df_extended[cols].max(1)
+        df_extended['max'] = df_extended[cols].std(1)
+
+    #add some metadata
+    #TODO: add function to make index a datetime with the argument above using the rng below
+    #TODO: convert the range to lower frequencies and reuse the function.
+    rng = default_rng()
+    df_extended['doy'] = rng.dayofyear
+#    df_extended = last_col2front(df_extended)
+    df_extended['month'] = rng.month
+#    df_extended = last_col2front(df_extended)
+    df_extended['day'] = rng.day
+#    df_extended = last_col2front(df_extended)
+    df_extended['hour'] = rng.hour + 1
+    df_extended = last_col2front(df_extended, col_no=4)
+
+    return df_extended
+
+###Timeseries convenience / helper functions
+
+
+def year_length(freq, leap=True):
+    """helper function for year length at different frequencies.
+    to be expanded
+    """
+
+    daysofyear_leap = 366
+    daysofyear_nonleap = 365
+
+    if freq == 'H':
+        if leap:
+            length = 24 * daysofyear_leap
+        else:
+            length = 24 * daysofyear_nonleap
+
+    return length
+
+def default_rng(freq='H', leap=True):
+    """create default ranges
+    """
+
+    if leap:
+        total_hoy_leap = (year_length(freq='H'))
+        rng = date_range('1/1/2012', periods=total_hoy_leap, freq='H')
+
+    else:
+        total_hoy_nonleap = (year_length(freq='H'))
+        rng = date_range('1/1/2011', periods=total_hoy_nonleap, freq='H')
+
+    return rng
\ No newline at end of file
diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py
index 02a98858ed808..1a445343390d5 100644
--- a/pandas/tseries/tests/test_util.py
+++ b/pandas/tseries/tests/test_util.py
@@ -7,11 +7,64 @@
 import pandas.util.testing as tm
 
 from pandas.tseries.util import pivot_annual, isleapyear
+from pandas.tseries import pivot
 
 
 class TestPivotAnnual(unittest.TestCase):
     """
     New pandas of scikits.timeseries pivot_annual
     """
+    def test_hourly(self):
+        rng_hourly = date_range('1/1/1994', periods=(18* 8760 + 4*24), freq='H')
+        data_hourly = np.random.randint(100, high=350, size=rng_hourly.size)
+        data_hourly = data_hourly.astype('float64')
+        ts_hourly = Series(data_hourly, index=rng_hourly)
+
+        annual = pivot.pivot_annual_h(ts_hourly, dt_index=True)
+
+        ### general
+        ##test first column: if first value and data are the same as first value of timeseries
+        #date
+        def get_mdh(DatetimeIndex, index):
+            #(m, d, h)
+            mdh_tuple = (DatetimeIndex.month[index], DatetimeIndex.day[index],
+                         DatetimeIndex.hour[index])
+            return mdh_tuple
+# ts_hourly.index.month[1], ts_hourly.index.month[1], ts_hourly.index.month[1]
+
+        assert get_mdh(ts_hourly.index, 1) == get_mdh(annual.index, 1)
+        #are the last dates of ts identical with the dates last row in the last column?
+        assert get_mdh(ts_hourly.index[-1]) == get_mdh(annual.index,
+                                                       (annual.index.size -1))
+        #first values of the ts identical with the first col and last row of the df?
+        assert ts_hourly[0] == annual.ix[1].values[0]
+        #last values of the ts identical with the last col and last row of the df?
+        assert ts_hourly[-1] == annual.ix[annual.index.size].values[-1]
+        ### index
+        ##test if index has the right length
+        assert annual.index[-1] == 8784
+        ##test last column: if first value and data are the same as first value of timeseries
+        ### leap
+        ##test leap offset
+        #leap year: 1996 - are the values of the ts and the
+        ser96_leap = ts_hourly[(ts_hourly.index.year == 1996) &
+                               (ts_hourly.index.month == 2) &
+                               (ts_hourly.index.day == 29)
+                               ]
+
+        df96 = annual[1996]
+        df96_leap = df96[(df96.index.month == 2) & (df96.index.day == 29)]
+        tm.assert_series_equal(ser96_leap, df96_leap)
+        #non-leap year: 1994 - are all values NaN for day 29.02?
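+        #(Feb. 29th spans 24 hourly rows in the leap-shaped index, so a
+        # length-24 array of NaN is the expected slice for a non-leap year)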
+        nan_arr = np.empty(24)
+        nan_arr.fill(np.nan)
+        df94 = annual[1994]
+        df94_noleap = df94[(df94.index.month == 2) & (df94.index.day == 29)]
+        np.testing.assert_equal(df94_noleap.values, nan_arr)
+        ### extended functionaliy
+
+
+
     def test_daily(self):
         rng = date_range('1/1/2000', '12/31/2004', freq='D')
         ts = Series(np.random.randn(len(rng)), index=rng)
@@ -33,6 +86,7 @@ def test_daily(self):
         leaps.index = leaps.index.year
         tm.assert_series_equal(annual[day].dropna(), leaps)
 
+
     def test_weekly(self):
         pass
 
diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py
index 4b29771233c50..9dc51cbe1e175 100644
--- a/pandas/tseries/util.py
+++ b/pandas/tseries/util.py
@@ -2,6 +2,7 @@
 
 from pandas.core.frame import DataFrame
 import pandas.core.nanops as nanops
+from pandas.tseries.util import isleapyear
 
 def pivot_annual(series, freq=None):
     """

From 6a173d792f300cde6c2356b0e57ae4b71a9d8660 Mon Sep 17 00:00:00 2001
From: TimMi
Date: Thu, 1 Nov 2012 17:44:41 +0100
Subject: [PATCH 2/5] added all the test for the pivot hourly

now the functionality is there, if OK, we could improve the docstrings
---
 pandas/tseries/pivot.py           | 11 ++++++-----
 pandas/tseries/tests/test_util.py | 14 +++++++++++++-
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/pandas/tseries/pivot.py b/pandas/tseries/pivot.py
index 50fd60b2a03ff..20009cbae546d 100644
--- a/pandas/tseries/pivot.py
+++ b/pandas/tseries/pivot.py
@@ -139,7 +139,7 @@ def last_col2front(df, col_no=1):
 
 
 def extended_info(df, time_cols=True, aggreg=True, aggreg_func=None,
-                  datetime_index=False):
+                  ):
     """add extended information to a timeseries pivot
     """
 
@@ -156,18 +156,19 @@ def extended_info(df, time_cols=True, aggreg=True, aggreg_func=None,
         df_extended['sum'] = df_extended[cols].sum(1)
         df_extended['min'] = df_extended[cols].min(1)
         df_extended['max'] = df_extended[cols].max(1)
-        df_extended['max'] = df_extended[cols].std(1)
+        df_extended['std'] = df_extended[cols].std(1)
+        #TODO: how to add more functions in flexible way? check other pandas functions
+        if aggreg_func:
+            df_extended['aggregated'] = df_extended[cols].aggreg_func(1)
 
     #add some metadata
     #TODO: add function to make index a datetime with the argument above using the rng below
    #TODO: convert the range to lower frequencies and reuse the function.
     rng = default_rng()
     df_extended['doy'] = rng.dayofyear
-#    df_extended = last_col2front(df_extended)
     df_extended['month'] = rng.month
-#    df_extended = last_col2front(df_extended)
     df_extended['day'] = rng.day
-#    df_extended = last_col2front(df_extended)
+    #add 1 to have hours formatted in "natural" and not programming counting
     df_extended['hour'] = rng.hour + 1
     df_extended = last_col2front(df_extended, col_no=4)
 
diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py
index 1a445343390d5..64409bc6bb527 100644
--- a/pandas/tseries/tests/test_util.py
+++ b/pandas/tseries/tests/test_util.py
@@ -61,9 +61,21 @@ def get_mdh(DatetimeIndex, index):
         df94_noleap = df94[(df94.index.month == 2) & (df94.index.day == 29)]
         np.testing.assert_equal(df94_noleap.values, nan_arr)
         ### extended functionaliy
+        ext = pivot.extended_info(annual)
+        ## descriptive statistics
+        #mean
+        tm.assert_frame_equal(annual.mean(1), ext['mean'])
+        tm.assert_frame_equal(annual.sum(1), ext['sum'])
+        tm.assert_frame_equal(annual.min(1), ext['min'])
+        tm.assert_frame_equal(annual.min(1), ext['max'])
+        tm.assert_frame_equal(annual.std(1), ext['std'])
+        ## additional time columns for easier filtering
+        np.testing.assert_equal(ext['doy'].values, annual.index.dayofyear)
+        np.testing.assert_equal(ext['day'].values, annual.index.day)
+        #the hour is incremented by 1
+        np.testing.assert_equal(ext['hour'].values, (annual.index.hour +1))
 
 
-
     def test_daily(self):
         rng = date_range('1/1/2000', '12/31/2004', freq='D')

From 6814408b0837cf97bf48997ebae2f7676352b02e Mon Sep 17 00:00:00 2001
From: timmie
Date: Tue, 6 Nov 2012 00:35:24 +0100
Subject: [PATCH 3/5] corrected an error in the extended info for df,
 simplified the df tests and corrected indices in tests

---
 pandas/tseries/pivot.py           | 11 +++++-----
 pandas/tseries/tests/test_util.py | 36 +++++++++++++++++--------------
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/pandas/tseries/pivot.py b/pandas/tseries/pivot.py
index 20009cbae546d..9792ef1f60550 100644
--- a/pandas/tseries/pivot.py
+++ b/pandas/tseries/pivot.py
@@ -139,7 +139,7 @@ def last_col2front(df, col_no=1):
 
 
 def extended_info(df, time_cols=True, aggreg=True, aggreg_func=None,
-                  ):
+                  datetime_index=False):
     """add extended information to a timeseries pivot
     """
 
@@ -157,18 +157,17 @@ def extended_info(df, time_cols=True, aggreg=True, aggreg_func=None,
         df_extended['min'] = df_extended[cols].min(1)
         df_extended['max'] = df_extended[cols].max(1)
         df_extended['std'] = df_extended[cols].std(1)
-        #TODO: how to add more functions in flexible way? check other pandas functions
-        if aggreg_func:
-            df_extended['aggregated'] = df_extended[cols].aggreg_func(1)
 
     #add some metadata
     #TODO: add function to make index a datetime with the argument above using the rng below
     #TODO: convert the range to lower frequencies and reuse the function.
     rng = default_rng()
     df_extended['doy'] = rng.dayofyear
+#    df_extended = last_col2front(df_extended)
     df_extended['month'] = rng.month
+#    df_extended = last_col2front(df_extended)
     df_extended['day'] = rng.day
-    #add 1 to have hours formatted in "natural" and not programming counting
+#    df_extended = last_col2front(df_extended)
     df_extended['hour'] = rng.hour + 1
     df_extended = last_col2front(df_extended, col_no=4)
 
@@ -205,4 +204,4 @@ def default_rng(freq='H', leap=True):
         total_hoy_nonleap = (year_length(freq='H'))
         rng = date_range('1/1/2011', periods=total_hoy_nonleap, freq='H')
 
-    return rng
\ No newline at end of file
+    return rng
diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py
index 5294b97f32de3..2548714fe76ec 100644
--- a/pandas/tseries/tests/test_util.py
+++ b/pandas/tseries/tests/test_util.py
@@ -12,9 +12,10 @@
 from pandas.tseries.util import pivot_annual, isleapyear
 from pandas.tseries import pivot
 
-class TestPivotAnnual(unittest.TestCase):
+
+class TestPivotAnnualHourly(unittest.TestCase):
     """
-    New pandas of scikits.timeseries pivot_annual
+    New pandas of scikits.timeseries pivot_annual for hourly with a new shape
     """
     def test_hourly(self):
         rng_hourly = date_range('1/1/1994', periods=(18* 8760 + 4*24), freq='H')
         data_hourly = np.random.randint(100, high=350, size=rng_hourly.size)
         data_hourly = data_hourly.astype('float64')
         ts_hourly = Series(data_hourly, index=rng_hourly)
@@ -36,15 +37,14 @@ def get_mdh(DatetimeIndex, index):
 
         assert get_mdh(ts_hourly.index, 1) == get_mdh(annual.index, 1)
         #are the last dates of ts identical with the dates last row in the last column?
-        assert get_mdh(ts_hourly.index[-1]) == get_mdh(annual.index,
-                                                       (annual.index.size -1))
-        #first values of the ts identical with the first col and last row of the df?
-        assert ts_hourly[0] == annual.ix[1].values[0]
+        assert get_mdh(ts_hourly.index, -1) == get_mdh(annual.index, (annual.index.size -1))
+        #first values of the ts identical with the first col?
+        assert ts_hourly[0] == annual.ix[0].values[0]
         #last values of the ts identical with the last col and last row of the df?
-        assert ts_hourly[-1] == annual.ix[annual.index.size].values[-1]
-        ### index
+        assert ts_hourly[-1] == annual.ix[-1].values[-1]
+        #### index
         ##test if index has the right length
-        assert annual.index[-1] == 8784
+        assert annual.index.size == 8784
         ##test last column: if first value and data are the same as first value of timeseries
         ### leap
         ##test leap offset
@@ -56,7 +56,7 @@ def get_mdh(DatetimeIndex, index):
 
         df96 = annual[1996]
         df96_leap = df96[(df96.index.month == 2) & (df96.index.day == 29)]
-        tm.assert_series_equal(ser96_leap, df96_leap)
+        np.testing.assert_equal(ser96_leap.values, df96_leap.values)
         #non-leap year: 1994 - are all values NaN for day 29.02?
         nan_arr = np.empty(24)
         nan_arr.fill(np.nan)
         df94 = annual[1994]
         df94_noleap = df94[(df94.index.month == 2) & (df94.index.day == 29)]
         np.testing.assert_equal(df94_noleap.values, nan_arr)
@@ -67,19 +67,23 @@ def get_mdh(DatetimeIndex, index):
         ext = pivot.extended_info(annual)
         ## descriptive statistics
         #mean
-        tm.assert_frame_equal(annual.mean(1), ext['mean'])
-        tm.assert_frame_equal(annual.sum(1), ext['sum'])
-        tm.assert_frame_equal(annual.min(1), ext['min'])
-        tm.assert_frame_equal(annual.min(1), ext['max'])
-        tm.assert_frame_equal(annual.std(1), ext['std'])
+        np.testing.assert_equal(annual.mean(1).values, ext['mean'].values)
+        np.testing.assert_equal(annual.sum(1).values, ext['sum'].values)
+        np.testing.assert_equal(annual.min(1).values, ext['min'].values)
+        np.testing.assert_equal(annual.max(1).values, ext['max'].values)
+        np.testing.assert_equal(annual.std(1).values, ext['std'].values)
         ## additional time columns for easier filtering
         np.testing.assert_equal(ext['doy'].values, annual.index.dayofyear)
         np.testing.assert_equal(ext['day'].values, annual.index.day)
         #the hour is incremented by 1
         np.testing.assert_equal(ext['hour'].values, (annual.index.hour +1))
 
 
-
+
+class TestPivotAnnual(unittest.TestCase):
+    """
+    New pandas of scikits.timeseries pivot_annual
+    """
     def test_daily(self):
         rng = date_range('1/1/2000', '12/31/2004', freq='D')
         ts = Series(np.random.randn(len(rng)), index=rng)

From ed0659f3cba74e79a5932201f9c8b266e6d4539f Mon Sep 17 00:00:00 2001
From: TimMi
Date: Wed, 7 Nov 2012 14:20:21 +0100
Subject: [PATCH 4/5] minor restructuring of the code

---
 pandas/tseries/pivot.py | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/pandas/tseries/pivot.py b/pandas/tseries/pivot.py
index 9792ef1f60550..6249fa1fa12b0 100644
--- a/pandas/tseries/pivot.py
+++ b/pandas/tseries/pivot.py
@@ -66,44 +66,43 @@ def pivot_annual_h(series, freq=None, dt_index=False):
         dummy_df = DataFrame(values, index=hoy_leap_list,
                              columns=years)
 
-        ##get offset for leap hours
-
+        ##prepare the index for inserting the values into the result dataframe
+        #get offset for leap hours
         #see:
         #http://stackoverflow.com/questions/2004364/increment-numpy-array-with-repeated-indices
         #1994-02-28 23:00:00 -> index 1415
-        ind_z = np.array(range(0, 8760))
-        ind_i = np.array(range(1416,8760 ))
+        index_nonleap = np.array(range(0, 8760))
+        index_leapshift = np.array(range(1416,8760 ))
 
-        ind_t = ind_z.copy()
-        ind_t[ind_i]+=24
+        index_incl_leap = index_nonleap.copy()
+        #shift index by 24 (hours) for leap
+        index_incl_leap[index_leapshift]+=24
 
-        #TODO: beautify variable names
+        # select data for the respective year
         for year in years:
 
-            # select data for the respective year
-            ser_sel = series[ series.index.year == year]
-            info = (ser_sel).values
-
-
-
+            #select the data for the respective year
+            series_year = series[ series.index.year == year]
+            #create a array with the values for the respecive year
+            values = (series_year).values
 
             if isleapyear(year):
-                dummy_df[year] = info
+                dummy_df[year] = values
             else:
-                data = np.empty((total_hoy_leap), dtype=series.dtype)
-                data.fill(np.nan)
+                #dummy array to be filled with non-leap values
+                dummy_array = np.empty((total_hoy_leap), dtype=series.dtype)
+                dummy_array.fill(np.nan)
 
-                ser_sel = series[ series.index.year == year]
-                info = (ser_sel).values
+                #fill dummy array with values leaving the leap day
+                dummy_array.put(index_incl_leap, values)
 
-                data.put(ind_t, (series[ series.index.year == year]).values)
-
-                dummy_df[year] = data
+                dummy_df[year] = dummy_array
 
         res_df = dummy_df
 
         #assign a datetime index, CAUTION: the year is definatly wrong!
         if dt_index:
-            rng = default_rng()
+            rng = default_rng(freq='H', leap=True)
             res_df = DataFrame(res_df.values, index=rng,
                                columns=res_df.columns)

From a997751552ae2a270caede5ee55054cef99b210d Mon Sep 17 00:00:00 2001
From: TimMi
Date: Wed, 7 Nov 2012 16:28:17 +0100
Subject: [PATCH 5/5] additional minor edits

---
 pandas/tseries/pivot.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/tseries/pivot.py b/pandas/tseries/pivot.py
index 6249fa1fa12b0..632e6bdab4324 100644
--- a/pandas/tseries/pivot.py
+++ b/pandas/tseries/pivot.py
@@ -59,10 +59,10 @@ def pivot_annual_h(series, freq=None, dt_index=False):
         hoy_leap_list = range(1, (total_hoy_leap + 1 ))
 
 
-
+        #create a array template
         values = np.empty((total_hoy_leap, len(years)), dtype=series.dtype)
         values.fill(np.nan)
-
+        #create a df to receive the resulting data
         dummy_df = DataFrame(values, index=hoy_leap_list,
                              columns=years)
 
@@ -100,7 +100,7 @@ def pivot_annual_h(series, freq=None, dt_index=False):
 
         res_df = dummy_df
 
-        #assign a datetime index, CAUTION: the year is definatly wrong!
+        #assign a pseudo datetime index , CAUTION: the year is definitely wrong!
         if dt_index:
             rng = default_rng(freq='H', leap=True)
             res_df = DataFrame(res_df.values, index=rng,
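
Taken together, the helpers added above are meant to be used roughly as in the
following minimal sketch, distilled from test_hourly in this series. It assumes
the whole patch series is applied so that pandas.tseries.pivot is importable;
the random input data and variable names are purely illustrative.

import numpy as np
from pandas import Series
from pandas.tseries.index import date_range
from pandas.tseries import pivot

# hourly series covering 1994-1997, i.e. three non-leap years and the
# leap year 1996 (4 * 8760 + 24 hours in total)
rng = date_range('1/1/1994', periods=4 * 8760 + 24, freq='H')
ts = Series(np.random.randn(len(rng)), index=rng)

# one column per year and 8784 rows (366 * 24); the 24 rows that belong
# to Feb. 29th hold NaN in the non-leap year columns
annual = pivot.pivot_annual_h(ts, freq='H', dt_index=True)

# append mean/sum/min/max/std aggregates plus doy/month/day/hour columns
# for easier filtering
ext = pivot.extended_info(annual)

print(annual.shape)     # (8784, 4)
print(ext.columns[:4])  # doy/month/day/hour, moved to the front by last_col2front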