pandas/tests/resample/test_resample_api.py

from collections import OrderedDict
from datetime import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Series
from pandas.core.indexes.datetimes import date_range
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal

dti = date_range(start=datetime(2005, 1, 1),
                 end=datetime(2005, 1, 10), freq='Min')

test_series = Series(np.random.rand(len(dti)), dti)
_test_frame = DataFrame(
    {'A': test_series, 'B': test_series, 'C': np.arange(len(dti))})


@pytest.fixture
def test_frame():
    return _test_frame.copy()


def test_str():

    r = test_series.resample('H')
    assert ('DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, '
            'label=left, convention=start, base=0]' in str(r))


def test_api():

    r = test_series.resample('H')
    result = r.mean()
    assert isinstance(result, Series)
    assert len(result) == 217

    r = test_series.to_frame().resample('H')
    result = r.mean()
    assert isinstance(result, DataFrame)
    assert len(result) == 217


def test_groupby_resample_api():

    # GH 12448
    # .groupby(...).resample(...) hitting warnings
    # when appropriate
    df = DataFrame({'date': pd.date_range(start='2016-01-01',
                                          periods=4,
                                          freq='W'),
                    'group': [1, 1, 2, 2],
                    'val': [5, 6, 7, 8]}).set_index('date')

    # replication step
    i = pd.date_range('2016-01-03', periods=8).tolist() + \
        pd.date_range('2016-01-17', periods=8).tolist()
    index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i],
                                      names=['group', 'date'])
    expected = DataFrame({'val': [5] * 7 + [6] + [7] * 7 + [8]},
                         index=index)
    result = df.groupby('group').apply(
        lambda x: x.resample('1D').ffill())[['val']]
    assert_frame_equal(result, expected)


def test_groupby_resample_on_api():

    # GH 15021
    # .groupby(...).resample(on=...) results in an unexpected
    # keyword warning.
    df = DataFrame({'key': ['A', 'B'] * 5,
                    'dates': pd.date_range('2016-01-01', periods=10),
                    'values': np.random.randn(10)})

    expected = df.set_index('dates').groupby('key').resample('D').mean()

    result = df.groupby('key').resample('D', on='dates').mean()
    assert_frame_equal(result, expected)


def test_pipe(test_frame):
    # GH17905

    # series
    r = test_series.resample('H')
    expected = r.max() - r.mean()
    result = r.pipe(lambda x: x.max() - x.mean())
    tm.assert_series_equal(result, expected)

    # dataframe
    r = test_frame.resample('H')
    expected = r.max() - r.mean()
    result = r.pipe(lambda x: x.max() - x.mean())
    tm.assert_frame_equal(result, expected)


def test_getitem(test_frame):

    r = test_frame.resample('H')
    tm.assert_index_equal(r._selected_obj.columns, test_frame.columns)

    r = test_frame.resample('H')['B']
    assert r._selected_obj.name == test_frame.columns[1]

    # technically this is allowed
    r = test_frame.resample('H')['A', 'B']
    tm.assert_index_equal(r._selected_obj.columns,
                          test_frame.columns[[0, 1]])

    r = test_frame.resample('H')['A', 'B']
    tm.assert_index_equal(r._selected_obj.columns,
                          test_frame.columns[[0, 1]])


@pytest.mark.parametrize('key', [['D'], ['A', 'D']])
def test_select_bad_cols(key, test_frame):
    g = test_frame.resample('H')
    # 'A' should not be referenced as a bad column...
    # will have to rethink regex if you change message!
    msg = r"^\"Columns not found: 'D'\"$"
    with pytest.raises(KeyError, match=msg):
        g[key]


def test_attribute_access(test_frame):

    r = test_frame.resample('H')
    tm.assert_series_equal(r.A.sum(), r['A'].sum())


def test_api_compat_before_use():

    # make sure that we are setting the binner
    # on these attributes
    for attr in ['groups', 'ngroups', 'indices']:
        rng = pd.date_range('1/1/2012', periods=100, freq='S')
        ts = Series(np.arange(len(rng)), index=rng)
        rs = ts.resample('30s')

        # before use
        getattr(rs, attr)

        # after grouper is initialized is ok
        rs.mean()
        getattr(rs, attr)


def tests_skip_nuisance(test_frame):

    df = test_frame
    df['D'] = 'foo'
    r = df.resample('H')
    result = r[['A', 'B']].sum()
    expected = pd.concat([r.A.sum(), r.B.sum()], axis=1)
    assert_frame_equal(result, expected)

    expected = r[['A', 'B', 'C']].sum()
    result = r.sum()
    assert_frame_equal(result, expected)


def test_downsample_but_actually_upsampling():

    # this is reindex / asfreq
    rng = pd.date_range('1/1/2012', periods=100, freq='S')
    ts = Series(np.arange(len(rng), dtype='int64'), index=rng)
    result = ts.resample('20s').asfreq()
    expected = Series([0, 20, 40, 60, 80],
                      index=pd.date_range('2012-01-01 00:00:00',
                                          freq='20s',
                                          periods=5))
    assert_series_equal(result, expected)


def test_combined_up_downsampling_of_irregular():

    # since we are reallydoing an operation like this
    # ts2.resample('2s').mean().ffill()
    # preserve these semantics

    rng = pd.date_range('1/1/2012', periods=100, freq='S')
    ts = Series(np.arange(len(rng)), index=rng)
    ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]]

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result = ts2.resample('2s', how='mean', fill_method='ffill')
    expected = ts2.resample('2s').mean().ffill()
    assert_series_equal(result, expected)


def test_transform():

    r = test_series.resample('20min')
    expected = test_series.groupby(
        pd.Grouper(freq='20min')).transform('mean')
    result = r.transform('mean')
    assert_series_equal(result, expected)


def test_fillna():

    # need to upsample here
    rng = pd.date_range('1/1/2012', periods=10, freq='2S')
    ts = Series(np.arange(len(rng), dtype='int64'), index=rng)
    r = ts.resample('s')

    expected = r.ffill()
    result = r.fillna(method='ffill')
    assert_series_equal(result, expected)

    expected = r.bfill()
    result = r.fillna(method='bfill')
    assert_series_equal(result, expected)

    msg = (r"Invalid fill method\. Expecting pad \(ffill\), backfill"
           r" \(bfill\) or nearest\. Got 0")
    with pytest.raises(ValueError, match=msg):
        r.fillna(0)


def test_apply_without_aggregation():

    # both resample and groupby should work w/o aggregation
    r = test_series.resample('20min')
    g = test_series.groupby(pd.Grouper(freq='20min'))

    for t in [g, r]:
        result = t.apply(lambda x: x)
        assert_series_equal(result, test_series)


def test_agg_consistency():

    # make sure that we are consistent across
    # similar aggregations with and w/o selection list
    df = DataFrame(np.random.randn(1000, 3),
                   index=pd.date_range('1/1/2012', freq='S', periods=1000),
                   columns=['A', 'B', 'C'])

    r = df.resample('3T')

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'})
        result = r.agg({'r1': 'mean', 'r2': 'sum'})
    assert_frame_equal(result, expected)

# TODO: once GH 14008 is fixed, move these tests into
# `Base` test class


def test_agg():
    # test with all three Resampler apis and TimeGrouper

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1),
                       datetime(2005, 1, 10), freq='D')
    index.name = 'date'
    df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=['index', 'date'])
    r = df.resample('2D')
    cases = [
        r,
        df_col.resample('2D', on='date'),
        df_mult.resample('2D', level='date'),
        df.groupby(pd.Grouper(freq='2D'))
    ]

    a_mean = r['A'].mean()
    a_std = r['A'].std()
    a_sum = r['A'].sum()
    b_mean = r['B'].mean()
    b_std = r['B'].std()
    b_sum = r['B'].sum()

    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_product([['A', 'B'],
                                                   ['mean', 'std']])
    for t in cases:
        result = t.aggregate([np.mean, np.std])
        assert_frame_equal(result, expected)

    expected = pd.concat([a_mean, b_std], axis=1)
    for t in cases:
        result = t.aggregate({'A': np.mean,
                              'B': np.std})
        assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([a_mean, a_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
                                                  ('A', 'std')])
    for t in cases:
        result = t.aggregate({'A': ['mean', 'std']})
        assert_frame_equal(result, expected)

    expected = pd.concat([a_mean, a_sum], axis=1)
    expected.columns = ['mean', 'sum']
    for t in cases:
        result = t['A'].aggregate(['mean', 'sum'])
    assert_frame_equal(result, expected)

    expected = pd.concat([a_mean, a_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
                                                  ('A', 'sum')])
    for t in cases:
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}})
        assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
                                                  ('A', 'sum'),
                                                  ('B', 'mean2'),
                                                  ('B', 'sum2')])
    for t in cases:
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'},
                                  'B': {'mean2': 'mean', 'sum2': 'sum'}})
        assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
                                                  ('A', 'std'),
                                                  ('B', 'mean'),
                                                  ('B', 'std')])
    for t in cases:
        result = t.aggregate({'A': ['mean', 'std'],
                              'B': ['mean', 'std']})
        assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('r1', 'A', 'mean'),
                                                  ('r1', 'A', 'sum'),
                                                  ('r2', 'B', 'mean'),
                                                  ('r2', 'B', 'sum')])


def test_agg_misc():
    # test with all three Resampler apis and TimeGrouper

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1),
                       datetime(2005, 1, 10), freq='D')
    index.name = 'date'
    df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=['index', 'date'])

    r = df.resample('2D')
    cases = [
        r,
        df_col.resample('2D', on='date'),
        df_mult.resample('2D', level='date'),
        df.groupby(pd.Grouper(freq='2D'))
    ]

    # passed lambda
    for t in cases:
        result = t.agg({'A': np.sum,
                        'B': lambda x: np.std(x, ddof=1)})
        rcustom = t['B'].apply(lambda x: np.std(x, ddof=1))
        expected = pd.concat([r['A'].sum(), rcustom], axis=1)
        assert_frame_equal(result, expected, check_like=True)

    # agg with renamers
    expected = pd.concat([t['A'].sum(),
                          t['B'].sum(),
                          t['A'].mean(),
                          t['B'].mean()],
                         axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('result1', 'A'),
                                                  ('result1', 'B'),
                                                  ('result2', 'A'),
                                                  ('result2', 'B')])

    for t in cases:
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum),
                                                    ('result2', np.mean)]))
        assert_frame_equal(result, expected, check_like=True)

    # agg with different hows
    expected = pd.concat([t['A'].sum(),
                          t['A'].std(),
                          t['B'].mean(),
                          t['B'].std()],
                         axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
                                                  ('A', 'std'),
                                                  ('B', 'mean'),
                                                  ('B', 'std')])
    for t in cases:
        result = t.agg(OrderedDict([('A', ['sum', 'std']),
                                    ('B', ['mean', 'std'])]))
        assert_frame_equal(result, expected, check_like=True)

    # equivalent of using a selection list / or not
    for t in cases:
        result = t[['A', 'B']].agg({'A': ['sum', 'std'],
                                    'B': ['mean', 'std']})
        assert_frame_equal(result, expected, check_like=True)

    # series like aggs
    for t in cases:
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t['A'].agg({'A': ['sum', 'std']})
        expected = pd.concat([t['A'].sum(),
                              t['A'].std()],
                             axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
                                                      ('A', 'std')])
        assert_frame_equal(result, expected, check_like=True)

        expected = pd.concat([t['A'].agg(['sum', 'std']),
                              t['A'].agg(['mean', 'std'])],
                             axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
                                                      ('A', 'std'),
                                                      ('B', 'mean'),
                                                      ('B', 'std')])
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t['A'].agg({'A': ['sum', 'std'],
                                 'B': ['mean', 'std']})
        assert_frame_equal(result, expected, check_like=True)

    # errors
    # invalid names in the agg specification
    msg = "\"Column 'B' does not exist!\""
    for t in cases:
        with pytest.raises(KeyError, match=msg):
            t[['A']].agg({'A': ['sum', 'std'],
                          'B': ['mean', 'std']})


def test_agg_nested_dicts():

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1),
                       datetime(2005, 1, 10), freq='D')
    index.name = 'date'
    df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=['index', 'date'])
    r = df.resample('2D')
    cases = [
        r,
        df_col.resample('2D', on='date'),
        df_mult.resample('2D', level='date'),
        df.groupby(pd.Grouper(freq='2D'))
    ]

    msg = r"cannot perform renaming for r(1|2) with a nested dictionary"
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.aggregate({'r1': {'A': ['mean', 'sum']},
                         'r2': {'B': ['mean', 'sum']}})

    for t in cases:
        expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(),
                              t['B'].std()], axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
            'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t[['A', 'B']].agg({'A': {'ra': ['mean', 'std']},
                                        'B': {'rb': ['mean', 'std']}})
        assert_frame_equal(result, expected, check_like=True)

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = t.agg({'A': {'ra': ['mean', 'std']},
                            'B': {'rb': ['mean', 'std']}})
        assert_frame_equal(result, expected, check_like=True)


def test_try_aggregate_non_existing_column():
    # GH 16766
    data = [
        {'dt': datetime(2017, 6, 1, 0), 'x': 1.0, 'y': 2.0},
        {'dt': datetime(2017, 6, 1, 1), 'x': 2.0, 'y': 2.0},
        {'dt': datetime(2017, 6, 1, 2), 'x': 3.0, 'y': 1.5}
    ]
    df = DataFrame(data).set_index('dt')

    # Error as we don't have 'z' column
    msg = "\"Column 'z' does not exist!\""
    with pytest.raises(KeyError, match=msg):
        df.resample('30T').agg({'x': ['mean'],
                                'y': ['median'],
                                'z': ['sum']})


def test_selection_api_validation():
    # GH 13500
    index = date_range(datetime(2005, 1, 1),
                       datetime(2005, 1, 10), freq='D')

    rng = np.arange(len(index), dtype=np.int64)
    df = DataFrame({'date': index, 'a': rng},
                   index=pd.MultiIndex.from_arrays([rng, index],
                                                   names=['v', 'd']))
    df_exp = DataFrame({'a': rng}, index=index)

    # non DatetimeIndex
    msg = ("Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex,"
           " but got an instance of 'Int64Index'")
    with pytest.raises(TypeError, match=msg):
        df.resample('2D', level='v')

    msg = "The Grouper cannot specify both a key and a level!"
    with pytest.raises(ValueError, match=msg):
        df.resample('2D', on='date', level='d')

    msg = "unhashable type: 'list'"
    with pytest.raises(TypeError, match=msg):
        df.resample('2D', on=['a', 'date'])

    msg = r"\"Level \['a', 'date'\] not found\""
    with pytest.raises(KeyError, match=msg):
        df.resample('2D', level=['a', 'date'])

    # upsampling not allowed
    msg = ("Upsampling from level= or on= selection is not supported, use"
           r" \.set_index\(\.\.\.\) to explicitly set index to datetime-like")
    with pytest.raises(ValueError, match=msg):
        df.resample('2D', level='d').asfreq()
    with pytest.raises(ValueError, match=msg):
        df.resample('2D', on='date').asfreq()

    exp = df_exp.resample('2D').sum()
    exp.index.name = 'date'
    assert_frame_equal(exp, df.resample('2D', on='date').sum())

    exp.index.name = 'd'
    assert_frame_equal(exp, df.resample('2D', level='d').sum())


@pytest.mark.parametrize('col_name', ['t2', 't2x', 't2q', 'T_2M',
                                      't2p', 't2m', 't2m1', 'T2M'])
def test_agg_with_datetime_index_list_agg_func(col_name):
    # GH 22660
    # The parametrized column names would get converted to dates by our
    # date parser. Some would result in OutOfBoundsError (ValueError) while
    # others would result in OverflowError when passed into Timestamp.
    # We catch these errors and move on to the correct branch.
    df = pd.DataFrame(list(range(200)),
                      index=pd.date_range(start='2017-01-01', freq='15min',
                                          periods=200, tz='Europe/Berlin'),
                      columns=[col_name])
    result = df.resample('1d').aggregate(['mean'])
    expected = pd.DataFrame([47.5, 143.5, 195.5],
                            index=pd.date_range(start='2017-01-01', freq='D',
                                                periods=3, tz='Europe/Berlin'),
                            columns=pd.MultiIndex(levels=[[col_name],
                                                          ['mean']],
                                                  codes=[[0], [0]]))
    assert_frame_equal(result, expected)