From 4d7e1cfaeb6b3c60f90d77b2f7614f8fd58c2a8e Mon Sep 17 00:00:00 2001 From: reidy-p Date: Tue, 19 Jun 2018 21:25:13 +0100 Subject: [PATCH 1/4] BUG: first/last lose timezone in groupby --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/groupby/groupby.py | 5 +++- pandas/tests/groupby/test_nth.py | 44 +++++++++++++++++++++++++++++++- 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 4bfae7de01b8f..23978e0703fe2 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -225,7 +225,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :func:`pandas.core.groupby.first` and :func:`pandas.core.groupby.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) - - diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3bc59157055ce..01cda89cee35d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4740,7 +4740,10 @@ def _wrap_transformed_output(self, output, names=None): def _wrap_agged_blocks(self, items, blocks): if not self.as_index: - index = np.arange(blocks[0].values.shape[1]) + if blocks[0].values.ndim > 1: + index = np.arange(blocks[0].values.shape[1]) + else: + index = np.arange(blocks[0].values.shape[0]) mgr = BlockManager(blocks, [items, index]) result = DataFrame(mgr) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index a32ba9ad76f14..1817ef99f6acf 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -1,11 +1,12 @@ import numpy as np import pandas as pd -from pandas import DataFrame, MultiIndex, Index, Series, isna +from pandas import DataFrame, MultiIndex, Index, Series, isna, Timestamp from pandas.compat import lrange from pandas.util.testing import ( assert_frame_equal, assert_produces_warning, assert_series_equal) +import pytest def test_first_last_nth(df): @@ -219,6 +220,47 @@ def test_nth_multi_index(three_group): assert_frame_equal(result, expected) +@pytest.mark.parametrize('data, expected_first, expected_last', [ + ({'id': ['A'], 'time': Timestamp('2012-02-01 14:00:00', + tz='US/Central')}, + {'id': ['A'], 'time': Timestamp('2012-02-01 14:00:00', + tz='US/Central')}, + {'id': ['A'], 'time': Timestamp('2012-02-01 14:00:00', + tz='US/Central')}), + ({'id': ['A', 'B', 'A'], + 'time': [Timestamp('2012-01-01 13:00:00', + tz='America/New_York'), + Timestamp('2012-02-01 14:00:00', + tz='US/Central'), + Timestamp('2012-03-01 12:00:00', + tz='Europe/London')]}, + {'id': ['A', 'B'], + 'time': [Timestamp('2012-01-01 13:00:00', + tz='America/New_York'), + Timestamp('2012-02-01 14:00:00', + tz='US/Central')]}, + {'id': ['A', 'B'], + 'time': [Timestamp('2012-03-01 12:00:00', + tz='Europe/London'), + Timestamp('2012-02-01 14:00:00', + tz='US/Central')]}) +]) +def test_first_last_tz(data, expected_first, expected_last): + # GH15884 + # Test that the timezone is retained when calling first + # or last on groupby with as_index=False + + df = DataFrame(data) + + result = df.groupby('id', as_index=False).first() + expected = DataFrame(expected_first) + assert_frame_equal(result, expected) + + result = df.groupby('id', as_index=False).last() + expected = DataFrame(expected_last) + assert_frame_equal(result, expected) + + def test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex From f46ce84dfab126fa506e415a88e9a2047b11c5ad Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 22 Jun 2018 06:29:16 -0400 Subject: [PATCH 2/4] simplify --- pandas/core/groupby/groupby.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 01cda89cee35d..0bbdfbbe52ac4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4740,10 +4740,7 @@ def _wrap_transformed_output(self, output, names=None): def _wrap_agged_blocks(self, items, blocks): if not self.as_index: - if blocks[0].values.ndim > 1: - index = np.arange(blocks[0].values.shape[1]) - else: - index = np.arange(blocks[0].values.shape[0]) + index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, [items, index]) result = DataFrame(mgr) From 7408aabd11059608c7949ec1e4a10ebb9fc024a2 Mon Sep 17 00:00:00 2001 From: reidy-p Date: Fri, 22 Jun 2018 15:55:15 +0100 Subject: [PATCH 3/4] Fix whatsnew and add tests --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/tests/groupby/test_nth.py | 38 +++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 23978e0703fe2..3c3f6358d6579 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -225,7 +225,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :func:`pandas.core.groupby.first` and :func:`pandas.core.groupby.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) +- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) - - diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 1817ef99f6acf..da56e5c09b0a7 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -221,29 +221,38 @@ def test_nth_multi_index(three_group): @pytest.mark.parametrize('data, expected_first, expected_last', [ - ({'id': ['A'], 'time': Timestamp('2012-02-01 14:00:00', - tz='US/Central')}, - {'id': ['A'], 'time': Timestamp('2012-02-01 14:00:00', - tz='US/Central')}, - {'id': ['A'], 'time': Timestamp('2012-02-01 14:00:00', - tz='US/Central')}), + ({'id': ['A'], + 'time': Timestamp('2012-02-01 14:00:00', + tz='US/Central'), + 'foo': [1]}, + {'id': ['A'], + 'time': Timestamp('2012-02-01 14:00:00', + tz='US/Central'), + 'foo': [1]}, + {'id': ['A'], + 'time': Timestamp('2012-02-01 14:00:00', + tz='US/Central'), + 'foo': [1]}), ({'id': ['A', 'B', 'A'], 'time': [Timestamp('2012-01-01 13:00:00', tz='America/New_York'), Timestamp('2012-02-01 14:00:00', tz='US/Central'), Timestamp('2012-03-01 12:00:00', - tz='Europe/London')]}, + tz='Europe/London')], + 'foo': [1, 2, 3]}, {'id': ['A', 'B'], 'time': [Timestamp('2012-01-01 13:00:00', tz='America/New_York'), Timestamp('2012-02-01 14:00:00', - tz='US/Central')]}, + tz='US/Central')], + 'foo': [1, 2]}, {'id': ['A', 'B'], 'time': [Timestamp('2012-03-01 12:00:00', tz='Europe/London'), Timestamp('2012-02-01 14:00:00', - tz='US/Central')]}) + tz='US/Central')], + 'foo': [3, 2]}) ]) def test_first_last_tz(data, expected_first, expected_last): # GH15884 @@ -254,12 +263,19 @@ def test_first_last_tz(data, expected_first, expected_last): result = df.groupby('id', as_index=False).first() expected = DataFrame(expected_first) - assert_frame_equal(result, expected) + cols = ['id', 'time', 'foo'] + assert_frame_equal(result[cols], expected[cols]) + + result = df.groupby('id', as_index=False)['time'].first() + assert_frame_equal(result, expected[['id', 'time']]) result = df.groupby('id', as_index=False).last() expected = DataFrame(expected_last) - assert_frame_equal(result, expected) + cols = ['id', 'time', 'foo'] + assert_frame_equal(result[cols], expected[cols]) + result = df.groupby('id', as_index=False)['time'].last() + assert_frame_equal(result, expected[['id', 'time']]) def test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 From ef5d5b180a6c9eb2abbfc40b731ba84368266bcf Mon Sep 17 00:00:00 2001 From: reidy-p Date: Fri, 22 Jun 2018 18:15:44 +0100 Subject: [PATCH 4/4] lint --- pandas/tests/groupby/test_nth.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index da56e5c09b0a7..a1b748cd50e8f 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -221,15 +221,15 @@ def test_nth_multi_index(three_group): @pytest.mark.parametrize('data, expected_first, expected_last', [ - ({'id': ['A'], + ({'id': ['A'], 'time': Timestamp('2012-02-01 14:00:00', tz='US/Central'), 'foo': [1]}, - {'id': ['A'], + {'id': ['A'], 'time': Timestamp('2012-02-01 14:00:00', tz='US/Central'), 'foo': [1]}, - {'id': ['A'], + {'id': ['A'], 'time': Timestamp('2012-02-01 14:00:00', tz='US/Central'), 'foo': [1]}), @@ -277,6 +277,7 @@ def test_first_last_tz(data, expected_first, expected_last): result = df.groupby('id', as_index=False)['time'].last() assert_frame_equal(result, expected[['id', 'time']]) + def test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex