From 155fc90e2d8fa68bc97724ee642c5a7c22c34cd3 Mon Sep 17 00:00:00 2001
From: Paul Mannino
Date: Wed, 17 Jan 2018 22:25:12 -0600
Subject: [PATCH 1/4] BUG: Concatenation of TZ-aware dataframes (#12396) (#18447)

---
 doc/source/whatsnew/v0.23.0.txt     |   2 +
 pandas/core/indexes/base.py         |  10 +-
 pandas/core/internals.py            |   6 +-
 pandas/tests/reshape/test_concat.py | 155 ++++++++++++++++++++++++++++
 4 files changed, 167 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index ffa4f1068f84d..d7fd17db48600 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1237,6 +1237,8 @@ Reshaping
 - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
 - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
 - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
+- Bug in :func:`concat` which raises an error when concatenating TZ-aware dataframes and all-NaT dataframes (:issue:`12396`)
+- Bug in :func:`concat` which raises an error when concatenating empty TZ-aware series (:issue:`18447`)
 
 Other
 ^^^^^
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 2ceec1592d49b..683f0ea795bd7 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -2170,17 +2170,19 @@ def _assert_take_fillable(self, values, indices, allow_fill=True,
                               fill_value=None, na_value=np.nan):
         """ Internal method to handle NA filling of take """
         indices = _ensure_platform_int(indices)
-
         # only fill if we are passing a non-None fill_value
         if allow_fill and fill_value is not None:
             if (indices < -1).any():
                 msg = ('When allow_fill=True and fill_value is not None, '
                        'all indices must be >= -1')
                 raise ValueError(msg)
-            taken = values.take(indices)
             mask = indices == -1
-            if mask.any():
-                taken[mask] = na_value
+            if mask.all():
+                taken = np.full(indices.shape, fill_value=na_value)
+            else:
+                taken = values.take(indices)
+                if mask.any():
+                    taken[mask] = na_value
         else:
             taken = values.take(indices)
         return taken
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index a266ea620bd9f..a54dac099b037 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -5815,8 +5815,10 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
             if len(values) and values[0] is None:
                 fill_value = None
 
-            if getattr(self.block, 'is_datetimetz', False):
-                pass
+            if getattr(self.block, 'is_datetimetz', False) or \
+                    is_datetimetz(empty_dtype):
+                missing_arr = np.full(np.prod(self.shape), fill_value)
+                return DatetimeIndex(missing_arr, dtype=empty_dtype)
             elif getattr(self.block, 'is_categorical', False):
                 pass
             elif getattr(self.block, 'is_sparse', False):
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index 640d09f3587fb..5a5163eb7d5e1 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -1865,6 +1865,135 @@ def test_concat_tz_series_tzlocal(self):
         tm.assert_series_equal(result, pd.Series(x + y))
         assert result.dtype == 'datetime64[ns, tzlocal()]'
 
+    def test_concat_NaT_dataframes_all_NaT_axis_0(self):
+        # GH 12396
+
+        # tz-naive
+        first = pd.DataFrame([[pd.NaT], [pd.NaT]])
+        second = pd.DataFrame([[pd.NaT]])
+
+        result = pd.concat([first, second], axis=0)
+        expected = pd.DataFrame([pd.NaT, pd.NaT, pd.NaT], index=[0, 1, 0])
+        assert_frame_equal(result, expected)
+
+        # one side timezone-aware
+        # upcasts for mixed case
+        first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'))
+        result = pd.concat([first, second], axis=0)
+        expected = pd.DataFrame(
+            pd.Series([pd.NaT, pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
+            index=[0, 1, 0]
+        )
+        assert_frame_equal(result, expected)
+
+        # both sides timezone-aware
+        # upcasts to tz-aware
+        second = pd.DataFrame(pd.Series([pd.NaT]).dt.tz_localize('UTC'))
+        result = pd.concat([first, second], axis=0)
+        assert_frame_equal(result, expected)
+
+    def test_concat_NaT_dataframes_all_NaT_axis_1(self):
+        # GH 12396
+
+        # tz-naive
+        first = pd.DataFrame([[pd.NaT], [pd.NaT]])
+        second = pd.DataFrame([[pd.NaT]], columns=[1])
+        expected = pd.DataFrame([[pd.NaT, pd.NaT], [pd.NaT, pd.NaT]],
+                                columns=[0, 1])
+        result = pd.concat([first, second], axis=1)
+        assert_frame_equal(result, expected)
+
+        # one side timezone-aware
+        # upcasts result to tz-aware
+        first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'))
+        expected = pd.DataFrame(
+            {0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
+             1: pd.Series([pd.NaT, pd.NaT])}
+        )
+        result = pd.concat([first, second], axis=1)
+        assert_frame_equal(result, expected)
+
+        # both sides timezone-aware
+        # upcasts result to tz-aware
+        second[1] = second[1].dt.tz_localize('UTC')
+        expected = pd.DataFrame(
+            {0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
+             1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC')}
+        )
+        result = pd.concat([first, second], axis=1)
+        assert_frame_equal(result, expected)
+
+    def test_concat_NaT_dataframes_mixed_timestamps_and_NaT(self):
+        # GH 12396
+
+        # tz-naive
+        first = pd.DataFrame([[pd.NaT], [pd.NaT]])
+        second = pd.DataFrame([[pd.Timestamp('2015/01/01')],
+                               [pd.Timestamp('2016/01/01')]],
+                              index=[2, 3])
+        expected = pd.DataFrame([pd.NaT, pd.NaT,
+                                 pd.Timestamp('2015/01/01'),
+                                 pd.Timestamp('2016/01/01')])
+
+        result = pd.concat([first, second], axis=0)
+        assert_frame_equal(result, expected)
+
+        # one side timezone-aware
+        second = second[0].dt.tz_localize('UTC')
+        expected = pd.DataFrame(
+            pd.Series([pd.NaT, pd.NaT,
+                       pd.Timestamp('2015/01/01'),
+                       pd.Timestamp('2016/01/01')]).dt.tz_localize('UTC')
+        )
+        result = pd.concat([first, second], axis=0)
+        assert_frame_equal(result, expected)
+
+    def test_concat_NaT_series_dataframe_all_NaT(self):
+        # GH 12396
+
+        # tz-naive
+        first = pd.Series([pd.NaT, pd.NaT])
+        second = pd.DataFrame([[pd.Timestamp('2015/01/01')],
+                               [pd.Timestamp('2016/01/01')]],
+                              index=[2, 3])
+
+        expected = pd.DataFrame([pd.NaT, pd.NaT,
+                                 pd.Timestamp('2015/01/01'),
+                                 pd.Timestamp('2016/01/01')])
+
+        result = pd.concat([first, second])
+        assert_frame_equal(result, expected)
+
+        # one side timezone-aware
+        second[0] = second[0].dt.tz_localize('UTC')
+        result = pd.concat([first, second])
+
+        expected = pd.DataFrame(
+            pd.Series([pd.NaT, pd.NaT,
+                       pd.Timestamp('2015/01/01'),
+                       pd.Timestamp('2016/01/01')]).dt.tz_localize('UTC')
+        )
+        assert_frame_equal(result, expected)
+
+        # both sides timezone-aware
+        first = first.dt.tz_localize('UTC')
+        result = pd.concat([first, second])
+        assert_frame_equal(result, expected)
+
+        # mixed tz
+        first = pd.DataFrame([[pd.NaT], [pd.NaT]])
+        second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz='UTC')],
+                               [pd.Timestamp('2016/01/01', tz='US/Eastern')]],
+                              index=[2, 3])
+
+        expected = pd.DataFrame([pd.NaT,
+                                 pd.NaT,
+                                 pd.Timestamp('2015/01/01', tz='UTC'),
+                                 pd.Timestamp('2016/01/01', tz='US/Eastern')])
+
+        result = pd.concat([first, second], axis=0)
+        assert_frame_equal(result, expected)
+
     def test_concat_period_series(self):
         x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
         y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D'))
@@ -1926,6 +2055,32 @@ def test_concat_empty_series(self):
                             columns=['x', 0])
         tm.assert_frame_equal(res, exp)
 
+        # GH 18447
+        # tz-naive
+        first = Series(pd.to_datetime([], utc=False))
+        second = Series([1, 2, 3])
+        expected = DataFrame([[pd.NaT, 1], [pd.NaT, 2], [pd.NaT, 3]])
+        result = concat([first, second], axis=1)
+        assert_frame_equal(result, expected)
+
+        # timezone-aware
+        first = Series(pd.to_datetime([], utc=True))
+        second = Series([1, 2, 3])
+        expected = DataFrame(
+            {0: pd.Series([pd.NaT, pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
+             1: pd.Series([1, 2, 3])}
+        )
+        result = concat([first, second], axis=1)
+        assert_frame_equal(result, expected)
+
+        # both empty
+        first = Series(pd.to_datetime([], utc=True))
+        second = Series([])
+        result = concat([first, second], axis=1)
+        assert result.size == 0
+        assert result.dtypes[0] == 'datetime64[ns, UTC]'
+        assert result.dtypes[1] == 'float64'
+
     def test_default_index(self):
         # is_series and ignore_index
         s1 = pd.Series([1, 2, 3], name='x')

From cf618db5ea2a0e8661dbb5176b488b4fd954a790 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sat, 28 Apr 2018 07:27:59 -0500
Subject: [PATCH 2/4] use new take

---
 pandas/core/indexes/base.py | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 683f0ea795bd7..a2df52e3cc4e3 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -2170,21 +2170,11 @@ def _assert_take_fillable(self, values, indices, allow_fill=True,
                               fill_value=None, na_value=np.nan):
         """ Internal method to handle NA filling of take """
         indices = _ensure_platform_int(indices)
-        # only fill if we are passing a non-None fill_value
-        if allow_fill and fill_value is not None:
-            if (indices < -1).any():
-                msg = ('When allow_fill=True and fill_value is not None, '
-                       'all indices must be >= -1')
-                raise ValueError(msg)
-            mask = indices == -1
-            if mask.all():
-                taken = np.full(indices.shape, fill_value=na_value)
-            else:
-                taken = values.take(indices)
-                if mask.any():
-                    taken[mask] = na_value
-        else:
-            taken = values.take(indices)
+        # TODO: figure out what is going on with fill_value. It seems
+        # to be unused, other than that assertion that it's not None
+        # when allow_fill & any -1.
+        taken = algos.take(values, indices, allow_fill=allow_fill,
+                           fill_value=na_value)
         return taken
 
     @cache_readonly

From 49eefd7a2a5212ff5483a4324e5322a07b723d2d Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sat, 28 Apr 2018 14:02:34 -0500
Subject: [PATCH 3/4] Revert "use new take"

This reverts commit cf618db5ea2a0e8661dbb5176b488b4fd954a790.
---
 pandas/core/indexes/base.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index a2df52e3cc4e3..683f0ea795bd7 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -2170,11 +2170,21 @@ def _assert_take_fillable(self, values, indices, allow_fill=True,
                               fill_value=None, na_value=np.nan):
         """ Internal method to handle NA filling of take """
         indices = _ensure_platform_int(indices)
-        # TODO: figure out what is going on with fill_value. It seems
-        # to be unused, other than that assertion that it's not None
-        # when allow_fill & any -1.
-        taken = algos.take(values, indices, allow_fill=allow_fill,
-                           fill_value=na_value)
+        # only fill if we are passing a non-None fill_value
+        if allow_fill and fill_value is not None:
+            if (indices < -1).any():
+                msg = ('When allow_fill=True and fill_value is not None, '
+                       'all indices must be >= -1')
+                raise ValueError(msg)
+            mask = indices == -1
+            if mask.all():
+                taken = np.full(indices.shape, fill_value=na_value)
+            else:
+                taken = values.take(indices)
+                if mask.any():
+                    taken[mask] = na_value
+        else:
+            taken = values.take(indices)
         return taken
 
     @cache_readonly

From 4323f5ee05c53faa03c1983ae2543e17f4bb4a27 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Thu, 10 May 2018 14:34:51 -0400
Subject: [PATCH 4/4] fix some tests

---
 pandas/core/dtypes/concat.py        |   8 +-
 pandas/tests/reshape/test_concat.py | 180 ++++++++++------------------
 2 files changed, 69 insertions(+), 119 deletions(-)

diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index 4aa74cdbbc2c0..1f4d3069838ba 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -465,8 +465,12 @@ def convert_to_pydatetime(x, axis):
 
     if _contains_datetime:
 
         if 'datetime' in typs:
-            new_values = np.concatenate([x.view(np.int64) for x in
-                                         to_concat], axis=axis)
+            to_concat = [np.array(x, copy=False).view(np.int64)
+                         for x in to_concat]
+            if axis == 1:
+                to_concat = [np.atleast_2d(x) for x in to_concat]
+
+            new_values = np.concatenate(to_concat, axis=axis)
             return new_values.view(_NS_DTYPE)
         else:
             # when to_concat has different tz, len(typs) > 1.
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index 567256df51bd9..7d4ffc964c7af 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -1917,131 +1917,88 @@ def test_concat_tz_series_tzlocal(self):
         tm.assert_series_equal(result, pd.Series(x + y))
         assert result.dtype == 'datetime64[ns, tzlocal()]'
 
-    def test_concat_NaT_dataframes_all_NaT_axis_0(self):
+    @pytest.mark.parametrize('tz1', [None, 'UTC'])
+    @pytest.mark.parametrize('tz2', [None, 'UTC'])
+    @pytest.mark.parametrize('s', [pd.NaT, pd.Timestamp('20150101')])
+    def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s):
         # GH 12396
 
         # tz-naive
-        first = pd.DataFrame([[pd.NaT], [pd.NaT]])
-        second = pd.DataFrame([[pd.NaT]])
-
-        result = pd.concat([first, second], axis=0)
-        expected = pd.DataFrame([pd.NaT, pd.NaT, pd.NaT], index=[0, 1, 0])
-        assert_frame_equal(result, expected)
-
-        # one side timezone-aware
-        # upcasts for mixed case
-        first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'))
-        result = pd.concat([first, second], axis=0)
-        expected = pd.DataFrame(
-            pd.Series([pd.NaT, pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
-            index=[0, 1, 0]
-        )
-        assert_frame_equal(result, expected)
+        first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply(
+            lambda x: x.dt.tz_localize(tz1))
+        second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2))
+
+        # we are all NaT so this is ok
+        if tz1 is None:
+            tz = tz2
+        elif tz2 is None:
+            tz = tz1
+        elif tz1 == tz2:
+            tz = tz1
+        else:
+            tz = None
 
-        # both sides timezone-aware
-        # upcasts to tz-aware
-        second = pd.DataFrame(pd.Series([pd.NaT]).dt.tz_localize('UTC'))
         result = pd.concat([first, second], axis=0)
+        expected = pd.DataFrame(pd.Series(
+            [pd.NaT, pd.NaT, s], index=[0, 1, 0]))
+        expected = expected.apply(lambda x: x.dt.tz_localize(tz))
         assert_frame_equal(result, expected)
 
-    def test_concat_NaT_dataframes_all_NaT_axis_1(self):
+    @pytest.mark.parametrize('tz1', [None, 'UTC'])
+    @pytest.mark.parametrize('tz2', [None, 'UTC'])
+    def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
         # GH 12396
 
-        # tz-naive
-        first = pd.DataFrame([[pd.NaT], [pd.NaT]])
-        second = pd.DataFrame([[pd.NaT]], columns=[1])
-        expected = pd.DataFrame([[pd.NaT, pd.NaT], [pd.NaT, pd.NaT]],
-                                columns=[0, 1])
-        result = pd.concat([first, second], axis=1)
-        assert_frame_equal(result, expected)
-
-        # one side timezone-aware
-        # upcasts result to tz-aware
-        first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'))
+        first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1))
+        second = pd.DataFrame(pd.Series(
+            [pd.NaT]).dt.tz_localize(tz2), columns=[1])
         expected = pd.DataFrame(
-            {0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
-             1: pd.Series([pd.NaT, pd.NaT])}
+            {0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1),
+             1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2)}
         )
         result = pd.concat([first, second], axis=1)
         assert_frame_equal(result, expected)
 
-        # both sides timezone-aware
-        # upcasts result to tz-aware
-        second[1] = second[1].dt.tz_localize('UTC')
-        expected = pd.DataFrame(
-            {0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
-             1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC')}
-        )
-        result = pd.concat([first, second], axis=1)
-        assert_frame_equal(result, expected)
-
-    def test_concat_NaT_dataframes_mixed_timestamps_and_NaT(self):
+    @pytest.mark.parametrize('tz1', [None, 'UTC'])
+    @pytest.mark.parametrize('tz2', [None, 'UTC'])
+    def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
         # GH 12396
 
         # tz-naive
-        first = pd.DataFrame([[pd.NaT], [pd.NaT]])
-        second = pd.DataFrame([[pd.Timestamp('2015/01/01')],
-                               [pd.Timestamp('2016/01/01')]],
+        first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
+        second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz2)],
+                               [pd.Timestamp('2016/01/01', tz=tz2)]],
                               index=[2, 3])
-        expected = pd.DataFrame([pd.NaT, pd.NaT,
-                                 pd.Timestamp('2015/01/01'),
-                                 pd.Timestamp('2016/01/01')])
-
-        result = pd.concat([first, second], axis=0)
-        assert_frame_equal(result, expected)
-
-        # one side timezone-aware
-        second = second[0].dt.tz_localize('UTC')
-        expected = pd.DataFrame(
-            pd.Series([pd.NaT, pd.NaT,
-                       pd.Timestamp('2015/01/01'),
-                       pd.Timestamp('2016/01/01')]).dt.tz_localize('UTC')
-        )
-        result = pd.concat([first, second], axis=0)
-        assert_frame_equal(result, expected)
-
-    def test_concat_NaT_series_dataframe_all_NaT(self):
-        # GH 12396
-
-        # tz-naive
-        first = pd.Series([pd.NaT, pd.NaT])
-        second = pd.DataFrame([[pd.Timestamp('2015/01/01')],
-                               [pd.Timestamp('2016/01/01')]],
                               index=[2, 3])
+        if tz1 is None and tz2 is None:
+            tz = None
+        # we are all NaT so this is ok
+        elif tz1 is None:
+            tz = tz2
+        elif tz1 == tz2:
+            tz = tz1
+        else:
+            tz = None
 
         expected = pd.DataFrame([pd.NaT, pd.NaT,
-                                 pd.Timestamp('2015/01/01'),
-                                 pd.Timestamp('2016/01/01')])
-
-        result = pd.concat([first, second])
-        assert_frame_equal(result, expected)
+                                 pd.Timestamp('2015/01/01', tz=tz),
+                                 pd.Timestamp('2016/01/01', tz=tz)])
 
-        # one side timezone-aware
-        second[0] = second[0].dt.tz_localize('UTC')
         result = pd.concat([first, second])
-
-        expected = pd.DataFrame(
-            pd.Series([pd.NaT, pd.NaT,
-                       pd.Timestamp('2015/01/01'),
-                       pd.Timestamp('2016/01/01')]).dt.tz_localize('UTC')
-        )
         assert_frame_equal(result, expected)
 
-        # both sides timezone-aware
-        first = first.dt.tz_localize('UTC')
-        result = pd.concat([first, second])
-        assert_frame_equal(result, expected)
+    @pytest.mark.parametrize('tz', [None, 'UTC'])
+    def test_concat_NaT_dataframes(self, tz):
+        # GH 12396
 
-        # mixed tz
         first = pd.DataFrame([[pd.NaT], [pd.NaT]])
+        first = first.apply(lambda x: x.dt.tz_localize(tz))
-        second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz='UTC')],
-                               [pd.Timestamp('2016/01/01', tz='US/Eastern')]],
+        second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz)],
+                               [pd.Timestamp('2016/01/01', tz=tz)]],
                               index=[2, 3])
-
-        expected = pd.DataFrame([pd.NaT,
-                                 pd.NaT,
-                                 pd.Timestamp('2015/01/01', tz='UTC'),
-                                 pd.Timestamp('2016/01/01', tz='US/Eastern')])
+        expected = pd.DataFrame([pd.NaT, pd.NaT,
+                                 pd.Timestamp('2015/01/01', tz=tz),
+                                 pd.Timestamp('2016/01/01', tz=tz)])
 
         result = pd.concat([first, second], axis=0)
         assert_frame_equal(result, expected)
@@ -2107,32 +2064,21 @@ def test_concat_empty_series(self):
                             columns=['x', 0])
         tm.assert_frame_equal(res, exp)
 
+    @pytest.mark.parametrize('tz', [None, 'UTC'])
+    @pytest.mark.parametrize('values', [[], [1, 2, 3]])
+    def test_concat_empty_series_timelike(self, tz, values):
         # GH 18447
-        # tz-naive
-        first = Series(pd.to_datetime([], utc=False))
-        second = Series([1, 2, 3])
-        expected = DataFrame([[pd.NaT, 1], [pd.NaT, 2], [pd.NaT, 3]])
-        result = concat([first, second], axis=1)
-        assert_frame_equal(result, expected)
-
-        # timezone-aware
-        first = Series(pd.to_datetime([], utc=True))
-        second = Series([1, 2, 3])
+        first = Series([], dtype='M8[ns]').dt.tz_localize(tz)
+        second = Series(values)
         expected = DataFrame(
-            {0: pd.Series([pd.NaT, pd.NaT, pd.NaT]).dt.tz_localize('UTC'),
-             1: pd.Series([1, 2, 3])}
-        )
+            {0: pd.Series([pd.NaT] * len(values),
+                          dtype='M8[ns]'
+                          ).dt.tz_localize(tz),
+             1: values})
         result = concat([first, second], axis=1)
         assert_frame_equal(result, expected)
 
-        # both empty
-        first = Series(pd.to_datetime([], utc=True))
-        second = Series([])
-        result = concat([first, second], axis=1)
-        assert result.size == 0
-        assert result.dtypes[0] == 'datetime64[ns, UTC]'
-        assert result.dtypes[1] == 'float64'
-
     def test_default_index(self):
         # is_series and ignore_index
         s1 = pd.Series([1, 2, 3], name='x')
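
The behavior this patch series targets can be illustrated with a short, self-contained sketch. The snippet below is not part of the patches; the variable names are illustrative, and it assumes a pandas build that already contains these fixes (0.23.0 or later). On earlier versions, both concat calls raise instead of returning upcast results.

# Illustrative sketch only -- not part of the patch series.
# Assumes pandas >= 0.23.0, where GH 12396 and GH 18447 are fixed.
import pandas as pd

# GH 12396: concatenating an all-NaT tz-aware frame with an all-NaT tz-naive frame.
tz_aware = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize('UTC'))
tz_naive = pd.DataFrame([[pd.NaT]])
result = pd.concat([tz_aware, tz_naive], axis=0)
print(result.dtypes)  # column 0 is upcast to datetime64[ns, UTC], all values NaT

# GH 18447: concatenating an empty tz-aware series with a non-empty series.
empty_utc = pd.Series(pd.to_datetime([], utc=True))
numbers = pd.Series([1, 2, 3])
result = pd.concat([empty_utc, numbers], axis=1)
print(result.dtypes)  # column 0 datetime64[ns, UTC] filled with NaT, column 1 the numeric values

This mirrors the parametrized tests added in PATCH 4: the tz-aware side wins the upcast when the other side is all-NaT or empty, rather than raising.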