From 44a0fcf9e0411350d9ca0869a184cbf007046e68 Mon Sep 17 00:00:00 2001 From: hcontrast Date: Sun, 8 May 2016 19:13:32 +0100 Subject: [PATCH 1/3] handle tz-aware all NaT concatenation incl. mixed cases (GH12396) --- pandas/core/internals.py | 6 ++ pandas/tools/tests/test_merge.py | 115 +++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index abfc5c989056e..7871b94bd1588 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4911,6 +4911,12 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): pass elif getattr(self.block, 'is_sparse', False): pass + elif com.is_extension_type(empty_dtype) and \ + com.is_datetimetz(empty_dtype): + num_elements = np.prod(self.shape) + # handle timezone-aware all NaT cases + return DatetimeIndex([fill_value] * num_elements, + dtype=empty_dtype) else: missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 13f00afb5a489..d32416523df0f 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -12,6 +12,7 @@ from pandas.compat import range, lrange, lzip, StringIO from pandas import compat from pandas.tseries.index import DatetimeIndex +from pandas.types.dtypes import DatetimeTZDtype from pandas.tools.merge import merge, concat, ordered_merge, MergeError from pandas import Categorical, Timestamp from pandas.util.testing import (assert_frame_equal, assert_series_equal, @@ -2522,6 +2523,120 @@ def test_concat_multiindex_with_tz(self): result = concat([df, df]) tm.assert_frame_equal(result, expected) + def test_concat_NaT_dataframes_all_NaT_axis_0(self): + # GH 12396 + expect = pd.DataFrame([pd.NaT, pd.NaT, pd.NaT], index=[0, 1, 0]) + + # non-timezone aware + first = pd.DataFrame([[pd.NaT], [pd.NaT]]) + second = pd.DataFrame([[pd.NaT]]) + + result = pd.concat([first, second], axis=0) + assert_frame_equal(result, expect) + + # one side timezone-aware + dtype = DatetimeTZDtype('ns', tz='UTC') + first = pd.DataFrame([[pd.NaT], [pd.NaT]], dtype=dtype) + + result = pd.concat([first, second], axis=0) + # upcasts for mixed case + assert_frame_equal(result, expect, check_dtype=False) + self.assertEqual(result.dtypes[0], dtype) + + # both sides timezone-aware + second = pd.DataFrame([[pd.NaT]], dtype=dtype) + + result = pd.concat([first, second], axis=0) + # upcasts to tz-aware + assert_frame_equal(result, expect, check_dtype=False) + self.assertEqual(result.dtypes[0], dtype) + + def test_concat_NaT_dataframes_all_NaT_axis_1(self): + # GH 12396 + expect = pd.DataFrame([[pd.NaT, pd.NaT], [pd.NaT, pd.NaT]], + columns=[0, 0]) + + # non-timezone aware + first = pd.DataFrame([[pd.NaT], [pd.NaT]]) + second = pd.DataFrame([[pd.NaT]]) + + result = pd.concat([first, second], axis=1) + assert_frame_equal(result, expect) + + # one side timezone-aware + dtype = DatetimeTZDtype('ns', tz='UTC') + first = pd.DataFrame([[pd.NaT], [pd.NaT]], dtype=dtype) + + # upcasts result to tz-aware + assert_frame_equal(result, expect, check_dtype=False) + result = pd.concat([first, second], axis=1) + self.assertEqual(result.dtypes.iloc[0], dtype) + self.assertEqual(result.dtypes.iloc[0], first.dtypes[0]) + self.assertEqual(result.dtypes.iloc[1], second.dtypes[0]) + + # both sides timezone-aware + second = pd.DataFrame([[pd.NaT]], dtype=dtype) + + result = pd.concat([first, second], axis=1) + assert_frame_equal(result, expect, check_dtype=False) + # upcasts to tz-aware + self.assertEqual(result.dtypes.iloc[0], dtype) + self.assertEqual(result.dtypes.iloc[0], first.dtypes[0]) + self.assertEqual(result.dtypes.iloc[1], second.dtypes[0]) + + def test_concat_NaT_dataframes_mixed_timestamps_and_NaT(self): + # GH 12396 + + # non-timezone aware + first = pd.DataFrame([[pd.NaT], [pd.NaT]]) + second = pd.DataFrame([[pd.Timestamp('2015/01/01')], + [pd.Timestamp('2016/01/01')]]) + + expect = pd.DataFrame([pd.NaT, pd.NaT, second.iloc[0, 0], + second.iloc[1, 0]], index=[0, 1, 0, 1]) + + result = pd.concat([first, second], axis=0) + assert_frame_equal(result, expect) + self.assertEqual(result.dtypes.iloc[0], first.dtypes[0]) + + # one side timezone-aware + dtype = DatetimeTZDtype('ns', tz='UTC') + second = second.apply(lambda x: x.astype(dtype)) + + result = pd.concat([first, second], axis=0) + assert_frame_equal(result, expect, check_dtype=False) + # upcasts + self.assertEqual(result.dtypes.iloc[0], dtype) + self.assertEqual(result.dtypes.iloc[0], second.dtypes[0]) + + def test_concat_NaT_series_dataframe_all_NaT(self): + # GH 12396 + + # non-timezone aware + first = pd.Series([pd.NaT, pd.NaT]) + second = pd.DataFrame([[pd.Timestamp('2015/01/01')], + [pd.Timestamp('2016/01/01')]]) + + expect = pd.DataFrame([pd.NaT, pd.NaT, second.iloc[0, 0], + second.iloc[1, 0]], index=[0, 1, 0, 1]) + + result = pd.concat([first, second]) + assert_frame_equal(result, expect) + + # one side timezone-aware + dtype = DatetimeTZDtype('ns', tz='UTC') + second = second.apply(lambda x: x.astype(dtype)) + + result = pd.concat([first, second]) + + expect = expect.apply(lambda x: x.astype(dtype)) + assert_frame_equal(result, expect, check_dtype=True) + + # both sides timezone-aware + first = first.astype(dtype) + result = pd.concat([first, second]) + assert_frame_equal(result, expect, check_dtype=True) + def test_concat_keys_and_levels(self): df = DataFrame(np.random.randn(1, 3)) df2 = DataFrame(np.random.randn(1, 4)) From f95bb83c9f47c40463e741f4f7e5b16060fb8e19 Mon Sep 17 00:00:00 2001 From: afragner Date: Sun, 8 May 2016 22:39:06 +0100 Subject: [PATCH 2/3] improve tests for all-NaT concatenation (GH12396) --- pandas/core/internals.py | 14 ++++++------- pandas/tools/tests/test_merge.py | 36 +++++++++++++------------------- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7871b94bd1588..188b00348b4e1 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4905,18 +4905,16 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): if len(values) and values[0] is None: fill_value = None - if getattr(self.block, 'is_datetimetz', False): - pass + if getattr(self.block, 'is_datetimetz', False) \ + or com.is_datetimetz(empty_dtype): + # handle timezone-aware all NaT cases + num_elements = np.prod(self.shape) + return DatetimeIndex([fill_value] * num_elements, + dtype=empty_dtype) elif getattr(self.block, 'is_categorical', False): pass elif getattr(self.block, 'is_sparse', False): pass - elif com.is_extension_type(empty_dtype) and \ - com.is_datetimetz(empty_dtype): - num_elements = np.prod(self.shape) - # handle timezone-aware all NaT cases - return DatetimeIndex([fill_value] * num_elements, - dtype=empty_dtype) else: missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index d32416523df0f..6831f1a315640 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -2540,25 +2540,24 @@ def test_concat_NaT_dataframes_all_NaT_axis_0(self): result = pd.concat([first, second], axis=0) # upcasts for mixed case - assert_frame_equal(result, expect, check_dtype=False) - self.assertEqual(result.dtypes[0], dtype) + expect = expect.apply(lambda x: x.astype(dtype)) + assert_frame_equal(result, expect) # both sides timezone-aware second = pd.DataFrame([[pd.NaT]], dtype=dtype) - result = pd.concat([first, second], axis=0) # upcasts to tz-aware - assert_frame_equal(result, expect, check_dtype=False) - self.assertEqual(result.dtypes[0], dtype) + result = pd.concat([first, second], axis=0) + assert_frame_equal(result, expect) def test_concat_NaT_dataframes_all_NaT_axis_1(self): # GH 12396 expect = pd.DataFrame([[pd.NaT, pd.NaT], [pd.NaT, pd.NaT]], - columns=[0, 0]) + columns=[0, 1]) # non-timezone aware first = pd.DataFrame([[pd.NaT], [pd.NaT]]) - second = pd.DataFrame([[pd.NaT]]) + second = pd.DataFrame([[pd.NaT]], columns=[1]) result = pd.concat([first, second], axis=1) assert_frame_equal(result, expect) @@ -2568,21 +2567,17 @@ def test_concat_NaT_dataframes_all_NaT_axis_1(self): first = pd.DataFrame([[pd.NaT], [pd.NaT]], dtype=dtype) # upcasts result to tz-aware - assert_frame_equal(result, expect, check_dtype=False) + expect.loc[:, 0] = expect.loc[:, 0].astype(dtype) result = pd.concat([first, second], axis=1) - self.assertEqual(result.dtypes.iloc[0], dtype) - self.assertEqual(result.dtypes.iloc[0], first.dtypes[0]) - self.assertEqual(result.dtypes.iloc[1], second.dtypes[0]) + assert_frame_equal(result, expect) # both sides timezone-aware - second = pd.DataFrame([[pd.NaT]], dtype=dtype) + second = pd.DataFrame([[pd.NaT]], dtype=dtype, columns=[1]) - result = pd.concat([first, second], axis=1) - assert_frame_equal(result, expect, check_dtype=False) # upcasts to tz-aware - self.assertEqual(result.dtypes.iloc[0], dtype) - self.assertEqual(result.dtypes.iloc[0], first.dtypes[0]) - self.assertEqual(result.dtypes.iloc[1], second.dtypes[0]) + expect = expect.apply(lambda x: x.astype(dtype)) + result = pd.concat([first, second], axis=1) + assert_frame_equal(result, expect) def test_concat_NaT_dataframes_mixed_timestamps_and_NaT(self): # GH 12396 @@ -2597,17 +2592,14 @@ def test_concat_NaT_dataframes_mixed_timestamps_and_NaT(self): result = pd.concat([first, second], axis=0) assert_frame_equal(result, expect) - self.assertEqual(result.dtypes.iloc[0], first.dtypes[0]) # one side timezone-aware dtype = DatetimeTZDtype('ns', tz='UTC') second = second.apply(lambda x: x.astype(dtype)) result = pd.concat([first, second], axis=0) - assert_frame_equal(result, expect, check_dtype=False) - # upcasts - self.assertEqual(result.dtypes.iloc[0], dtype) - self.assertEqual(result.dtypes.iloc[0], second.dtypes[0]) + expect = expect.apply(lambda x: x.astype(dtype)) + assert_frame_equal(result, expect) def test_concat_NaT_series_dataframe_all_NaT(self): # GH 12396 From 10406dfa330d6c8f4f5f983bde239777686d60a1 Mon Sep 17 00:00:00 2001 From: afragner Date: Sun, 8 May 2016 22:41:10 +0100 Subject: [PATCH 3/3] clean up tests for GH12396 --- pandas/tools/tests/test_merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 6831f1a315640..a44db54e36f38 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -2622,12 +2622,12 @@ def test_concat_NaT_series_dataframe_all_NaT(self): result = pd.concat([first, second]) expect = expect.apply(lambda x: x.astype(dtype)) - assert_frame_equal(result, expect, check_dtype=True) + assert_frame_equal(result, expect) # both sides timezone-aware first = first.astype(dtype) result = pd.concat([first, second]) - assert_frame_equal(result, expect, check_dtype=True) + assert_frame_equal(result, expect) def test_concat_keys_and_levels(self): df = DataFrame(np.random.randn(1, 3))