From 24d67830b4421663601d43c8448fc919d2080529 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 15 Nov 2018 16:51:34 -0800 Subject: [PATCH 1/4] Fix+test dataframe transpose with datetimeTZ --- pandas/core/generic.py | 51 +++++++++++++++++++-- pandas/core/internals/managers.py | 4 ++ pandas/tests/arithmetic/conftest.py | 27 +++++++++++ pandas/tests/arithmetic/test_datetime64.py | 29 ++++++------ pandas/tests/arithmetic/test_timedelta64.py | 10 ++-- pandas/tests/frame/test_reshape.py | 31 +++++++++++++ 6 files changed, 128 insertions(+), 24 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a1d60c493dda0..d57649f7f60ec 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -21,6 +21,7 @@ is_integer, is_bool, is_bool_dtype, is_numeric_dtype, + is_datetime64_dtype, is_datetime64_any_dtype, is_timedelta64_dtype, is_datetime64tz_dtype, @@ -34,7 +35,8 @@ from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna -from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame +from pandas.core.dtypes.generic import ( + ABCSeries, ABCPanel, ABCDataFrame, ABCDatetimeIndex) from pandas.core.base import PandasObject, SelectionMixin from pandas.core.index import (Index, MultiIndex, ensure_index, @@ -683,11 +685,52 @@ def transpose(self, *args, **kwargs): new_axes = self._construct_axes_dict_from(self, [self._get_axis(x) for x in axes_names]) - new_values = self.values.transpose(axes_numbers) - if kwargs.pop('copy', None) or (len(args) and args[-1]): - new_values = new_values.copy() + copy = kwargs.pop('copy', None) or (len(args) and args[-1]) nv.validate_transpose_for_generic(self, kwargs) + + values = self.values + if (isinstance(values, ABCDatetimeIndex) and + values.tz is not None and self.ndim > 1): + # transpose is a no-op, and passing axes would raise ValueError + # as the DatetimeIndex.transpose 
method does not accept that kwarg + tz = values.tz + utc_values = values.asi8.reshape(self.shape) + utc_values = utc_values.transpose(axes_numbers) + if copy: + utc_values = utc_values.copy() + result = self._constructor(utc_values, **new_axes) + if self.ndim > 2: + # We're assuming DataFrame from here on + raise NotImplementedError + + for col in result.columns: + result[col] = pd.DatetimeIndex(result[col], tz=tz) + return result.__finalize__(self) + + else: + new_values = values.transpose(axes_numbers) + if copy: + new_values = new_values.copy() + + if is_datetime64_dtype(new_values): + # case where we have multiple columns with identical + # datetime64tz dtypes; the dtype will be lost in the call + # to `self.values`, so we need to restore it. + dtypes = self.dtypes + if (any(is_datetime64tz_dtype(d) for d in dtypes) and + all(d == dtypes[0] for d in dtypes)): + # these values represent UTC timestamps + new_values = new_values.view('i8') + result = self._constructor(new_values, **new_axes) + if self.ndim != 2: + # assuming DataFrame from here on out + raise NotImplementedError + tz = self.dtypes[0].tz + for col in result.columns: + result[col] = pd.DatetimeIndex(result[col], tz=tz) + return result.__finalize__(self) + return self._constructor(new_values, **new_axes).__finalize__(self) def swapaxes(self, axis1, axis2, copy=True): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0519c5e5abe33..c8fb203b16ec6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -14,6 +14,7 @@ from pandas.core.dtypes.common import ( _NS_DTYPE, + is_datetime64tz_dtype, is_datetimelike_v_numeric, is_numeric_v_string_like, is_extension_type, is_extension_array_dtype, @@ -781,6 +782,9 @@ def _interleave(self): dtype = dtype.subtype elif is_extension_array_dtype(dtype): dtype = 'object' + elif is_datetime64tz_dtype(dtype): + # TODO: we shouldn't be temporarily-dropping dtype information + dtype = 'M8[ns]' result 
= np.empty(self.shape, dtype=dtype) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 9ee5e05638978..a30d340f66fb8 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -158,6 +158,19 @@ def box_df_fail(request): return request.param +@pytest.fixture(params=[(pd.Index, False), + (pd.Series, False), + (pd.DataFrame, False), + (pd.DataFrame, True)], + ids=lambda x: x[0].__name__ + '-' + str(x[1])) +def box_with_transpose(request): + """ + Fixture similar to `box` but testing both transpose cases for DataFrame + """ + # GH#23620 + return request.param + + @pytest.fixture(params=[(pd.Index, False), (pd.Series, False), (pd.DataFrame, False), @@ -189,3 +202,17 @@ def box_with_datetime(request): Like `box`, but specific to datetime64 for also testing DatetimeArray """ return request.param + + +@pytest.fixture(params=[(pd.Index, False), + (pd.Series, False), + (pd.DataFrame, False), + (pd.DataFrame, True), + (DatetimeArray, False)], + ids=lambda x: x[0].__name__ + '-' + str(x[1])) +def box_T_with_datetime(request): + """ + Like `box`, but specific to datetime64 for also testing DatetimeArray, + and both transpose cases for DataFrame + """ + return request.param diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index b25e9a9a485c2..f2f879f3772df 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1153,20 +1153,19 @@ def test_dti_add_intarray_no_freq(self, box): # Binary operations DatetimeIndex and timedelta-like def test_dti_add_timedeltalike(self, tz_naive_fixture, two_hours, - box_with_datetime): + box_T_with_datetime): # GH#22005, GH#22163 check DataFrame doesn't raise TypeError - box = box_with_datetime + box, transpose = box_T_with_datetime tz = tz_naive_fixture rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - # FIXME: calling with transpose=True raises ValueError - rng = 
tm.box_expected(rng, box, transpose=False) + rng = tm.box_expected(rng, box, transpose=transpose) result = rng + two_hours expected = pd.date_range('2000-01-01 02:00', '2000-02-01 02:00', tz=tz) - expected = tm.box_expected(expected, box, transpose=False) + expected = tm.box_expected(expected, box, transpose=transpose) tm.assert_equal(result, expected) def test_dti_iadd_timedeltalike(self, tz_naive_fixture, two_hours): @@ -1193,18 +1192,18 @@ def test_dti_isub_timedeltalike(self, tz_naive_fixture, two_hours): rng -= two_hours tm.assert_index_equal(rng, expected) - def test_dt64arr_add_sub_td64_nat(self, box, tz_naive_fixture): + def test_dt64arr_add_sub_td64_nat(self, box_with_transpose, + tz_naive_fixture): # GH#23320 special handling for timedelta64("NaT") + box, transpose = box_with_transpose tz = tz_naive_fixture dti = pd.date_range("1994-04-01", periods=9, tz=tz, freq="QS") other = np.timedelta64("NaT") expected = pd.DatetimeIndex(["NaT"] * 9, tz=tz) - # FIXME: fails with transpose=True due to tz-aware DataFrame - # transpose bug - obj = tm.box_expected(dti, box, transpose=False) - expected = tm.box_expected(expected, box, transpose=False) + obj = tm.box_expected(dti, box, transpose=transpose) + expected = tm.box_expected(expected, box, transpose=transpose) result = obj + other tm.assert_equal(result, expected) @@ -1785,9 +1784,10 @@ def test_dti_with_offset_series(self, tz_naive_fixture, names): res3 = dti - other tm.assert_series_equal(res3, expected_sub) - def test_dti_add_offset_tzaware(self, tz_aware_fixture, box_with_datetime): + def test_dti_add_offset_tzaware(self, tz_aware_fixture, + box_T_with_datetime): # GH#21610, GH#22163 ensure DataFrame doesn't return object-dtype - box = box_with_datetime + box, transpose = box_T_with_datetime timezone = tz_aware_fixture if timezone == 'US/Pacific': @@ -1800,9 +1800,8 @@ def test_dti_add_offset_tzaware(self, tz_aware_fixture, box_with_datetime): expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00', 
'2010-11-01 07:00'], freq='H', tz=timezone) - # FIXME: these raise ValueError with transpose=True - dates = tm.box_expected(dates, box, transpose=False) - expected = tm.box_expected(expected, box, transpose=False) + dates = tm.box_expected(dates, box, transpose=transpose) + expected = tm.box_expected(expected, box, transpose=transpose) # TODO: parametrize over the scalar being added? radd? sub? offset = dates + pd.offsets.Hour(5) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 58c8b3b07f723..d7042f3cca8af 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -413,19 +413,19 @@ def test_td64arr_sub_timestamp_raises(self, box): with pytest.raises(TypeError, match=msg): idx - Timestamp('2011-01-01') - def test_td64arr_add_timestamp(self, box, tz_naive_fixture): + def test_td64arr_add_timestamp(self, box_with_transpose, tz_naive_fixture): # GH#23215 # TODO: parametrize over scalar datetime types? + box, transpose = box_with_transpose + tz = tz_naive_fixture other = Timestamp('2011-01-01', tz=tz) idx = TimedeltaIndex(['1 day', '2 day']) expected = DatetimeIndex(['2011-01-02', '2011-01-03'], tz=tz) - # FIXME: fails with transpose=True because of tz-aware DataFrame - # transpose bug - idx = tm.box_expected(idx, box, transpose=False) - expected = tm.box_expected(expected, box, transpose=False) + idx = tm.box_expected(idx, box, transpose=transpose) + expected = tm.box_expected(expected, box, transpose=transpose) result = idx + other tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index a53b01466c7a4..da48c2ded2b4f 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -939,3 +939,34 @@ def test_unstack_fill_frame_object(): index=list('xyz') ) assert_frame_equal(result, expected) + + +def test_transpose_dt64tz(): + # GH#???? 
transposing a DataFrame with a single datetime64tz column should + # not raise ValueError + + dti = pd.date_range('1977-04-15', periods=3, freq='MS', tz='US/Hawaii') + + # For reasons unknown this error shows up differently depending on how the + # DataFrame was constructed, so we do this several different ways. + + df1 = dti.to_series(keep_tz=True).to_frame() + df2 = pd.DataFrame(dti, index=dti) + df3 = pd.Series(dti, index=dti).to_frame() + + tm.assert_frame_equal(df1, df2) + tm.assert_frame_equal(df2, df3) + + for frame in [df1, df2, df3]: + frame.T + tm.assert_frame_equal(frame.T.T, frame) + + # Now going the other direction, we have to manually construct the + # transposed dataframe + df = pd.DataFrame(np.arange(9).reshape(3, 3)) + df[0] = dti[0] + df[1] = dti[1] + df[2] = dti[2] + + df.T + tm.assert_frame_equal(df.T.T, df) From a76581fbde603c023dbc95632407297e64bce639 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 16 Nov 2018 11:46:14 -0800 Subject: [PATCH 2/4] tests for mixed dtypes --- pandas/tests/frame/test_reshape.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index da48c2ded2b4f..beb046bd6bd06 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -942,7 +942,7 @@ def test_unstack_fill_frame_object(): def test_transpose_dt64tz(): - # GH#???? 
transposing a DataFrame with a single datetime64tz column should + # GH#23730 transposing a DataFrame with a single datetime64tz column should # not raise ValueError dti = pd.date_range('1977-04-15', periods=3, freq='MS', tz='US/Hawaii') @@ -970,3 +970,24 @@ def test_transpose_dt64tz(): df.T tm.assert_frame_equal(df.T.T, df) + + +def test_transpose_dt64tz_mixed_tz(): + # GH#23730 transposing two datetimetz columns with different tzs + dti = pd.date_range('1977-04-15', periods=3, freq='MS', tz='US/Hawaii') + dti2 = pd.date_range('1977-04-15', periods=3, freq='MS', tz='UTC') + + df = pd.DataFrame({"A": dti, "B": dti2}, columns=["A", "B"]) + df.T + tm.assert_frame_equal(df.T.T, df.astype(object)) + + +def test_transpose_dt64tz_mixed(): + # GH#23730 transposing with datetimetz column and numeric column, + # did not raise before but covering our bases + + dti = pd.date_range('1977-04-15', periods=3, freq='MS', tz='US/Hawaii') + df = pd.DataFrame({"A": dti, "B": [3, 4, 5]}, columns=["A", "B"]) + + df.T + tm.assert_frame_equal(df.T.T, df.astype(object)) From d0de3d83e02e61364e965298e7cbb0f7efd6e0c6 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 18 Nov 2018 08:21:26 -0800 Subject: [PATCH 3/4] implement maybe_restore_dtypes --- pandas/core/frame.py | 4 +- pandas/core/generic.py | 91 +++++++++++++++---------------- pandas/core/internals/managers.py | 4 +- 3 files changed, 49 insertions(+), 50 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8d6e403783fc9..d3a93a576e55f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -568,7 +568,9 @@ def _get_axes(N, K, index=index, columns=columns): # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type - if dtype is None and is_object_dtype(values): + if dtype is None and is_object_dtype(values) and values.shape[0] == 1: + # only do this inference for single-column DataFrame, 
otherwise + # create_block_manager_from_blocks will raise a ValueError values = maybe_infer_to_datetimelike(values) return create_block_manager_from_blocks([values], [columns, index]) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1740c2a5023eb..46eaae2d7db5e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -21,10 +21,10 @@ is_integer, is_bool, is_bool_dtype, is_numeric_dtype, - is_datetime64_dtype, is_datetime64_any_dtype, - is_timedelta64_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, + is_timedelta64_dtype, is_list_like, is_dict_like, is_re_compilable, @@ -32,7 +32,8 @@ is_object_dtype, is_extension_array_dtype, pandas_dtype) -from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask +from pandas.core.dtypes.cast import ( + maybe_promote, maybe_upcast_putmask, maybe_infer_to_datetimelike) from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna from pandas.core.dtypes.generic import ( @@ -685,53 +686,20 @@ def transpose(self, *args, **kwargs): new_axes = self._construct_axes_dict_from(self, [self._get_axis(x) for x in axes_names]) - - copy = kwargs.pop('copy', None) or (len(args) and args[-1]) - nv.validate_transpose_for_generic(self, kwargs) - values = self.values - if (isinstance(values, ABCDatetimeIndex) and - values.tz is not None and self.ndim > 1): - # transpose is a no-op, and passing axes would raise ValueError - # as the DatetimeIndex.transpose method does not accept that kwarg - tz = values.tz - utc_values = values.asi8.reshape(self.shape) - utc_values = utc_values.transpose(axes_numbers) - if copy: - utc_values = utc_values.copy() - result = self._constructor(utc_values, **new_axes) - if self.ndim > 2: - # We're assuming DataFrame from here on - raise NotImplementedError - - for col in result.columns: - result[col] = pd.DatetimeIndex(result[col], tz=tz) - return result.__finalize__(self) + if isinstance(values, ABCDatetimeIndex): + # we must cast 
to numpy array otherwise transpose raises ValueError + values = np.array(values.astype(np.object)).reshape(self.shape) - else: - new_values = values.transpose(axes_numbers) - if copy: - new_values = new_values.copy() - - if is_datetime64_dtype(new_values): - # case where we have multiple columns with identical - # datetime64tz dtypes; the dtype will be lost in the call - # to `self.values`, so we need to restore it. - dtypes = self.dtypes - if (any(is_datetime64tz_dtype(d) for d in dtypes) and - all(d == dtypes[0] for d in dtypes)): - # these values represent UTC timestamps - new_values = new_values.view('i8') - result = self._constructor(new_values, **new_axes) - if self.ndim != 2: - # assuming DataFrame from here on out - raise NotImplementedError - tz = self.dtypes[0].tz - for col in result.columns: - result[col] = pd.DatetimeIndex(result[col], tz=tz) - return result.__finalize__(self) + new_values = values.transpose(axes_numbers) + if kwargs.pop('copy', None) or (len(args) and args[-1]): + new_values = new_values.copy() + + nv.validate_transpose_for_generic(self, kwargs) + result = self._constructor(new_values, **new_axes) - return self._constructor(new_values, **new_axes).__finalize__(self) + result = maybe_restore_dtypes(result, self) + return result.__finalize__(self) def swapaxes(self, axis1, axis2, copy=True): """ @@ -10771,6 +10739,35 @@ def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, return set_function_name(logical_func, name, cls) +def maybe_restore_dtypes(result, orig): + # GH#23730 + if orig.ndim != 2: + return result + + if orig.size == 0: + # ensure both orig.dtypes and result.dtypes have length >= 1 + return result + + if ((result.dtypes == np.object_).all() and + not (orig.dtypes == np.object_).any()): + # the transpose was lossy + if (orig.dtypes == orig.dtypes[0]).all(): + if is_datetime64tz_dtype(orig.dtypes[0]): + tz = orig.dtypes[0].tz + for col in result.columns: + result[col] = 
maybe_infer_to_datetimelike(result[col]) + if (is_datetime64_dtype(result[col]) and + isna(result[col]).all()): + # all-NaT gets inferred as tz-naive + result[col] = pd.DatetimeIndex(result[col], tz=tz) + + else: + # TODO: consider doing something useful in this case? + pass + + return result + + # install the indexes for _name, _indexer in indexing.get_indexers_list(): NDFrame._create_indexer(_name, _indexer) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c8fb203b16ec6..c0d7a42cddd32 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -783,8 +783,8 @@ def _interleave(self): elif is_extension_array_dtype(dtype): dtype = 'object' elif is_datetime64tz_dtype(dtype): - # TODO: we shouldn't be temporarily-dropping dtype information - dtype = 'M8[ns]' + # TODO: avoid this conversion by allowing 2D DatetimeArray + dtype = 'object' result = np.empty(self.shape, dtype=dtype) From ec4cfeb173ca7ccc3d9dbb3d8b03b4c8754f87c5 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 18 Nov 2018 13:32:57 -0800 Subject: [PATCH 4/4] xfail where appropriate --- pandas/tests/arithmetic/test_datetime64.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 1133e4d16aea5..c7e5b354c9177 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -824,6 +824,9 @@ def test_dt64arr_sub_datetime64_not_ns(self, box): def test_dt64arr_sub_timestamp(self, box_T_with_array): box, transpose = box_T_with_array + if box is tm.to_array: + pytest.xfail("DatetimeArray.__sub__ returns ndarray instead " + "of TimedeltaArray") ser = pd.date_range('2014-03-17', periods=2, freq='D', tz='US/Eastern') @@ -841,6 +844,9 @@ def test_dt64arr_sub_timestamp(self, box_T_with_array): def test_dt64arr_sub_NaT(self, box_T_with_array): # GH#18808 box, transpose = box_T_with_array + if box is 
tm.to_array: + pytest.xfail("DatetimeArray.__sub__ returns ndarray instead " + "of TimedeltaArray") dti = pd.DatetimeIndex([pd.NaT, pd.Timestamp('19900315')]) ser = tm.box_expected(dti, box, transpose=transpose)