From 381b07309a8c6aa60d3862747302e4810b66c05f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 22 Oct 2018 15:35:29 -0500 Subject: [PATCH 1/8] Preserve EA dtype in DataFrame.stack --- doc/source/whatsnew/v0.24.0.txt | 2 ++ pandas/core/reshape/reshape.py | 9 ++++++++- pandas/tests/extension/base/reshaping.py | 8 ++++++++ pandas/tests/frame/test_reshape.py | 11 +++++++++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index d0aa156cf5059..5aa5088d5028b 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -724,6 +724,8 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) - :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). - Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) +- :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`). + .. _whatsnew_0240.api.incompatibilities: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 03b77f0e787f0..d4fc3ed66bce6 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -470,8 +470,15 @@ def factorize(index): if is_extension_array_dtype(dtype): arr = dtype.construct_array_type() new_values = arr._concat_same_type([ - col for _, col in frame.iteritems() + col._values for _, col in frame.iteritems() ]) + # final take to get the order correct. + # idx is an indexer like + # [c0r0, c1r0, c2r0, ..., + # c0r1, c1r1, c241, ...] 
+ idx = np.arange(N * K).reshape(K, N).T.ravel() + new_values = new_values.take(idx) + else: # homogeneous, non-EA new_values = frame.values.ravel() diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 7f13c2cd67373..d985bd6785715 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -170,3 +170,11 @@ def test_merge(self, data, na_value): [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + + def test_stack(self, data): + df = pd.DataFrame({"A": data[:5], "B": data[:5]}) + result = df.stack() + assert result.dtype == df.A.dtype + result = result.astype(object) + expected = df.astype(object).stack() + self.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 9f6735c7ba2bf..4ddd3141b5f8b 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -872,6 +872,17 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): tm.assert_series_equal(result, expected) + def test_stack_preserve_categorical_dtype_values(self): + # GH-23077 + cat = pd.Categorical(['a', 'a', 'b', 'c']) + df = pd.DataFrame({"A": cat, "B": cat}) + result = df.stack() + index = pd.MultiIndex.from_product([[0, 1, 2, 3], ['A', 'B']]) + expected = pd.Series(pd.Categorical(['a', 'a', 'a', 'a', + 'b', 'b', 'c', 'c']), + index=index) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("level", [0, 'baz']) def test_unstack_swaplevel_sortlevel(self, level): # GH 20994 From 428f2308b67324aeda125be085bb6387b77037fd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 23 Oct 2018 11:29:34 -0500 Subject: [PATCH 2/8] sparse --- pandas/core/internals/blocks.py | 5 ++++- pandas/tests/sparse/frame/test_frame.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git 
a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6fb1184f48b69..60200d7c3cdfa 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -36,6 +36,7 @@ is_list_like, is_re, is_re_compilable, + is_sparse, pandas_dtype) from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, @@ -633,7 +634,9 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, return self if klass is None: - if dtype == np.object_: + # sparse is "special" and preserves sparsity. + # We're changing this in GH-23125 + if dtype == np.object_ and is_sparse(values): klass = ObjectBlock elif is_extension_array_dtype(dtype): klass = ExtensionBlock diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 03143488c3874..10074a2e5ad99 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -736,6 +736,16 @@ def test_astype_bool(self): assert res['A'].dtype == SparseDtype(np.bool) assert res['B'].dtype == SparseDtype(np.bool) + def test_astype_object(self): + # This may change in GH-23125 + df = pd.DataFrame({"A": SparseArray([0, 1]), + "B": SparseArray([0, 1])}) + result = df.astype(object) + dtype = SparseDtype(object, 0) + expected = pd.DataFrame({"A": SparseArray([0, 1], dtype=dtype), + "B": SparseArray([0, 1], dtype=dtype)}) + tm.assert_frame_equal(result, expected) + def test_fillna(self, float_frame_fill0, float_frame_fill0_dense): df = float_frame_fill0.reindex(lrange(5)) dense = float_frame_fill0_dense.reindex(lrange(5)) From 0d39be0ec5028c058164ac5f7f162cb8f2b4f793 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 Oct 2018 06:34:15 -0500 Subject: [PATCH 3/8] multi test --- pandas/tests/extension/base/reshaping.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index d985bd6785715..113daeb0d5359 100644 --- 
a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -171,10 +171,21 @@ def test_merge(self, data, na_value): dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) - def test_stack(self, data): + @pytest.mark.parametrize("columns", [ + ["A", "B"], + pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')], + names=['outer', 'inner']), + ]) + def test_stack(self, data, columns): df = pd.DataFrame({"A": data[:5], "B": data[:5]}) + df.columns = columns result = df.stack() - assert result.dtype == df.A.dtype - result = result.astype(object) expected = df.astype(object).stack() - self.assert_series_equal(result, expected) + + if isinstance(expected, pd.Series): + assert result.dtype == df.iloc[:, 0].dtype + else: + assert all(result.dtypes == df.iloc[:, 0].dtype) + + result = result.astype(object) + self.assert_equal(result, expected) From 7e9224a9b56876fd5e9024fabd6102b8b71bd4f8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 Oct 2018 15:27:52 -0500 Subject: [PATCH 4/8] multiple columns --- pandas/core/internals/blocks.py | 7 ++++--- pandas/core/reshape/reshape.py | 25 +++++++++++++++++++++--- pandas/tests/extension/json/test_json.py | 9 ++++++++- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c346252666173..18ec046d0e614 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -633,9 +633,10 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, return self if klass is None: - # sparse is "special" and preserves sparsity. - # We're changing this in GH-23125 - if dtype == np.object_ and is_sparse(values): + if is_sparse(self.values): + # Series[Sparse].astype(object) is sparse. 
+ klass = ExtensionBlock + elif is_object_dtype(dtype): klass = ObjectBlock elif is_extension_array_dtype(dtype): klass = ExtensionBlock diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d4fc3ed66bce6..496a946b0bcc3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -475,7 +475,7 @@ def factorize(index): # final take to get the order correct. # idx is an indexer like # [c0r0, c1r0, c2r0, ..., - # c0r1, c1r1, c241, ...] + # c0r1, c1r1, c2r1, ...] idx = np.arange(N * K).reshape(K, N).T.ravel() new_values = new_values.take(idx) @@ -603,20 +603,39 @@ def _convert_level_number(level_num, columns): # indexer if not isinstance(loc, slice): slice_len = len(loc) + locs = list(loc) else: slice_len = loc.stop - loc.start + locs = list(range(loc.start, loc.stop)) if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: - if frame._is_mixed_type: + if (frame._is_homogeneous_type and + is_extension_array_dtype(frame.dtypes.iloc[0])): + import pdb; pdb.set_trace() + dtype = this.loc[:, this.columns[loc]].dtypes.iloc[0] + subset = this.loc[:, this.columns[loc]] + + value_slice = dtype.construct_array_type()._concat_same_type( + [x._values for _, x in subset.iteritems()] + ) + N, K = this.shape + idx = np.arange(N * K).reshape(K, N).T.ravel() + value_slice = value_slice.take(idx) + + elif frame._is_mixed_type: value_slice = this.loc[:, this.columns[loc]].values else: value_slice = this.values[:, loc] - new_data[key] = value_slice.ravel() + if value_slice.ndim > 1: + # i.e. 
not extension + value_slice = value_slice.ravel() + + new_data[key] = value_slice if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 15d99f6c5d2fc..15aebf3e17ecb 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -138,7 +138,14 @@ def test_from_dtype(self, data): class TestReshaping(BaseJSON, base.BaseReshapingTests): - pass + + @pytest.mark.skip(reason="Different definitions of NA") + def test_stack(self): + """ + The test does .astype(object).stack(). If we happen to have + any missing values in `data`, then we'll end up with different + rows since we consider `{}` NA, but `.astype(object)` doesn't. + """ class TestGetitem(BaseJSON, base.BaseGetitemTests): From d6661cb6490916b808b2f9c0ac650bec56ad8cf7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 Oct 2018 16:12:29 -0500 Subject: [PATCH 5/8] remove pdb --- pandas/core/reshape/reshape.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 496a946b0bcc3..6dbf7765b68d5 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -603,10 +603,8 @@ def _convert_level_number(level_num, columns): # indexer if not isinstance(loc, slice): slice_len = len(loc) - locs = list(loc) else: slice_len = loc.stop - loc.start - locs = list(range(loc.start, loc.stop)) if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] @@ -615,7 +613,6 @@ def _convert_level_number(level_num, columns): else: if (frame._is_homogeneous_type and is_extension_array_dtype(frame.dtypes.iloc[0])): - import pdb; pdb.set_trace() dtype = this.loc[:, this.columns[loc]].dtypes.iloc[0] subset = this.loc[:, this.columns[loc]] From 144d11748421c9a0d346696735180751014b2c93 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Nov 2018 10:00:57 -0600 Subject: [PATCH 6/8] 
really object --- pandas/tests/extension/base/reshaping.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 32645f1da9397..a96dc9977a413 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -181,6 +181,9 @@ def test_stack(self, data, columns): df.columns = columns result = df.stack() expected = df.astype(object).stack() + # we need a second astype(object), in case the constructor inferred + # object -> specialized, as is done for period. + expected = expected.astype(object) if isinstance(expected, pd.Series): assert result.dtype == df.iloc[:, 0].dtype From 98f75c917130db8810ac3a46f39a2caa686ec1b2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Nov 2018 10:01:49 -0600 Subject: [PATCH 7/8] remove loc --- pandas/core/reshape/reshape.py | 51 ++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index a375b7e7f4529..c3add07fbb831 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -467,13 +467,7 @@ def factorize(index): new_values = arr._concat_same_type([ col._values for _, col in frame.iteritems() ]) - # final take to get the order correct. - # idx is an indexer like - # [c0r0, c1r0, c2r0, ..., - # c0r1, c1r1, c2r1, ...] 
- idx = np.arange(N * K).reshape(K, N).T.ravel() - new_values = new_values.take(idx) - + new_values = _reorder_for_extension_array_stack(new_values, N, K) else: # homogeneous, non-EA new_values = frame.values.ravel() @@ -602,14 +596,14 @@ def _convert_level_number(level_num, columns): slice_len = loc.stop - loc.start if slice_len != levsize: - chunk = this.loc[:, this.columns[loc]] + chunk = this[this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if (frame._is_homogeneous_type and is_extension_array_dtype(frame.dtypes.iloc[0])): - dtype = this.loc[:, this.columns[loc]].dtypes.iloc[0] - subset = this.loc[:, this.columns[loc]] + dtype = this[this.columns[loc]].dtypes.iloc[0] + subset = this[this.columns[loc]] value_slice = dtype.construct_array_type()._concat_same_type( [x._values for _, x in subset.iteritems()] @@ -619,7 +613,7 @@ def _convert_level_number(level_num, columns): value_slice = value_slice.take(idx) elif frame._is_mixed_type: - value_slice = this.loc[:, this.columns[loc]].values + value_slice = this[this.columns[loc]].values else: value_slice = this.values[:, loc] @@ -965,3 +959,38 @@ def make_axis_dummies(frame, axis='minor', transform=None): values = values.take(labels, axis=0) return DataFrame(values, columns=items, index=frame.index) + + +def _reorder_for_extension_array_stack(arr, n_rows, n_columns): + """ + Re-orders the values when stacking multiple extension-arrays. + + The indirect stacking method used for EAs requires a followup + take to get the order correct. + + Parameters + ---------- + arr : ExtensionArray + n_rows, n_columns : int + The number of rows and columns in the original DataFrame. 
+ + Returns + ------- + taken : ExtensionArray + The original `arr` with elements re-ordered appropriately + + Examples + -------- + >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f']) + >>> _reorder_for_extension_array_stack(arr, 2, 3) + array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1') + + >>> _reorder_for_extension_array_stack(arr, 3, 2) + array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1') + """ + # final take to get the order correct. + # idx is an indexer like + # [c0r0, c1r0, c2r0, ..., + # c0r1, c1r1, c2r1, ...] + idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() + return arr.take(idx) From: Tom Augspurger Date: Thu, 8 Nov 2018 06:26:13 -0600 Subject: [PATCH 8/8] lint --- pandas/core/internals/blocks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1b05b9cf02827..1f2a1ee52159e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -35,7 +35,6 @@ is_numeric_v_string_like, is_extension_type, is_extension_array_dtype, is_list_like, - is_sparse, is_re, is_re_compilable, is_sparse,