From b85bdb95fd551453a39a33815f4af3e349e8beae Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sat, 17 Nov 2018 10:51:41 -0800 Subject: [PATCH 01/16] BUG-23744 DataFrame.apply keeps dtype sparseness --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/apply.py | 13 ++++++++++--- pandas/tests/frame/test_apply.py | 10 ++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index b5d0532c6dfa3..e33aab0f6be45 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1259,6 +1259,7 @@ Numeric - Bug in :meth:`Series.rpow` with object dtype ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`). - :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`) - Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``pct=True`` and more than 2:sup:`24` rows are present resulted in percentages greater than 1.0 (:issue:`18271`) +- Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`) Strings ^^^^^^^ diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 40cd952a62138..91e975ec3442b 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -7,7 +7,8 @@ is_extension_type, is_dict_like, is_list_like, - is_sequence) + is_sequence, + is_sparse) from pandas.util._decorators import cache_readonly from pandas.io.formats.printing import pprint_thing @@ -133,8 +134,14 @@ def get_result(self): elif isinstance(self.f, np.ufunc): with np.errstate(all='ignore'): results = self.f(self.values) - return self.obj._constructor(data=results, index=self.index, - columns=self.columns, copy=False) + result = self.obj._constructor(data=results, index=self.index, + columns=self.columns, copy=False) + for col in range(self.obj.shape[1]): + if is_sparse(self.obj.dtypes.values[col]): + fill = self.f(self.obj.dtypes.values[col].fill_value) + sparse_col = result.iloc[:, col].to_sparse(fill_value=fill) + result.iloc[:, col] = sparse_col + return result # broadcasting if self.result_type == 'broadcast': diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index c43872bfc3ddb..799694f54aad6 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -570,6 +570,16 @@ def test_apply_dup_names_multi_agg(self): tm.assert_frame_equal(result, expected) + def test_apply_keep_sparse_dtype(self): + # GH 23744 + df = pd.SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), + columns=['a', 'b', 'c'], default_fill_value=1) + df2 = pd.DataFrame(df) + + df = df.apply(np.exp) + df2 = df2.apply(np.exp) + tm.assert_frame_equal(df, df2) + class TestInferOutputShape(object): # the user has supplied an opaque UDF where From ad33f76327db6dccffd7f2110f9189c1cc5f54f7 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sat, 17 Nov 2018 12:15:36 -0800 Subject: [PATCH 02/16] BUG-23744 Fix memory usage --- pandas/core/apply.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 91e975ec3442b..cfe65f248c6c4 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -132,15 +132,12 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): - with np.errstate(all='ignore'): - results = self.f(self.values) - result = self.obj._constructor(data=results, index=self.index, - columns=self.columns, copy=False) - for col in range(self.obj.shape[1]): - if is_sparse(self.obj.dtypes.values[col]): - fill = self.f(self.obj.dtypes.values[col].fill_value) - sparse_col = result.iloc[:, col].to_sparse(fill_value=fill) - result.iloc[:, col] = sparse_col + result = self.obj._constructor(index=self.index, copy=False) + for col in self.columns: + if is_sparse(self.obj.dtypes[col]): + result[col] = self.f(self.obj[col].values) + else: + result[col] = self.f(self.obj[col]) return result # broadcasting From c39fe11d01f5f4eac8fe2af4d54e01b0cc952fbb Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sat, 17 Nov 2018 12:30:49 -0800 Subject: [PATCH 03/16] BUG-23744 Remove unnecessary check --- pandas/core/apply.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index cfe65f248c6c4..c765a0e0e7476 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -134,10 +134,7 @@ def get_result(self): elif isinstance(self.f, np.ufunc): result = self.obj._constructor(index=self.index, copy=False) for col in self.columns: - if is_sparse(self.obj.dtypes[col]): - result[col] = self.f(self.obj[col].values) - else: - result[col] = self.f(self.obj[col]) + result[col] = self.f(self.obj[col].values) return result # broadcasting From 4aba3f8bad6c895b16c81d1bace82a0ad55a48fc Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sat, 17 Nov 2018 13:06:59 -0800 Subject: [PATCH 04/16] BUG-23744 fix import lint --- pandas/core/apply.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c765a0e0e7476..bfef255bbf8c7 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -7,8 +7,7 @@ is_extension_type, is_dict_like, is_list_like, - is_sequence, - is_sparse) + is_sequence) from pandas.util._decorators import cache_readonly from pandas.io.formats.printing import pprint_thing From bcdf01b42326269e53420fab917115da7c944610 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sat, 17 Nov 2018 15:24:36 -0800 Subject: [PATCH 05/16] BUG-23744 fix test --- pandas/core/apply.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index bfef255bbf8c7..32b961eae0889 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -133,7 +133,8 @@ def get_result(self): elif isinstance(self.f, np.ufunc): result = self.obj._constructor(index=self.index, copy=False) for col in self.columns: - result[col] = self.f(self.obj[col].values) + with np.errstate(all='ignore'): + result[col] = self.f(self.obj[col].values) return result # broadcasting From 99c8796183604622350e355f45d628b6c7872c2f Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Mon, 19 Nov 2018 14:37:31 -0800 Subject: [PATCH 06/16] BUG-23744 move test and avoid inefficiency --- pandas/core/apply.py | 23 +++++++++++++++++------ pandas/tests/frame/test_apply.py | 10 ---------- pandas/tests/sparse/frame/test_apply.py | 11 +++++++++++ 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 32b961eae0889..792f69448af9c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -7,7 +7,8 @@ is_extension_type, is_dict_like, is_list_like, - is_sequence) + is_sequence, + is_sparse) from pandas.util._decorators import cache_readonly from pandas.io.formats.printing import pprint_thing @@ -131,11 +132,21 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): - result = self.obj._constructor(index=self.index, copy=False) - for col in self.columns: - with np.errstate(all='ignore'): - result[col] = self.f(self.obj[col].values) - return result + for dtype in self.obj.dtypes: + # Column-by-column construction is slow, so only use + # when necessary (e.g. to preserve special dtypes) + if is_sparse(dtype): # GH 23744 + result = self.obj._constructor(index=self.index, + copy=False) + with np.errstate(all='ignore'): + for col in self.columns: + result[col] = self.f(self.obj[col].values) + return result + + with np.errstate(all='ignore'): + results = self.f(self.values) + return self.obj._constructor(data=results, index=self.index, + columns=self.columns, copy=False) # broadcasting if self.result_type == 'broadcast': diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 799694f54aad6..c43872bfc3ddb 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -570,16 +570,6 @@ def test_apply_dup_names_multi_agg(self): tm.assert_frame_equal(result, expected) - def test_apply_keep_sparse_dtype(self): - # GH 23744 - df = pd.SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), - columns=['a', 'b', 'c'], default_fill_value=1) - df2 = pd.DataFrame(df) - - df = df.apply(np.exp) - df2 = df2.apply(np.exp) - tm.assert_frame_equal(df, df2) - class TestInferOutputShape(object): # the user has supplied an opaque UDF where diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py index 2d7a537f0fb3b..67ee6f1d5da35 100644 --- a/pandas/tests/sparse/frame/test_apply.py +++ b/pandas/tests/sparse/frame/test_apply.py @@ -91,3 +91,14 @@ def test_applymap(frame): # just test that it works result = frame.applymap(lambda x: x * 2) assert isinstance(result, SparseDataFrame) + + +def test_apply_keep_sparse_dtype(): + # GH 23744 + df = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), + columns=['a', 'b', 'c'], default_fill_value=1) + df2 = DataFrame(df) + + df = df.apply(np.exp) + df2 = df2.apply(np.exp) + tm.assert_frame_equal(df, df2) From de0ecf35365729beb54cc2ec0d95cd0d0b71c93f Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 22 Nov 2018 20:05:38 -0800 Subject: [PATCH 07/16] BUG-23744 make requested changes --- doc/source/whatsnew/v0.24.0.rst | 3 +-- pandas/core/apply.py | 19 +++++++++---------- pandas/tests/sparse/frame/test_apply.py | 12 ++++++------ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 134f4eb98c20d..062dd20a13ab0 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1275,7 +1275,6 @@ Numeric - Bug in :meth:`Series.rpow` with object dtype ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`). - :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`) - Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``pct=True`` and more than 2:sup:`24` rows are present resulted in percentages greater than 1.0 (:issue:`18271`) -- Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`) Strings ^^^^^^^ @@ -1461,6 +1460,7 @@ Sparse - Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) - Bug in :meth:`SparseArary.unique` not returning the unique values (:issue:`19595`) - Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`) +- Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`) Build Changes ^^^^^^^^^^^^^ @@ -1487,4 +1487,3 @@ Contributors ~~~~~~~~~~~~ .. contributors:: v0.23.4..HEAD - diff --git a/pandas/core/apply.py b/pandas/core/apply.py index aeee8023c20a1..192d7647935a4 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -131,16 +131,15 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): - for dtype in self.obj.dtypes: - # Column-by-column construction is slow, so only use - # when necessary (e.g. to preserve special dtypes) - if is_sparse(dtype): # GH 23744 - result = self.obj._constructor(index=self.index, - copy=False) - with np.errstate(all='ignore'): - for col in self.columns: - result[col] = self.f(self.obj[col].values) - return result + if any(is_sparse(dtype) for dtype in self.obj.dtypes): + # Column-by-column construction is slow, so only use when + # necessary (e.g. to preserve special dtypes) GH 23744 + result = self.obj._constructor(index=self.index, + copy=False) + with np.errstate(all='ignore'): + for col in self.columns: + result[col] = self.f(self.obj[col].values) + return result with np.errstate(all='ignore'): results = self.f(self.values) diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py index 67ee6f1d5da35..e8c42fe705f49 100644 --- a/pandas/tests/sparse/frame/test_apply.py +++ b/pandas/tests/sparse/frame/test_apply.py @@ -95,10 +95,10 @@ def test_applymap(frame): def test_apply_keep_sparse_dtype(): # GH 23744 - df = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), - columns=['a', 'b', 'c'], default_fill_value=1) - df2 = DataFrame(df) + expected = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), + columns=['a', 'b', 'c'], default_fill_value=1) + result = DataFrame(expected) - df = df.apply(np.exp) - df2 = df2.apply(np.exp) - tm.assert_frame_equal(df, df2) + expected = expected.apply(np.exp) + result = result.apply(np.exp) + tm.assert_frame_equal(expected, result) From 491b908cdc3dcd98c592bcf7c3050abc6a334a20 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 22 Nov 2018 20:05:38 -0800 Subject: [PATCH 08/16] BUG-23744 make requested changes --- doc/source/whatsnew/v0.24.0.rst | 3 +-- pandas/core/apply.py | 19 +++++++++---------- pandas/tests/sparse/frame/test_apply.py | 12 ++++++------ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 134f4eb98c20d..062dd20a13ab0 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1275,7 +1275,6 @@ Numeric - Bug in :meth:`Series.rpow` with object dtype ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`). - :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`) - Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``pct=True`` and more than 2:sup:`24` rows are present resulted in percentages greater than 1.0 (:issue:`18271`) -- Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`) Strings ^^^^^^^ @@ -1461,6 +1460,7 @@ Sparse - Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) - Bug in :meth:`SparseArary.unique` not returning the unique values (:issue:`19595`) - Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`) +- Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`) Build Changes ^^^^^^^^^^^^^ @@ -1487,4 +1487,3 @@ Contributors ~~~~~~~~~~~~ .. contributors:: v0.23.4..HEAD - diff --git a/pandas/core/apply.py b/pandas/core/apply.py index aeee8023c20a1..192d7647935a4 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -131,16 +131,15 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): - for dtype in self.obj.dtypes: - # Column-by-column construction is slow, so only use - # when necessary (e.g. to preserve special dtypes) - if is_sparse(dtype): # GH 23744 - result = self.obj._constructor(index=self.index, - copy=False) - with np.errstate(all='ignore'): - for col in self.columns: - result[col] = self.f(self.obj[col].values) - return result + if any(is_sparse(dtype) for dtype in self.obj.dtypes): + # Column-by-column construction is slow, so only use when + # necessary (e.g. to preserve special dtypes) GH 23744 + result = self.obj._constructor(index=self.index, + copy=False) + with np.errstate(all='ignore'): + for col in self.columns: + result[col] = self.f(self.obj[col].values) + return result with np.errstate(all='ignore'): results = self.f(self.values) diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py index 67ee6f1d5da35..e8c42fe705f49 100644 --- a/pandas/tests/sparse/frame/test_apply.py +++ b/pandas/tests/sparse/frame/test_apply.py @@ -95,10 +95,10 @@ def test_applymap(frame): def test_apply_keep_sparse_dtype(): # GH 23744 - df = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), - columns=['a', 'b', 'c'], default_fill_value=1) - df2 = DataFrame(df) + expected = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), + columns=['a', 'b', 'c'], default_fill_value=1) + result = DataFrame(expected) - df = df.apply(np.exp) - df2 = df2.apply(np.exp) - tm.assert_frame_equal(df, df2) + expected = expected.apply(np.exp) + result = result.apply(np.exp) + tm.assert_frame_equal(expected, result) From bca539f16a4b6ec2c21c90f388991bd130a1e2c3 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 23 Nov 2018 15:14:23 -0800 Subject: [PATCH 09/16] BUG-23744 use list comprehension --- pandas/core/apply.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 192d7647935a4..960b32c0f3101 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -9,6 +9,7 @@ from pandas.core.dtypes.common import ( is_dict_like, is_extension_type, is_list_like, is_sequence, is_sparse) from pandas.core.dtypes.generic import ABCSeries +from pandas import concat from pandas.io.formats.printing import pprint_thing @@ -131,20 +132,13 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): - if any(is_sparse(dtype) for dtype in self.obj.dtypes): - # Column-by-column construction is slow, so only use when - # necessary (e.g. to preserve special dtypes) GH 23744 - result = self.obj._constructor(index=self.index, - copy=False) - with np.errstate(all='ignore'): - for col in self.columns: - result[col] = self.f(self.obj[col].values) - return result - - with np.errstate(all='ignore'): - results = self.f(self.values) - return self.obj._constructor(data=results, index=self.index, - columns=self.columns, copy=False) + result = [self.f(self.obj[col]) + .to_sparse( + fill_value=self.f(self.obj.dtypes[col].fill_value)) + if is_sparse(self.obj.dtypes[col]) + else self.f(self.obj[col]) + for col in self.columns] + return concat(result, axis=1, copy=False).set_index(self.index) # broadcasting if self.result_type == 'broadcast': From d8670ef30a861b9a342a468eb224ad2cebbc0a1d Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 23 Nov 2018 15:49:26 -0800 Subject: [PATCH 10/16] BUG-23744 use for loop instead --- pandas/core/apply.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 960b32c0f3101..551c7e2bfc6c8 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -132,13 +132,15 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): - result = [self.f(self.obj[col]) - .to_sparse( - fill_value=self.f(self.obj.dtypes[col].fill_value)) - if is_sparse(self.obj.dtypes[col]) - else self.f(self.obj[col]) - for col in self.columns] - return concat(result, axis=1, copy=False).set_index(self.index) + results = [] + for col in self.columns: + if is_sparse(self.obj.dtypes[col]): + fill = self.f(self.obj.dtypes[col].fill_value) + result = self.f(self.obj[col]).to_sparse(fill_value=fill) + else: + result = self.f(self.obj[col]) + results.append(result) + return concat(results, axis=1, copy=False).set_index(self.index) # broadcasting if self.result_type == 'broadcast': From c15afe3b19d2083fb46e71537fce259c54ffcee1 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 23 Nov 2018 19:09:15 -0800 Subject: [PATCH 11/16] BUG-23744 fix test --- pandas/core/apply.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 551c7e2bfc6c8..0d1e3edd964dc 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -133,13 +133,15 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): results = [] - for col in self.columns: - if is_sparse(self.obj.dtypes[col]): - fill = self.f(self.obj.dtypes[col].fill_value) - result = self.f(self.obj[col]).to_sparse(fill_value=fill) - else: - result = self.f(self.obj[col]) - results.append(result) + with np.errstate(all='ignore'): + for col in self.columns: + if is_sparse(self.obj.dtypes[col]): + fill = self.f(self.obj.dtypes[col].fill_value) + result = self.f(self.obj[col]) + result = result.to_sparse(fill_value=fill) + else: + result = self.f(self.obj[col]) + results.append(result) return concat(results, axis=1, copy=False).set_index(self.index) # broadcasting From b4ab44b8f9669056beebb3bc8d1126a3b5c75302 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Fri, 23 Nov 2018 20:03:55 -0800 Subject: [PATCH 12/16] BUG-23744 fix other test --- pandas/core/apply.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 0d1e3edd964dc..0a45acd5fd38d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -9,6 +9,7 @@ from pandas.core.dtypes.common import ( is_dict_like, is_extension_type, is_list_like, is_sequence, is_sparse) from pandas.core.dtypes.generic import ABCSeries + from pandas import concat from pandas.io.formats.printing import pprint_thing From d153f74708ab76a1e41944dde650df6ccbe52a58 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 25 Nov 2018 15:26:34 -0800 Subject: [PATCH 13/16] BUG-23744 use constructor properly --- pandas/core/apply.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 0a45acd5fd38d..2a3aedc88a385 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -10,8 +10,6 @@ is_dict_like, is_extension_type, is_list_like, is_sequence, is_sparse) from pandas.core.dtypes.generic import ABCSeries -from pandas import concat - from pandas.io.formats.printing import pprint_thing @@ -133,7 +131,7 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): - results = [] + results = {} with np.errstate(all='ignore'): for col in self.columns: if is_sparse(self.obj.dtypes[col]): @@ -142,8 +140,9 @@ def get_result(self): result = result.to_sparse(fill_value=fill) else: result = self.f(self.obj[col]) - results.append(result) - return concat(results, axis=1, copy=False).set_index(self.index) + results[col] = result + return self.obj._constructor(data=results, index=self.index, + copy=False) # broadcasting if self.result_type == 'broadcast': From d6e22a8b34691f1f87c08876d7777ddf3351362a Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Mon, 26 Nov 2018 09:36:18 -0800 Subject: [PATCH 14/16] BUG-23744 use block apply --- pandas/core/apply.py | 14 +++----------- pandas/tests/sparse/frame/test_apply.py | 2 +- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2a3aedc88a385..5658094ec36c6 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -7,7 +7,7 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - is_dict_like, is_extension_type, is_list_like, is_sequence, is_sparse) + is_dict_like, is_extension_type, is_list_like, is_sequence) from pandas.core.dtypes.generic import ABCSeries from pandas.io.formats.printing import pprint_thing @@ -131,18 +131,10 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): - results = {} with np.errstate(all='ignore'): - for col in self.columns: - if is_sparse(self.obj.dtypes[col]): - fill = self.f(self.obj.dtypes[col].fill_value) - result = self.f(self.obj[col]) - result = result.to_sparse(fill_value=fill) - else: - result = self.f(self.obj[col]) - results[col] = result + results = self.obj._data.apply('apply', func=self.f) return self.obj._constructor(data=results, index=self.index, - copy=False) + columns=self.columns, copy=False) # broadcasting if self.result_type == 'broadcast': diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py index e8c42fe705f49..42c6c908207ed 100644 --- a/pandas/tests/sparse/frame/test_apply.py +++ b/pandas/tests/sparse/frame/test_apply.py @@ -96,7 +96,7 @@ def test_applymap(frame): def test_apply_keep_sparse_dtype(): # GH 23744 expected = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), - columns=['a', 'b', 'c'], default_fill_value=1) + columns=['b', 'a', 'c'], default_fill_value=1) result = DataFrame(expected) expected = expected.apply(np.exp) From 8f151dc91e799087d10281ff875660fa1025f16a Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Mon, 26 Nov 2018 19:23:47 -0800 Subject: [PATCH 15/16] BUG-23744 clarify test --- pandas/tests/sparse/frame/test_apply.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py index 42c6c908207ed..c26776ac4fd49 100644 --- a/pandas/tests/sparse/frame/test_apply.py +++ b/pandas/tests/sparse/frame/test_apply.py @@ -95,10 +95,10 @@ def test_applymap(frame): def test_apply_keep_sparse_dtype(): # GH 23744 - expected = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), - columns=['b', 'a', 'c'], default_fill_value=1) - result = DataFrame(expected) + sdf = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), + columns=['b', 'a', 'c'], default_fill_value=1) + df = DataFrame(sdf) - expected = expected.apply(np.exp) - result = result.apply(np.exp) + expected = sdf.apply(np.exp) + result = df.apply(np.exp) tm.assert_frame_equal(expected, result) From be8750febc38606a2c6c3ffa7b7aa3857053d893 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Mon, 26 Nov 2018 19:23:47 -0800 Subject: [PATCH 16/16] BUG-23744 clarify test --- pandas/tests/sparse/frame/test_apply.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py index 42c6c908207ed..c26776ac4fd49 100644 --- a/pandas/tests/sparse/frame/test_apply.py +++ b/pandas/tests/sparse/frame/test_apply.py @@ -95,10 +95,10 @@ def test_applymap(frame): def test_apply_keep_sparse_dtype(): # GH 23744 - expected = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), - columns=['b', 'a', 'c'], default_fill_value=1) - result = DataFrame(expected) + sdf = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]), + columns=['b', 'a', 'c'], default_fill_value=1) + df = DataFrame(sdf) - expected = expected.apply(np.exp) - result = result.apply(np.exp) + expected = sdf.apply(np.exp) + result = df.apply(np.exp) tm.assert_frame_equal(expected, result)