From 5195dad542518f8c83da48bd7acfeaf41364ebf5 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 22 Jul 2018 13:03:48 -0700 Subject: [PATCH 01/20] avoid casting to object dtype in mixed-type frames --- pandas/core/frame.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4578d2ac08199..c6b7603496480 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4921,12 +4921,35 @@ def _arith_op(left, right): copy=False) def _combine_match_index(self, other, func, level=None): + assert isinstance(other, Series) left, right = self.align(other, join='outer', axis=0, level=level, copy=False) - new_data = func(left.values.T, right.values).T - return self._constructor(new_data, - index=left.index, columns=self.columns, - copy=False) + assert left.index.equals(right.index) + + if left._is_mixed_type or right._is_mixed_type: + if self.columns.is_unique: + new_data = {col: func(left[col], right) + for col in left.columns} + result = self._constructor(new_data, + index=left.index, + columns=left.columns, + copy=False) + return result + else: + new_data = [func(left.iloc[:, idx], right) + for idx in range(len(left.columns))] + result = self._constructor(new_data, + index=left.index, + copy=False) + result.columns = left.columns + return result + + else: + # easy case, operate directly on values + result = func(left.values.T, right.values).T + return self._constructor(result, + index=left.index, columns=self.columns, + copy=False) def _combine_match_columns(self, other, func, level=None, try_cast=True): left, right = self.align(other, join='outer', axis=1, level=level, From 1a6690695e20762efa25752b9649374eb70e1f1e Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 22 Jul 2018 14:51:40 -0700 Subject: [PATCH 02/20] Dispatch to Series ops in _combine_match_columns --- pandas/core/frame.py | 33 +++++++-- pandas/core/internals/__init__.py | 68 +++++++++++-------- pandas/core/sparse/frame.py | 29 +++----- .../tests/frame/test_axis_select_reindex.py | 2 +- pandas/tests/reshape/test_pivot.py | 8 ++- pandas/tests/series/test_operators.py | 7 +- 6 files changed, 88 insertions(+), 59 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c6b7603496480..54f6483d73f2c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -70,7 +70,8 @@ check_bool_indexer) from pandas.core.internals import (BlockManager, create_block_manager_from_arrays, - create_block_manager_from_blocks) + create_block_manager_from_blocks, + try_cast_result) from pandas.core.series import Series from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.algorithms as algorithms @@ -4929,7 +4930,7 @@ def _combine_match_index(self, other, func, level=None): if left._is_mixed_type or right._is_mixed_type: if self.columns.is_unique: new_data = {col: func(left[col], right) - for col in left.columns} + for col in left.columns} result = self._constructor(new_data, index=left.index, columns=left.columns, @@ -4937,7 +4938,7 @@ def _combine_match_index(self, other, func, level=None): return result else: new_data = [func(left.iloc[:, idx], right) - for idx in range(len(left.columns))] + for idx in range(len(left.columns))] result = self._constructor(new_data, index=left.index, copy=False) @@ -4952,13 +4953,31 @@ def _combine_match_index(self, other, func, level=None): copy=False) def _combine_match_columns(self, other, func, level=None, try_cast=True): + assert isinstance(other, Series) left, right = self.align(other, join='outer', axis=1, level=level, copy=False) + assert left.columns.equals(right.index), (left.columns, right.index) - new_data = left._data.eval(func=func, other=right, - axes=[left.columns, self.index], - try_cast=try_cast) - return self._constructor(new_data) + new_data = [func(left.iloc[:, n], right.iloc[n]) + for n in range(len(left.columns))] + + if try_cast: + new_data = [try_cast_result(left.iloc[:, n], new_data[n]) + for n in range(len(left.columns))] + + if left.columns.is_unique: + new_data = {left.columns[n]: new_data[n] + for n in range(len(left.columns))} + result = self._constructor(new_data, + index=left.index, columns=left.columns, + copy=False) + return result + + else: + new_data = {i: new_data[i] for i in range(len(new_data))} + result = self._constructor(new_data, index=left.index, copy=False) + result.columns = left.columns + return result def _combine_const(self, other, func, errors='raise', try_cast=True): new_data = self._data.eval(func=func, other=other, diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index fde3aaa14ac5d..0bbecf04acdc6 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -86,6 +86,45 @@ from pandas.compat import range, map, zip, u +def try_cast_result(left, result, dtype=None): + """ + Try to cast the result to our original type, we may have + roundtripped thru object in the mean-time + """ + if dtype is None: + dtype = left.dtype + + if (is_integer_dtype(left) or is_bool_dtype(left) or + is_datetime64_dtype(left) or is_datetime64tz_dtype(left)): + pass + elif is_object_dtype(left) and lib.infer_dtype(left) == 'boolean': + # disguised is_bool_dtype + pass + elif is_float_dtype(left) and result.dtype == left.dtype: + + # protect against a bool/object showing up here + if isinstance(dtype, compat.string_types) and dtype == 'infer': + return result + if not isinstance(dtype, type): + dtype = dtype.type + if issubclass(dtype, (np.bool_, np.object_)): + if issubclass(dtype, np.bool_): + if isna(result).all(): + return result.astype(np.bool_) + else: + result = result.astype(np.object_) + result[result == 1] = True + result[result == 0] = False + return result + else: + return result.astype(np.object_) + + return result + + # may need to change the dtype here + return maybe_downcast_to_dtype(result, dtype) + + class Block(PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas @@ -711,34 +750,7 @@ def _try_cast_result(self, result, dtype=None): """ try to cast the result to our original type, we may have roundtripped thru object in the mean-time """ - if dtype is None: - dtype = self.dtype - - if self.is_integer or self.is_bool or self.is_datetime: - pass - elif self.is_float and result.dtype == self.dtype: - - # protect against a bool/object showing up here - if isinstance(dtype, compat.string_types) and dtype == 'infer': - return result - if not isinstance(dtype, type): - dtype = dtype.type - if issubclass(dtype, (np.bool_, np.object_)): - if issubclass(dtype, np.bool_): - if isna(result).all(): - return result.astype(np.bool_) - else: - result = result.astype(np.object_) - result[result == 1] = True - result[result == 0] = False - return result - else: - return result.astype(np.object_) - - return result - - # may need to change the dtype here - return maybe_downcast_to_dtype(result, dtype) + return try_cast_result(self, result, dtype) def _try_coerce_args(self, values, other): """ provide coercion to our input arguments """ diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index f7071061d07ab..1134260173280 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -585,19 +585,14 @@ def _combine_match_index(self, other, func, level=None): if level is not None: raise NotImplementedError("'level' argument is not supported") - new_index = self.index.union(other.index) - this = self - if self.index is not new_index: - this = self.reindex(new_index) - - if other.index is not new_index: - other = other.reindex(new_index) + this, other = self.align(other, join='outer', axis=0, + level=level, copy=False) + new_data = {} for col, series in compat.iteritems(this): new_data[col] = func(series.values, other.values) # fill_value is a function of our operator - fill_value = None if isna(other.fill_value) or isna(self.default_fill_value): fill_value = np.nan else: @@ -605,7 +600,7 @@ def _combine_match_index(self, other, func, level=None): np.float64(other.fill_value)) return self._constructor( - new_data, index=new_index, columns=self.columns, + new_data, index=this.index, columns=this.columns, default_fill_value=fill_value).__finalize__(self) def _combine_match_columns(self, other, func, level=None, try_cast=True): @@ -617,19 +612,17 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): if level is not None: raise NotImplementedError("'level' argument is not supported") - new_data = {} - - union = intersection = self.columns - - if not union.equals(other.index): - union = other.index.union(self.columns) - intersection = other.index.intersection(self.columns) + left, right = self.align(other, join='outer', axis=1, + level=level, copy=False) + new_index, new_columns = left.index, left.columns - for col in intersection: + new_data = {} + for col in other.index.intersection(self.columns): new_data[col] = func(self[col], float(other[col])) + # TODO: Why are we casting other[col] to float? return self._constructor( - new_data, index=self.index, columns=union, + new_data, index=self.index, columns=new_columns, default_fill_value=self.default_fill_value).__finalize__(self) def _combine_const(self, other, func, errors='raise', try_cast=True): diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 004fb4eb0c128..e47ae5f7a8f4d 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -738,7 +738,7 @@ def test_align_int_fill_bug(self): result = df1 - df1.mean() expected = df2 - df2.mean() - assert_frame_equal(result, expected) + assert_frame_equal(result.astype('f8'), expected) def test_align_multiindex(self): # GH 10665 diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 7e7e081408534..bb464dec2c65f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1565,8 +1565,9 @@ def test_crosstab_normalize(self): full_normal) tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'), row_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'), - col_normal) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize='columns').astype('f8'), + col_normal) tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1), pd.crosstab(df.a, df.b, normalize='columns')) tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0), @@ -1599,7 +1600,8 @@ def test_crosstab_normalize(self): tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index', margins=True), row_normal_margins) tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns', - margins=True), col_normal_margins) + margins=True).astype('f8'), + col_normal_margins) tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index ecb74622edf10..e2f646bb6f551 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1547,8 +1547,11 @@ def tester(a, b): # this is an alignment issue; these are equivalent # https://github.com/pandas-dev/pandas/issues/5284 - pytest.raises(ValueError, lambda: d.__and__(s, axis='columns')) - pytest.raises(ValueError, tester, s, d) + with pytest.raises(TypeError): + d.__and__(s, axis='columns') + + with pytest.raises(TypeError): + tester(s, d) # this is wrong as its not a boolean result # result = d.__and__(s,axis='index') From a45abecdde40e9cb7c5dca18c965fc3be04585af Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 22 Jul 2018 15:34:56 -0700 Subject: [PATCH 03/20] comment --- pandas/core/frame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 54f6483d73f2c..b3a3c3971b9ae 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4953,6 +4953,8 @@ def _combine_match_index(self, other, func, level=None): copy=False) def _combine_match_columns(self, other, func, level=None, try_cast=True): + # TODO: `func` passed here is wrapped in core.ops; if we are + # dispatching to Series implementation, should we pass unwrapped? assert isinstance(other, Series) left, right = self.align(other, join='outer', axis=1, level=level, copy=False) From 8594c48051ff5992a6e5ef9f5386fc66ae772aaf Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 22 Jul 2018 15:37:11 -0700 Subject: [PATCH 04/20] docstring --- pandas/core/internals/__init__.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 0bbecf04acdc6..cab767e3a3e95 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -88,8 +88,18 @@ def try_cast_result(left, result, dtype=None): """ - Try to cast the result to our original type, we may have - roundtripped thru object in the mean-time + Try to cast the result to the original dtype for `left`; we may have + roundtripped thru object in the mean-time. + + Parameters + ---------- + left : array-like + result : array-like + dtype : np.dtype, pd.dtype, or None (default None) + + Returns + ------- + maybe_casted : same type as `result` """ if dtype is None: dtype = left.dtype From 108550e3a1a98c133c1986d43311bc0ee6db8d03 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 22 Jul 2018 16:57:16 -0700 Subject: [PATCH 05/20] flake8 fixup --- pandas/core/sparse/frame.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 1134260173280..bb024db47b503 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -614,7 +614,6 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): left, right = self.align(other, join='outer', axis=1, level=level, copy=False) - new_index, new_columns = left.index, left.columns new_data = {} for col in other.index.intersection(self.columns): @@ -622,7 +621,7 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): # TODO: Why are we casting other[col] to float? return self._constructor( - new_data, index=self.index, columns=new_columns, + new_data, index=left.index, columns=left.columns, default_fill_value=self.default_fill_value).__finalize__(self) def _combine_const(self, other, func, errors='raise', try_cast=True): From 323f45eee093adcf9103b488d4ae05b7f8649390 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 24 Jul 2018 17:35:20 -0700 Subject: [PATCH 06/20] dont bother with try_cast_result --- pandas/core/frame.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c6dcad69a158a..65cdfd479705a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -70,8 +70,7 @@ check_bool_indexer) from pandas.core.internals import (BlockManager, create_block_manager_from_arrays, - create_block_manager_from_blocks, - try_cast_result) + create_block_manager_from_blocks) from pandas.core.series import Series from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.algorithms as algorithms @@ -4964,10 +4963,6 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): new_data = [func(left.iloc[:, n], right.iloc[n]) for n in range(len(left.columns))] - if try_cast: - new_data = [try_cast_result(left.iloc[:, n], new_data[n]) - for n in range(len(left.columns))] - if left.columns.is_unique: new_data = {left.columns[n]: new_data[n] for n in range(len(left.columns))} From a0708d160f2df15f75e9d32fd3b8909d81c2d372 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 24 Jul 2018 17:36:59 -0700 Subject: [PATCH 07/20] revert non-central change --- pandas/core/frame.py | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2b03a2f38079f..5bef50320e4fc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4926,31 +4926,10 @@ def _combine_match_index(self, other, func, level=None): left, right = self.align(other, join='outer', axis=0, level=level, copy=False) assert left.index.equals(right.index) - - if left._is_mixed_type or right._is_mixed_type: - if self.columns.is_unique: - new_data = {col: func(left[col], right) - for col in left.columns} - result = self._constructor(new_data, - index=left.index, - columns=left.columns, - copy=False) - return result - else: - new_data = [func(left.iloc[:, idx], right) - for idx in range(len(left.columns))] - result = self._constructor(new_data, - index=left.index, - copy=False) - result.columns = left.columns - return result - - else: - # easy case, operate directly on values - result = func(left.values.T, right.values).T - return self._constructor(result, - index=left.index, columns=self.columns, - copy=False) + new_data = func(left.values.T, right.values).T + return self._constructor(new_data, + index=left.index, columns=self.columns, + copy=False) def _combine_match_columns(self, other, func, level=None, try_cast=True): # TODO: `func` passed here is wrapped in core.ops; if we are From 5d3db89e74cf35133492852d3e4319e49dbed494 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 24 Jul 2018 17:38:59 -0700 Subject: [PATCH 08/20] simplify --- pandas/core/frame.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5bef50320e4fc..2dbe4ff21ab64 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4932,29 +4932,20 @@ def _combine_match_index(self, other, func, level=None): copy=False) def _combine_match_columns(self, other, func, level=None, try_cast=True): - # TODO: `func` passed here is wrapped in core.ops; if we are - # dispatching to Series implementation, should we pass unwrapped? assert isinstance(other, Series) left, right = self.align(other, join='outer', axis=1, level=level, copy=False) assert left.columns.equals(right.index), (left.columns, right.index) - new_data = [func(left.iloc[:, n], right.iloc[n]) + # Note: we use iloc instead of loc for compat with + # case with non-unique columns; same reason why we pin columns + # to result below instead of passing it to the constructor. + new_data = {n: func(left.iloc[:, n], right.iloc[n]) for n in range(len(left.columns))] - if left.columns.is_unique: - new_data = {left.columns[n]: new_data[n] - for n in range(len(left.columns))} - result = self._constructor(new_data, - index=left.index, columns=left.columns, - copy=False) - return result - - else: - new_data = {i: new_data[i] for i in range(len(new_data))} - result = self._constructor(new_data, index=left.index, copy=False) - result.columns = left.columns - return result + result = self._constructor(new_data, index=left.index, copy=False) + result.columns = left.columns + return result def _combine_const(self, other, func, errors='raise', try_cast=True): new_data = self._data.eval(func=func, other=other, From aa41de3d80f24d1f9ae6a45f2910e1572888de2f Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 24 Jul 2018 17:40:08 -0700 Subject: [PATCH 09/20] revert try_cast_results --- pandas/core/internals/__init__.py | 1 - pandas/core/internals/blocks.py | 78 +++++++++++-------------------- 2 files changed, 28 insertions(+), 51 deletions(-) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 0bdac6c231138..22caa577c2891 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from .blocks import ( # noqa:F401 - try_cast_result, # frame _block2d_to_blocknd, _factor_indexer, _block_shape, # io.pytables _safe_reshape, # io.packers make_block, # io.pytables, io.packers diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a1a44dc3bd7e3..ffa2267dd6877 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -74,55 +74,6 @@ from pandas.io.formats.printing import pprint_thing -def try_cast_result(left, result, dtype=None): - """ - Try to cast the result to the original dtype for `left`; we may have - roundtripped thru object in the mean-time. - - Parameters - ---------- - left : array-like - result : array-like - dtype : np.dtype, pd.dtype, or None (default None) - - Returns - ------- - maybe_casted : same type as `result` - """ - if dtype is None: - dtype = left.dtype - - if (is_integer_dtype(left) or is_bool_dtype(left) or - is_datetime64_dtype(left) or is_datetime64tz_dtype(left)): - pass - elif is_object_dtype(left) and lib.infer_dtype(left) == 'boolean': - # disguised is_bool_dtype - pass - elif is_float_dtype(left) and result.dtype == left.dtype: - - # protect against a bool/object showing up here - if isinstance(dtype, compat.string_types) and dtype == 'infer': - return result - if not isinstance(dtype, type): - dtype = dtype.type - if issubclass(dtype, (np.bool_, np.object_)): - if issubclass(dtype, np.bool_): - if isna(result).all(): - return result.astype(np.bool_) - else: - result = result.astype(np.object_) - result[result == 1] = True - result[result == 0] = False - return result - else: - return result.astype(np.object_) - - return result - - # may need to change the dtype here - return maybe_downcast_to_dtype(result, dtype) - - class Block(PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas @@ -748,7 +699,34 @@ def _try_cast_result(self, result, dtype=None): """ try to cast the result to our original type, we may have roundtripped thru object in the mean-time """ - return try_cast_result(self, result, dtype) + if dtype is None: + dtype = self.dtype + + if self.is_integer or self.is_bool or self.is_datetime: + pass + elif self.is_float and result.dtype == self.dtype: + + # protect against a bool/object showing up here + if isinstance(dtype, compat.string_types) and dtype == 'infer': + return result + if not isinstance(dtype, type): + dtype = dtype.type + if issubclass(dtype, (np.bool_, np.object_)): + if issubclass(dtype, np.bool_): + if isna(result).all(): + return result.astype(np.bool_) + else: + result = result.astype(np.object_) + result[result == 1] = True + result[result == 0] = False + return result + else: + return result.astype(np.object_) + + return result + + # may need to change the dtype here + return maybe_downcast_to_dtype(result, dtype) def _try_coerce_args(self, values, other): """ provide coercion to our input arguments """ From 01c37207c76de3d5f4681ecc6a14a211bc780dc0 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 24 Jul 2018 17:40:55 -0700 Subject: [PATCH 10/20] revert non-central changes --- pandas/core/sparse/frame.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index a5b809cfe8ffa..5cb9f4744cc58 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -585,14 +585,19 @@ def _combine_match_index(self, other, func, level=None): if level is not None: raise NotImplementedError("'level' argument is not supported") - this, other = self.align(other, join='outer', axis=0, - level=level, copy=False) + new_index = self.index.union(other.index) + this = self + if self.index is not new_index: + this = self.reindex(new_index) + + if other.index is not new_index: + other = other.reindex(new_index) - new_data = {} for col, series in compat.iteritems(this): new_data[col] = func(series.values, other.values) # fill_value is a function of our operator + fill_value = None if isna(other.fill_value) or isna(self.default_fill_value): fill_value = np.nan else: @@ -600,7 +605,7 @@ def _combine_match_index(self, other, func, level=None): np.float64(other.fill_value)) return self._constructor( - new_data, index=this.index, columns=this.columns, + new_data, index=new_index, columns=self.columns, default_fill_value=fill_value).__finalize__(self) def _combine_match_columns(self, other, func, level=None, try_cast=True): @@ -612,16 +617,19 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): if level is not None: raise NotImplementedError("'level' argument is not supported") - left, right = self.align(other, join='outer', axis=1, - level=level, copy=False) - new_data = {} - for col in other.index.intersection(self.columns): + + union = intersection = self.columns + + if not union.equals(other.index): + union = other.index.union(self.columns) + intersection = other.index.intersection(self.columns) + + for col in intersection: new_data[col] = func(self[col], float(other[col])) - # TODO: Why are we casting other[col] to float? return self._constructor( - new_data, index=left.index, columns=left.columns, + new_data, index=self.index, columns=union, default_fill_value=self.default_fill_value).__finalize__(self) def _combine_const(self, other, func, errors='raise', try_cast=True): From bcb6735d8d7376086492a4a18b624eaeb1f2d052 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 24 Jul 2018 19:56:45 -0700 Subject: [PATCH 11/20] Fixup typo syntaxerror --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2dbe4ff21ab64..472d27b950404 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4941,7 +4941,7 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): # case with non-unique columns; same reason why we pin columns # to result below instead of passing it to the constructor. new_data = {n: func(left.iloc[:, n], right.iloc[n]) - for n in range(len(left.columns))] + for n in range(len(left.columns))} result = self._constructor(new_data, index=left.index, copy=False) result.columns = left.columns From b3ef4171889010c3fed3c0fdb1f44edeabcad42f Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 24 Jul 2018 19:57:12 -0700 Subject: [PATCH 12/20] simplify assertion --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 472d27b950404..13337a5529e07 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4935,7 +4935,7 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): assert isinstance(other, Series) left, right = self.align(other, join='outer', axis=1, level=level, copy=False) - assert left.columns.equals(right.index), (left.columns, right.index) + assert left.columns.equals(right.index) # Note: we use iloc instead of loc for compat with # case with non-unique columns; same reason why we pin columns From ff96c0deb5507f1b89fdb2d36ce9d913debf71ed Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 7 Sep 2018 20:20:28 -0700 Subject: [PATCH 13/20] use dispatch_to_series in combine_match_columns --- pandas/core/frame.py | 11 +---------- pandas/core/ops.py | 10 +++++++++- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index df15dec7b479f..d63d924f322fb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4856,16 +4856,7 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True): left, right = self.align(other, join='outer', axis=1, level=level, copy=False) assert left.columns.equals(right.index) - - # Note: we use iloc instead of loc for compat with - # case with non-unique columns; same reason why we pin columns - # to result below instead of passing it to the constructor. - new_data = {n: func(left.iloc[:, n], right.iloc[n]) - for n in range(len(left.columns))} - - result = self._constructor(new_data, index=left.index, copy=False) - result.columns = left.columns - return result + return ops.dispatch_to_series(left, right, func, axis="columns") def _combine_const(self, other, func, errors='raise', try_cast=True): if lib.is_scalar(other) or np.ndim(other) == 0: diff --git a/pandas/core/ops.py b/pandas/core/ops.py index ca9c2528f0aef..11c7f32dcae9e 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1621,7 +1621,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # ----------------------------------------------------------------------------- # DataFrame -def dispatch_to_series(left, right, func, str_rep=None): +def dispatch_to_series(left, right, func, str_rep=None, axis=None): """ Evaluate the frame operation func(left, right) by evaluating column-by-column, dispatching to the Series implementation. @@ -1632,6 +1632,7 @@ def dispatch_to_series(left, right, func, str_rep=None): right : scalar or DataFrame func : arithmetic or comparison operator str_rep : str or None, default None + axis : {None, "index", "columns"} Returns ------- @@ -1655,6 +1656,13 @@ def column_op(a, b): return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} + elif isinstance(right, ABCSeries) and axis == "columns": + assert right.index.equals(left.columns) + + def column_op(a, b): + return {i: func(a.iloc[:, i], b.iloc[i]) + for i in range(len(a.columns))} + elif isinstance(right, ABCSeries): assert right.index.equals(left.index) # Handle other cases later From c30f89843b88144dc25ba03f5537e8a6463a366d Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 11 Sep 2018 18:50:19 -0700 Subject: [PATCH 14/20] Pass unwrapped op where appropriate --- pandas/core/ops.py | 5 ++- pandas/tests/arithmetic/test_timedelta64.py | 35 ++++++--------------- 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 11c7f32dcae9e..1a2f3e4c7692a 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1805,7 +1805,10 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): if isinstance(other, ABCDataFrame): # Another DataFrame return self._combine_frame(other, na_op, fill_value, level) elif isinstance(other, ABCSeries): - return _combine_series_frame(self, other, na_op, + # For these values of `axis`, we end up dispatching to Series op, + # so do not want the masked op. + pass_op = op if axis in [0, "columns", None] else na_op + return _combine_series_frame(self, other, pass_op, fill_value=fill_value, axis=axis, level=level, try_cast=True) else: diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index def7a8be95fc8..f09e5977c8fa6 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -559,43 +559,25 @@ def test_tdi_add_dt64_array(self, box_df_fail): # ------------------------------------------------------------------ # Operations with int-like others - @pytest.mark.parametrize('box', [ - pd.Index, - Series, - pytest.param(pd.DataFrame, - marks=pytest.mark.xfail(reason="Attempts to broadcast " - "incorrectly", - strict=True, raises=ValueError)) - ], ids=lambda x: x.__name__) def test_td64arr_add_int_series_invalid(self, box, tdser): tdser = tm.box_expected(tdser, box) err = TypeError if box is not pd.Index else NullFrequencyError with pytest.raises(err): tdser + Series([2, 3, 4]) - def test_td64arr_radd_int_series_invalid(self, box_df_fail, tdser): - box = box_df_fail # Tries to broadcast incorrectly + def test_td64arr_radd_int_series_invalid(self, box, tdser): tdser = tm.box_expected(tdser, box) err = TypeError if box is not pd.Index else NullFrequencyError with pytest.raises(err): Series([2, 3, 4]) + tdser - @pytest.mark.parametrize('box', [ - pd.Index, - Series, - pytest.param(pd.DataFrame, - marks=pytest.mark.xfail(reason="Attempts to broadcast " - "incorrectly", - strict=True, raises=ValueError)) - ], ids=lambda x: x.__name__) def test_td64arr_sub_int_series_invalid(self, box, tdser): tdser = tm.box_expected(tdser, box) err = TypeError if box is not pd.Index else NullFrequencyError with pytest.raises(err): tdser - Series([2, 3, 4]) - def test_td64arr_rsub_int_series_invalid(self, box_df_fail, tdser): - box = box_df_fail # Tries to broadcast incorrectly + def test_td64arr_rsub_int_series_invalid(self, box, tdser): tdser = tm.box_expected(tdser, box) err = TypeError if box is not pd.Index else NullFrequencyError with pytest.raises(err): @@ -669,9 +651,10 @@ def test_td64arr_add_sub_numeric_scalar_invalid(self, box, scalar, tdser): Series([1, 2, 3]) # TODO: Add DataFrame in here? ], ids=lambda x: type(x).__name__) - def test_td64arr_add_sub_numeric_arr_invalid(self, box_df_fail, vec, - dtype, tdser): - box = box_df_fail # tries to broadcast incorrectly + def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype, tdser): + if box is pd.DataFrame and not isinstance(vec, Series): + raise pytest.xfail(reason="Tries to broadcast incorrectly") + tdser = tm.box_expected(tdser, box) err = TypeError if box is pd.Index and not dtype.startswith('float'): @@ -1027,9 +1010,11 @@ def test_td64arr_with_offset_series(self, names, box_df_fail): tm.assert_equal(res3, expected_sub) @pytest.mark.parametrize('obox', [np.array, pd.Index, pd.Series]) - def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_df_fail): + def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box): # GH#18824 - box = box_df_fail # DataFrame tries to broadcast incorrectly + if box is pd.DataFrame and obox is not pd.Series: + raise pytest.xfail(reason="Attempts to broadcast incorrectly") + tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) tdi = tm.box_expected(tdi, box) From 52b71025ea8fb03920cd3d283e203d7b2f8ff790 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 12 Sep 2018 12:22:34 -0700 Subject: [PATCH 15/20] catch correct error --- pandas/core/ops.py | 2 +- pandas/tests/arithmetic/test_timedelta64.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 1a2f3e4c7692a..b4fa7e7ed204f 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1632,7 +1632,7 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): right : scalar or DataFrame func : arithmetic or comparison operator str_rep : str or None, default None - axis : {None, "index", "columns"} + axis : {None, 0, 1, "index", "columns"} Returns ------- diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index acc19ba4da314..a09efe6d4761c 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -923,9 +923,9 @@ def test_td64arr_sub_offset_array(self, box_df_broadcast_failure): @pytest.mark.parametrize('names', [(None, None, None), ('foo', 'bar', None), ('foo', 'foo', 'foo')]) - def test_td64arr_with_offset_series(self, names, box_df_broadcast_failure): + def test_td64arr_with_offset_series(self, names, box_df_fail): # GH#18849 - box = box_df_broadcast_failure + box = box_df_fail box2 = Series if box is pd.Index else box tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], From 453ae8e01f521b5ae3de05b6341cad238d0f179b Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 12 Sep 2018 15:22:21 -0700 Subject: [PATCH 16/20] whatsnew note --- doc/source/whatsnew/v0.24.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 3660c1e843f6c..24944cb1aa33e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -542,6 +542,7 @@ Other API Changes - :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) +- :func:`pandas.crosstab` will preserve dtypes in some cases that previously would cast from integer dtype to floating dtype (:issue:`22019`) .. _whatsnew_0240.deprecations: From f574c24a58b07ae30cc2db764a1a66c2cac5a921 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 27 Sep 2018 12:24:39 -0700 Subject: [PATCH 17/20] comment --- pandas/core/ops.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index e9b7b108f05e2..957f7eb245bce 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1665,6 +1665,8 @@ def column_op(a, b): for i in range(len(a.columns))} elif isinstance(right, ABCSeries) and axis == "columns": + # We only get here if called via left._combine_match_columns, + # in which case we specifically want to operate row-by-row assert right.index.equals(left.columns) def column_op(a, b): From e6821a2c56aae6657be0ecf6d39311dfa1bb14ea Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 27 Sep 2018 12:33:58 -0700 Subject: [PATCH 18/20] whatsnew section --- doc/source/whatsnew/v0.24.0.txt | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c9eb06ba1867f..48ee809e2de18 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -530,6 +530,35 @@ Current Behavior: ... OverflowError: Trying to coerce negative values to unsigned integers +.. _whatsnew_0240.api.crosstab_dtypes + +Crosstab Preserves Dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^ + +`pd.crosstab` will preserve now dtypes in some cases that previously would +cast from integer dtype to floating dtype (:issue:`22019`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + Out[4]: + b 3 4 + a + 1 0.5 0.0 + 2 0.5 1.0 + +Current Behavior: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + Datetimelike API Changes ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -555,7 +584,6 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- :func:`pandas.crosstab` will preserve dtypes in some cases that previously would cast from integer dtype to floating dtype (:issue:`22019`) .. _whatsnew_0240.deprecations: From a6a7f58552c26403df8f679f3e8958aadd203164 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 29 Sep 2018 09:26:08 -0700 Subject: [PATCH 19/20] remove unnecessary tester --- pandas/tests/series/test_operators.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 8be78e9b4e90c..f3ab197771d53 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -758,9 +758,6 @@ def test_operators_bitwise(self): def test_scalar_na_cmp_corners(self): s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10]) - def tester(a, b): - return a & b - with pytest.raises(TypeError): s & datetime(2005, 1, 1) @@ -784,7 +781,7 @@ def tester(a, b): d.__and__(s, axis='columns') with pytest.raises(TypeError): - tester(s, d) + s & d # this is wrong as its not a boolean result # result = d.__and__(s,axis='index') From 5832c2b81bc7999877f3fc83ddc29484fe6e5da6 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 2 Oct 2018 15:17:32 -0700 Subject: [PATCH 20/20] doc fixup --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 155df64623ccc..700916ba6066e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -537,7 +537,7 @@ Current Behavior: Crosstab Preserves Dtypes ^^^^^^^^^^^^^^^^^^^^^^^^^ -`pd.crosstab` will preserve now dtypes in some cases that previously would +:func:`crosstab` will preserve now dtypes in some cases that previously would cast from integer dtype to floating dtype (:issue:`22019`) Previous Behavior: