From 1ee81a5a77149e143549c69bd7d1d5e2b78be7bf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 1 Nov 2019 17:51:41 -0700 Subject: [PATCH 1/3] maybe_upcast_putmask: require other to be a scalar --- pandas/core/dtypes/cast.py | 10 ++- pandas/core/nanops.py | 6 ++ pandas/tests/dtypes/cast/test_upcast.py | 93 ++++++++++--------------- 3 files changed, 49 insertions(+), 60 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 3e92906be706c..304eeac87f64d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -232,7 +232,7 @@ def trans(x): return result -def maybe_upcast_putmask(result, mask, other): +def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): """ A safe version of putmask that potentially upcasts the result. The result is replaced with the first N elements of other, @@ -245,8 +245,8 @@ def maybe_upcast_putmask(result, mask, other): The destination array. This will be mutated in-place if no upcasting is necessary. mask : boolean ndarray - other : ndarray or scalar - The source array or value + other : scalar + The source value Returns ------- @@ -264,6 +264,10 @@ def maybe_upcast_putmask(result, mask, other): if not isinstance(result, np.ndarray): raise ValueError("The result input must be a ndarray.") + if not is_scalar(other): + # We _could_ support non-scalar other, but until we have a compelling + # use case, we assume away the possibility. + raise ValueError("other must be a scalar") if mask.any(): # Two conversions for date-like dtypes that can't be done automatically diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index b9267db76e1a8..5d4f75d93db1c 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -273,6 +273,12 @@ def _get_values( fill_value : Any fill value used """ + + # In _get_values is only called from within nanops, and in all cases + # with scalar fill_value. This guarantee is important for the + # maybe_upcast_putmask call below + assert is_scalar(fill_value) + mask = _maybe_get_mask(values, skipna, mask) if is_datetime64tz_dtype(values): diff --git a/pandas/tests/dtypes/cast/test_upcast.py b/pandas/tests/dtypes/cast/test_upcast.py index b22ed0bcd0a11..49e850f3e87b5 100644 --- a/pandas/tests/dtypes/cast/test_upcast.py +++ b/pandas/tests/dtypes/cast/test_upcast.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize("result", [Series([10, 11, 12]), [10, 11, 12], (10, 11, 12)]) def test_upcast_error(result): - # GH23823 + # GH23823 require result arg to be ndarray mask = np.array([False, True, False]) other = np.array([61, 62, 63]) with pytest.raises(ValueError): @@ -17,76 +17,55 @@ def test_upcast_error(result): @pytest.mark.parametrize( - "arr, other, exp_changed, expected", + "arr, other", [ - (np.arange(1, 6), np.array([61, 62, 63]), False, np.array([1, 61, 3, 62, 63])), + (np.arange(1, 6), np.array([61, 62, 63])), + (np.arange(1, 6), np.array([61.1, 62.2, 63.3])), + (np.arange(10, 15), np.array([61, 62])), + (np.arange(10, 15), np.array([61, np.nan])), ( - np.arange(1, 6), - np.array([61.1, 62.2, 63.3]), - True, - np.array([1, 61.1, 3, 62.2, 63.3]), + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.arange("2018-01-01", "2018-01-04", dtype="datetime64[D]"), ), - (np.arange(1, 6), np.nan, True, np.array([1, np.nan, 3, np.nan, np.nan])), - (np.arange(10, 15), np.array([61, 62]), False, np.array([10, 61, 12, 62, 61])), ( - np.arange(10, 15), - np.array([61, np.nan]), - True, - np.array([10, 61, 12, np.nan, 61]), + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.arange("2018-01-01", "2018-01-03", dtype="datetime64[D]"), ), ], ) -def test_upcast(arr, other, exp_changed, expected): +def test_upcast_scalar_other(arr, other): + # for now we do not support non-scalar `other` + mask = np.array([False, True, False, True, True]) + with pytest.raises(ValueError, match="other must be a scalar"): + maybe_upcast_putmask(arr, mask, other) + + +def test_upcast(): # GH23823 + arr = np.arange(1, 6) mask = np.array([False, True, False, True, True]) - result, changed = maybe_upcast_putmask(arr, mask, other) + result, changed = maybe_upcast_putmask(arr, mask, other=np.nan) - assert changed == exp_changed + expected = np.array([1, np.nan, 3, np.nan, np.nan]) + assert changed tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize( - "arr, other, exp_changed, expected", - [ - ( - np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), - np.arange("2018-01-01", "2018-01-04", dtype="datetime64[D]"), - False, - np.array( - ["2019-01-01", "2018-01-01", "2019-01-03", "2018-01-02", "2018-01-03"], - dtype="datetime64[D]", - ), - ), - ( - np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), - np.nan, - False, - np.array( - [ - "2019-01-01", - np.datetime64("NaT"), - "2019-01-03", - np.datetime64("NaT"), - np.datetime64("NaT"), - ], - dtype="datetime64[D]", - ), - ), - ( - np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), - np.arange("2018-01-01", "2018-01-03", dtype="datetime64[D]"), - False, - np.array( - ["2019-01-01", "2018-01-01", "2019-01-03", "2018-01-02", "2018-01-01"], - dtype="datetime64[D]", - ), - ), - ], -) -def test_upcast_datetime(arr, other, exp_changed, expected): +def test_upcast_datetime(): # GH23823 + arr = np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]") mask = np.array([False, True, False, True, True]) - result, changed = maybe_upcast_putmask(arr, mask, other) + result, changed = maybe_upcast_putmask(arr, mask, other=np.nan) - assert changed == exp_changed + expected = np.array( + [ + "2019-01-01", + np.datetime64("NaT"), + "2019-01-03", + np.datetime64("NaT"), + np.datetime64("NaT"), + ], + dtype="datetime64[D]", + ) + assert not changed tm.assert_numpy_array_equal(result, expected) From 9a15ee14fff8271711f7798c56efe7a4984a6d3c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 2 Nov 2019 08:35:42 -0700 Subject: [PATCH 2/3] require maybe_upcast fill_value to be scalar --- pandas/core/dtypes/cast.py | 2 ++ pandas/core/internals/blocks.py | 4 ++++ pandas/core/internals/construction.py | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 304eeac87f64d..69c2e7fef365f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -686,6 +686,8 @@ def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): dtype : if None, then use the dtype of the values, else coerce to this type copy : if True always make a copy even if no upcast is required """ + if not is_scalar(fill_value): + raise ValueError("fill_value must be a scalar") if is_extension_type(values): if copy: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 51108d9a5a573..1f5a14a41e6a3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1286,6 +1286,10 @@ def diff(self, n: int, axis: int = 1) -> List["Block"]: def shift(self, periods, axis=0, fill_value=None): """ shift the block by periods, possibly upcast """ + if not lib.is_scalar(fill_value): + # We could go further and require e.g. self._can_hold_element(fv) + raise ValueError("fill_value must be a scalar") + # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also new_values, fill_value = maybe_upcast(self.values, fill_value) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 176f4acd113fe..ba57232cb9e92 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -83,6 +83,7 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy): # essentially process a record array then fill it fill_value = data.fill_value + fdata = ma.getdata(data) if index is None: index = get_names_from_index(fdata) @@ -97,6 +98,9 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy): # fill if needed new_arrays = [] for fv, arr, col in zip(fill_value, arrays, arr_columns): + # TODO: numpy docs suggest fv must be scalar, but could it be + # non-scalar for object dtype? + assert lib.is_scalar(fv), fv mask = ma.getmaskarray(data[col]) if mask.any(): arr, fv = maybe_upcast(arr, fill_value=fv, copy=True) From 980237dd67f654e8cacb6c92177e9426f7a3e507 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 2 Nov 2019 09:23:34 -0700 Subject: [PATCH 3/3] revert whitespace change --- pandas/core/internals/construction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index ba57232cb9e92..4a8216cc73264 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -83,7 +83,6 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy): # essentially process a record array then fill it fill_value = data.fill_value - fdata = ma.getdata(data) if index is None: index = get_names_from_index(fdata)