From fc1b3837ca40b0ecdf663be54dcf8715a27b0d8c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 16 Mar 2020 19:56:35 -0700 Subject: [PATCH 1/5] checkpoint passing --- pandas/core/arrays/datetimes.py | 6 ++++-- pandas/core/generic.py | 1 + pandas/core/internals/blocks.py | 36 ++++++++++++++++++++++++------- pandas/core/internals/managers.py | 6 +++++- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 2110f782330fb..c7e3bcc29d1a4 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1845,18 +1845,20 @@ def objects_to_datetime64ns( try: result, tz_parsed = tslib.array_to_datetime( - data, + data.ravel(), errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601, ) + result = result.reshape(data.shape) except ValueError as e: try: - values, tz_parsed = conversion.datetime_to_datetime64(data) + values, tz_parsed = conversion.datetime_to_datetime64(data.ravel()) # If tzaware, these values represent unix timestamps, so we # return them as i8 to distinguish from wall times + values = values.reshape(data.shape) return values.view("i8"), tz_parsed except (ValueError, TypeError): raise e diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6b0f7de11a3e7..5dc03863dc641 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8612,6 +8612,7 @@ def _where( # treat like a scalar if len(other) == 1: other = np.array(other[0]) + # TODO: should this be extracting the scalar? # GH 3235 # match True cond to other diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cab2bd5146745..da1a1c509474f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -934,7 +934,7 @@ def putmask( Parameters ---------- - mask : the condition to respect + mask : np.ndarray[bool] new : a ndarray/object align : boolean, perform alignment on other/cond, default is True inplace : perform inplace modification, default is False @@ -946,10 +946,14 @@ def putmask( ------- a list of new blocks, the result of the putmask """ + assert isinstance(mask, np.ndarray), type(mask) + assert mask.dtype == bool, mask.dtype + new_values = self.values if inplace else self.values.copy() - new = getattr(new, "values", new) - mask = getattr(mask, "values", mask) + assert not isinstance(new, (ABCSeries, ABCDataFrame)), type(new) + # new = getattr(new, "values", new) + # mask = getattr(mask, "values", mask) # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: @@ -1341,7 +1345,7 @@ def shift(self, periods, axis: int = 0, fill_value=None): def where( self, other, - cond, + cond: np.ndarray, align: bool = True, errors="raise", try_cast: bool = False, @@ -1353,7 +1357,7 @@ def where( Parameters ---------- other : a ndarray/object - cond : the condition to respect + cond : np.ndarray[bool] align : bool, default True Perform alignment on other/cond. errors : str, {'raise', 'ignore'}, default 'raise' @@ -1363,10 +1367,13 @@ def where( Returns ------- - a new block(s), the result of the func + List[Block] """ import pandas.core.computation.expressions as expressions + assert isinstance(cond, np.ndarray), type(cond) + assert cond.dtype == bool, cond.dtype + assert errors in ["raise", "ignore"] transpose = self.ndim == 2 @@ -1680,17 +1687,24 @@ def putmask( Parameters ---------- - mask : the condition to respect + mask : np.ndarray[bool] + The condition to respect. new : a ndarray/object align : boolean, perform alignment on other/cond, default is True inplace : perform inplace modification, default is False Returns ------- - a new block, the result of the putmask + List[Block] """ inplace = validate_bool_kwarg(inplace, "inplace") + if isinstance(mask, ExtensionArray): + assert mask.dtype == "boolean", mask.dtype + mask = mask.astype(bool, copy=False) + assert isinstance(mask, np.ndarray), type(mask) + assert mask.dtype == bool, mask.dtype + # use block's copy logic. # .values may be an Index which does shallow copy by default new_values = self.values if inplace else self.copy().values @@ -1933,12 +1947,18 @@ def where( # `other` should be a DataFrame with a single column. assert other.shape[1] == 1 other = other.iloc[:, 0] + elif isinstance(other, np.ndarray) and other.ndim == 2: + assert other.shape[1] == 1 + other = other[:, 0] other = extract_array(other, extract_numpy=True) if isinstance(cond, ABCDataFrame): assert cond.shape[1] == 1 cond = cond.iloc[:, 0] + elif isinstance(cond, np.ndarray) and cond.ndim == 2: + assert cond.shape[1] == 1 + cond = cond[:, 0] cond = extract_array(cond, extract_numpy=True) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3a7ab98ea6baf..b1a6f6b119a50 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -33,6 +33,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject +from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index from pandas.core.internals.blocks import ( @@ -416,6 +417,7 @@ def apply(self: T, f, filter=None, **kwargs) -> T: align_keys = ["new", "mask"] else: align_keys = ["mask"] + kwargs["new"] = extract_array(kwargs["new"], extract_numpy=True) else: align_keys = [] @@ -439,7 +441,9 @@ def apply(self: T, f, filter=None, **kwargs) -> T: for k, obj in aligned_args.items(): axis = obj._info_axis_number - kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) + kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)._values + # TODO: if operation commutes, dont unpack DataFrame, but defer + # TODO: will this involve casting for e.g BooleanDtype? if callable(f): applied = b.apply(f, **kwargs) From ef97a0ec9e956fec01cc34f4202e5ed7c83c772d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 17 Mar 2020 15:53:34 -0700 Subject: [PATCH 2/5] avoid passing Series/DataFrame to Block methods --- pandas/core/generic.py | 3 +-- pandas/core/internals/blocks.py | 36 ++++++++++++------------------- pandas/core/internals/managers.py | 1 + 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5dc03863dc641..92d82ce931dee 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8611,8 +8611,7 @@ def _where( # GH 2745 / GH 4192 # treat like a scalar if len(other) == 1: - other = np.array(other[0]) - # TODO: should this be extracting the scalar? + other = other[0] # GH 3235 # match True cond to other diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index da1a1c509474f..f657c851d466c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -55,6 +55,7 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, + ABCIndexClass, ABCPandasArray, ABCSeries, ) @@ -948,13 +949,10 @@ def putmask( """ assert isinstance(mask, np.ndarray), type(mask) assert mask.dtype == bool, mask.dtype + assert not isinstance(new, (ABCSeries, ABCDataFrame)), type(new) new_values = self.values if inplace else self.values.copy() - assert not isinstance(new, (ABCSeries, ABCDataFrame)), type(new) - # new = getattr(new, "values", new) - # mask = getattr(mask, "values", mask) - # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: # FIXME: make sure we have compatible NA @@ -1357,7 +1355,7 @@ def where( Parameters ---------- other : a ndarray/object - cond : np.ndarray[bool] + cond : np.ndarray[bool] align : bool, default True Perform alignment on other/cond. errors : str, {'raise', 'ignore'}, default 'raise' @@ -1373,6 +1371,7 @@ def where( assert isinstance(cond, np.ndarray), type(cond) assert cond.dtype == bool, cond.dtype + assert not isinstance(other, (ABCSeries, ABCDataFrame)), type(other) assert errors in ["raise", "ignore"] transpose = self.ndim == 2 @@ -1382,9 +1381,6 @@ def where( if transpose: values = values.T - other = getattr(other, "_values", getattr(other, "values", other)) - cond = getattr(cond, "values", cond) - # If the default broadcasting would go in the wrong direction, then # explicitly reshape other instead if getattr(other, "ndim", 0) >= 1: @@ -1942,26 +1938,22 @@ def where( try_cast: bool = False, axis: int = 0, ) -> List["Block"]: - if isinstance(other, ABCDataFrame): - # ExtensionArrays are 1-D, so if we get here then - # `other` should be a DataFrame with a single column. - assert other.shape[1] == 1 - other = other.iloc[:, 0] - elif isinstance(other, np.ndarray) and other.ndim == 2: + + assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)), type( + other + ) + assert not isinstance(cond, (ABCIndexClass, ABCSeries, ABCDataFrame)), type( + cond + ) + + if isinstance(other, np.ndarray) and other.ndim == 2: assert other.shape[1] == 1 other = other[:, 0] - other = extract_array(other, extract_numpy=True) - - if isinstance(cond, ABCDataFrame): - assert cond.shape[1] == 1 - cond = cond.iloc[:, 0] - elif isinstance(cond, np.ndarray) and cond.ndim == 2: + if isinstance(cond, np.ndarray) and cond.ndim == 2: assert cond.shape[1] == 1 cond = cond[:, 0] - cond = extract_array(cond, extract_numpy=True) - if lib.is_scalar(other) and isna(other): # The default `other` for Series / Frame is np.nan # we want to replace that with the correct NA value diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0cf3987a6c74a..59693e96768f2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -411,6 +411,7 @@ def apply(self: T, f, filter=None, **kwargs) -> T: align_keys = ["other", "cond"] else: align_keys = ["cond"] + kwargs["other"] = extract_array(kwargs["other"], extract_numpy=True) elif f == "putmask": align_copy = False if kwargs.get("align", True): From b411adac6be7b7593e1b147a125f38ca0d8683d2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 23 Mar 2020 16:38:16 -0700 Subject: [PATCH 3/5] Extract Sparse and Boolean EAs --- pandas/core/arrays/datetimes.py | 6 ++--- pandas/core/internals/blocks.py | 43 ++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 85a9f28830745..e2a13df069ae2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1828,20 +1828,18 @@ def objects_to_datetime64ns( try: result, tz_parsed = tslib.array_to_datetime( - data.ravel(), + data, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601, ) - result = result.reshape(data.shape) except ValueError as e: try: - values, tz_parsed = conversion.datetime_to_datetime64(data.ravel()) + values, tz_parsed = conversion.datetime_to_datetime64(data) # If tzaware, these values represent unix timestamps, so we # return them as i8 to distinguish from wall times - values = values.reshape(data.shape) return values.view("i8"), tz_parsed except (ValueError, TypeError): raise e diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b811d54294819..d48066e6693df 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -932,9 +932,8 @@ def putmask( ------- List[Block] """ - assert isinstance(mask, np.ndarray), type(mask) - assert mask.dtype == bool, mask.dtype - assert not isinstance(new, (ABCSeries, ABCDataFrame)), type(new) + mask = _extract_bool_array(mask) + assert not isinstance(new, (ABCIndexClass, ABCSeries, ABCDataFrame)) new_values = self.values if inplace else self.values.copy() @@ -1333,9 +1332,8 @@ def where( """ import pandas.core.computation.expressions as expressions - assert isinstance(cond, np.ndarray), type(cond) - assert cond.dtype == bool, cond.dtype - assert not isinstance(other, (ABCSeries, ABCDataFrame)), type(other) + cond = _extract_bool_array(cond) + assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)) assert errors in ["raise", "ignore"] transpose = self.ndim == 2 @@ -1642,15 +1640,9 @@ def putmask( """ inplace = validate_bool_kwarg(inplace, "inplace") - if isinstance(mask, ExtensionArray): - assert mask.dtype == "boolean", mask.dtype - mask = mask.astype(bool, copy=False) - assert isinstance(mask, np.ndarray), type(mask) - assert mask.dtype == bool, mask.dtype + mask = _extract_bool_array(mask) - # use block's copy logic. - # .values may be an Index which does shallow copy by default - new_values = self.values if inplace else self.copy().values + new_values = self.values if inplace else self.values.copy() if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] @@ -1880,18 +1872,16 @@ def where( self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, ) -> List["Block"]: - assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)), type( - other - ) - assert not isinstance(cond, (ABCIndexClass, ABCSeries, ABCDataFrame)), type( - cond - ) + cond = _extract_bool_array(cond) + assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)) if isinstance(other, np.ndarray) and other.ndim == 2: + # TODO(EA2D): unnecessary with 2D EAs assert other.shape[1] == 1 other = other[:, 0] if isinstance(cond, np.ndarray) and cond.ndim == 2: + # TODO(EA2D): unnecessary with 2D EAs assert cond.shape[1] == 1 cond = cond[:, 0] @@ -3135,3 +3125,16 @@ def _putmask_preserve(nv, n): v = v.astype(dtype) return _putmask_preserve(v, n) + + +def _extract_bool_array(mask: ArrayLike) -> np.ndarray: + """ + If we have a SparseArray or BooleanArray, convert it to ndarray[bool]. + """ + if isinstance(mask, ExtensionArray): + # We could have BooleanArray, Sparse[bool], ... + mask = np.asarray(mask, dtype=np.bool_) + + assert isinstance(mask, np.ndarray), type(mask) + assert mask.dtype == bool, mask.dtype + return mask From 6e51eaf784164698b0b720d56dce8abf9f711349 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 23 Mar 2020 16:40:24 -0700 Subject: [PATCH 4/5] remove comments --- pandas/core/internals/managers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 96ff2da99b871..dda932cafe73b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -428,8 +428,6 @@ def apply(self: T, f, filter=None, align_keys=None, **kwargs) -> T: for k, obj in aligned_args.items(): axis = obj._info_axis_number kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)._values - # TODO: if operation commutes, dont unpack DataFrame, but defer - # TODO: will this involve casting for e.g BooleanDtype? if callable(f): applied = b.apply(f, **kwargs) From df0e959fb09846f2c2ca3fe173065a6f3fc26628 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 24 Mar 2020 15:19:21 -0700 Subject: [PATCH 5/5] make docstring more precise, remove inaccurate annotation --- pandas/core/internals/blocks.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d48066e6693df..636d5bd9c824a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -920,7 +920,7 @@ def putmask( Parameters ---------- - mask : np.ndarray[bool] + mask : np.ndarray[bool], SparseArray[bool], or BooleanArray new : a ndarray/object inplace : bool, default False Perform inplace modification. @@ -1307,12 +1307,7 @@ def shift(self, periods, axis: int = 0, fill_value=None): return [self.make_block(new_values)] def where( - self, - other, - cond: np.ndarray, - errors="raise", - try_cast: bool = False, - axis: int = 0, + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, ) -> List["Block"]: """ evaluate the block; return result block(s) from the result @@ -1320,7 +1315,7 @@ def where( Parameters ---------- other : a ndarray/object - cond : np.ndarray[bool] + cond : np.ndarray[bool], SparseArray[bool], or BooleanArray errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object