-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Optimize array_equivalent for NDFrame.equals #35328
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
c0d6a40
46a27b9
94f1ba9
f551a20
0c83457
b5d06a9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -355,7 +355,9 @@ def _isna_compat(arr, fill_value=np.nan) -> bool: | |
return True | ||
|
||
|
||
def array_equivalent(left, right, strict_nan: bool = False) -> bool: | ||
def array_equivalent( | ||
left, right, strict_nan: bool = False, dtype_equal: bool = False | ||
) -> bool: | ||
""" | ||
True if two arrays, left and right, have equal non-NaN elements, and NaNs | ||
in corresponding locations. False otherwise. It is assumed that left and | ||
|
@@ -368,6 +370,12 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool: | |
left, right : ndarrays | ||
strict_nan : bool, default False | ||
If True, consider NaN and None to be different. | ||
dtype_equal : bool, default False | ||
Whether `left` and `right` are known to have the same dtype | ||
according to `is_dtype_equal`. Some methods like `BlockManager.equals` | ||
require that the dtypes match. Setting this to ``True`` can improve | ||
performance, but will give different results for arrays that are | ||
equal but have different dtypes. | ||
|
||
Returns | ||
------- | ||
|
@@ -391,43 +399,28 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool: | |
if left.shape != right.shape: | ||
return False | ||
|
||
if dtype_equal: | ||
# fastpath when we require that the dtypes match (Block.equals) | ||
if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): | ||
return array_equivalent_float(left, right) | ||
elif is_datetimelike_v_numeric(left.dtype, right.dtype): | ||
return False | ||
elif needs_i8_conversion(left.dtype): | ||
return array_equivalent_datetimelike(left, right) | ||
elif is_string_dtype(left.dtype): | ||
# TODO: fastpath for pandas' StringDtype | ||
return array_equivalent_object(left, right, strict_nan) | ||
else: | ||
return np.array_equal(left, right) | ||
|
||
# Slow path when we allow comparing different dtypes. | ||
# Object arrays can contain None, NaN and NaT. | ||
# string dtypes must come to this path for NumPy 1.7.1 compat | ||
if is_string_dtype(left.dtype) or is_string_dtype(right.dtype): | ||
|
||
if not strict_nan: | ||
# isna considers NaN and None to be equivalent. | ||
return lib.array_equivalent_object( | ||
ensure_object(left.ravel()), ensure_object(right.ravel()) | ||
) | ||
|
||
for left_value, right_value in zip(left, right): | ||
if left_value is NaT and right_value is not NaT: | ||
return False | ||
|
||
elif left_value is libmissing.NA and right_value is not libmissing.NA: | ||
return False | ||
|
||
elif isinstance(left_value, float) and np.isnan(left_value): | ||
if not isinstance(right_value, float) or not np.isnan(right_value): | ||
return False | ||
else: | ||
try: | ||
if np.any(np.asarray(left_value != right_value)): | ||
return False | ||
except TypeError as err: | ||
if "Cannot compare tz-naive" in str(err): | ||
# tzawareness compat failure, see GH#28507 | ||
return False | ||
elif "boolean value of NA is ambiguous" in str(err): | ||
return False | ||
raise | ||
return True | ||
return array_equivalent_object(left, right, strict_nan) | ||
|
||
# NaNs can occur in float and complex arrays. | ||
if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): | ||
|
||
# empty | ||
if not (np.prod(left.shape) and np.prod(right.shape)): | ||
return True | ||
return ((left == right) | (isna(left) & isna(right))).all() | ||
|
@@ -452,6 +445,45 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool: | |
return np.array_equal(left, right) | ||
|
||
|
||
def array_equivalent_float(left, right): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe make all of these helpers private functions. |
||
return ((left == right) | (np.isnan(left) & np.isnan(right))).all() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What about use_inf_as_na? Can this be re-used on L426? (Also, the prod(shape) check on L424-425 is, I think, redundant with the earlier shape check.) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It seems like 1.0.5 doesn't care about that:
In [15]: df1 = pd.DataFrame({"A": np.array([np.nan, 1, np.inf])})
In [16]: df2 = pd.DataFrame({"A": np.array([np.nan, 1, np.nan])})
In [17]: with pd.option_context('mode.use_inf_as_na', True):
...: print(df1.equals(df2))
...:
...:
False
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Makes sense, thanks for checking. |
||
|
||
|
||
def array_equivalent_datetimelike(left, right):
    """
    Compare two datetime-like arrays through their int64 representation.

    Both inputs are reinterpreted with ``.view("i8")`` and the resulting
    integer arrays are handed to ``np.array_equal``.

    Parameters
    ----------
    left, right : ndarrays
        Arrays that support ``.view("i8")``.

    Returns
    -------
    bool
        Result of ``np.array_equal`` on the two int64 views.
    """
    left_i8 = left.view("i8")
    right_i8 = right.view("i8")
    return np.array_equal(left_i8, right_i8)
|
||
|
||
def array_equivalent_object(left, right, strict_nan):
    """
    Compare two object arrays element by element.

    Parameters
    ----------
    left, right : ndarrays
    strict_nan : bool
        If falsy, the comparison is delegated to
        ``lib.array_equivalent_object`` on the flattened object arrays.
        If truthy, pairs are walked one at a time and ``NaT``,
        ``libmissing.NA`` and float NaN on the left must be matched by the
        same kind of value on the right.

    Returns
    -------
    bool
    """
    if strict_nan:
        for lval, rval in zip(left, right):
            if lval is NaT and rval is not NaT:
                return False

            if lval is libmissing.NA and rval is not libmissing.NA:
                return False

            if isinstance(lval, float) and np.isnan(lval):
                # A float NaN only matches another float NaN.
                if not (isinstance(rval, float) and np.isnan(rval)):
                    return False
                continue

            try:
                # The truth test stays inside the ``try``: evaluating it can
                # itself raise the "NA is ambiguous" TypeError handled below.
                if np.any(np.asarray(lval != rval)):
                    return False
            except TypeError as err:
                message = str(err)
                if (
                    # tzawareness compat failure, see GH#28507
                    "Cannot compare tz-naive" in message
                    or "boolean value of NA is ambiguous" in message
                ):
                    return False
                raise
        return True

    # isna considers NaN and None to be equivalent.
    flat_left = ensure_object(left.ravel())
    flat_right = ensure_object(right.ravel())
    return lib.array_equivalent_object(flat_left, flat_right)
|
||
|
||
def _infer_fill_value(val): | ||
""" | ||
infer the fill value for the nan/NaT from the provided | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure I understand why you actually need this.
The dtype check is cheap compared to the actual check, no?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can substitute `np.isnan` for `pd.isna` for float, and we can skip the empty check at https://github.com/pandas-dev/pandas/pull/35328/files#diff-ff8364cee9a3e1ef3a3825cb2cdd26d8L431.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok that's fine