Skip to content

[WIP] Imprecise indexer #22043

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ def _isna_compat(arr, fill_value=np.nan):
return True


def array_equivalent(left, right, strict_nan=False):
def array_equivalent(left, right, strict_nan=False, tolerance=None):
"""
True if two arrays, left and right, have equal non-NaN elements, and NaNs
in corresponding locations. False otherwise. It is assumed that left and
Expand Down Expand Up @@ -409,7 +409,7 @@ def array_equivalent(left, right, strict_nan=False):
# Object arrays can contain None, NaN and NaT.
# string dtypes must come to this path for NumPy 1.7.1 compat
if is_string_dtype(left) or is_string_dtype(right):

# FIXME: intolerant
if not strict_nan:
# isna considers NaN and None to be equivalent.
return lib.array_equivalent_object(
Expand All @@ -434,9 +434,14 @@ def array_equivalent(left, right, strict_nan=False):
# empty
if not (np.prod(left.shape) and np.prod(right.shape)):
return True
return ((left == right) | (isna(left) & isna(right))).all()
if tolerance is None:
return ((left == right) | (isna(left) & isna(right))).all()
else:
return ((np.abs(left - right) <= tolerance) |
(isna(left) & isna(right))).all()

# numpy will not allow this type of datetimelike vs integer comparison
# FIXME: intolerant
elif is_datetimelike_v_numeric(left, right):
return False

Expand All @@ -454,7 +459,11 @@ def array_equivalent(left, right, strict_nan=False):
if left.dtype != right.dtype:
return False

return np.array_equal(left, right)
if tolerance is None:
return np.array_equal(left, right)
else:
# The remaining dtypes won't have NaNs to worry about.
return np.allclose(left, right, atol=tolerance, equal_nan=False)


def _infer_fill_value(val):
Expand Down
1 change: 1 addition & 0 deletions pandas/core/indexes/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(self, data, orig):
self.orig = orig
self.name = getattr(data, 'name', None)
self.index = getattr(data, 'index', None)
self.tolerance = getattr(data, 'tolerance', None)
self._freeze()

def _get_values(self):
Expand Down
28 changes: 17 additions & 11 deletions pandas/core/indexes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,19 +53,21 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)


def _get_combined_index(indexes, intersect=False, sort=False):
def _get_combined_index(indexes, intersect=False, sort=False,
tolerance=None):
# TODO: handle index names!
indexes = com.get_distinct_objs(indexes)
if len(indexes) == 0:
index = Index([])
index = Index([], tolerance=tolerance)
elif len(indexes) == 1:
index = indexes[0]
elif intersect:
index = indexes[0]
tolerance = index._choose_tolerance(indexes[1:], tolerance)
for other in indexes[1:]:
index = index.intersection(other)
index = index.intersection(other, tolerance=tolerance)
else:
index = _union_indexes(indexes, sort=sort)
index = _union_indexes(indexes, sort=sort, tolerance=tolerance)
index = ensure_index(index)

if sort:
Expand All @@ -76,17 +78,20 @@ def _get_combined_index(indexes, intersect=False, sort=False):
return index


def _union_indexes(indexes, sort=True):
def _union_indexes(indexes, sort=True, tolerance=None):
if len(indexes) == 0:
raise AssertionError('Must have at least 1 Index to union')
if len(indexes) == 1:
result = indexes[0]
if tolerance is None:
tolerance = getattr(result, 'tolerance', None)
if isinstance(result, list):
result = Index(sorted(result))
result = Index(sorted(result), tolerance=tolerance)
return result

indexes, kind = _sanitize_and_check(indexes)

# FIXME: intolerant
def _unique_indices(inds):
def conv(i):
if isinstance(i, Index):
Expand All @@ -98,17 +103,18 @@ def conv(i):

if kind == 'special':
result = indexes[0]

tolerance = result._choose_tolerance(indexes[1:], tolerance=tolerance)
if hasattr(result, 'union_many'):
return result.union_many(indexes[1:])
return result.union_many(indexes[1:], tolerance=tolerance)
else:
for other in indexes[1:]:
result = result.union(other)
result = result.union(other, tolerance=tolerance)
return result
elif kind == 'array':
index = indexes[0]
tolerance = index._choose_tolerance(indexes[1:], tolerance=tolerance)
for other in indexes[1:]:
if not index.equals(other):
if not index.equals(other, tolerance=tolerance):

if sort is None:
# TODO: remove once pd.concat sort default changes
Expand All @@ -119,7 +125,7 @@ def conv(i):

name = _get_consensus_names(indexes)[0]
if name != index.name:
index = index._shallow_copy(name=name)
index = index._shallow_copy(name=name, tolerance=tolerance)
return index
else: # kind='list'
return _unique_indices(indexes)
Expand Down
Loading