CLN refactor core indexes #37582

Merged: 27 commits, merged on Dec 22, 2020.
Changes shown are from 13 of the 27 commits.

Commits (all by MarcoGorelli)
5685cb1  refactor core indexes (Nov 2, 2020)
7695e66  parens (Nov 2, 2020)
d5d4530  reversions (Nov 2, 2020)
23be6a5  make pythonic (Nov 2, 2020)
b3d1dec  try moving outside try (Nov 2, 2020)
1b0d92c  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Nov 4, 2020)
e0d36eb  move out of try-except (Nov 4, 2020)
fb2bea0  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Nov 5, 2020)
26c419e  keep asi8.min() inside try-except (Nov 5, 2020)
d076431  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Nov 6, 2020)
bb0fc86  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Nov 8, 2020)
445f2e9  empty check (Nov 8, 2020)
e8f117c  similar simplification in max (Nov 8, 2020)
f1f1deb  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Nov 9, 2020)
e860c45  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Nov 23, 2020)
a58de56  fix merge error (Nov 23, 2020)
98672b1  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Nov 24, 2020)
be80fb7  wip (Nov 24, 2020)
dc0b84e  early return (Nov 24, 2020)
4074a88  parens (Nov 24, 2020)
00d6be0  :art: (Nov 24, 2020)
d073da9  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Nov 29, 2020)
fdafd36  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Dec 6, 2020)
01c4a06  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Dec 9, 2020)
1b254ae  coverage (Dec 9, 2020)
dffc440  :twisted_rightwards_arrows: Merge remote-tracking branch 'upstream/ma… (Dec 16, 2020)
63a8575  Merge remote-tracking branch 'upstream/master' into refactor-core-ind… (Dec 22, 2020)
5 changes: 1 addition & 4 deletions pandas/core/indexes/api.py
@@ -282,7 +282,4 @@ def all_indexes_same(indexes):
"""
itr = iter(indexes)
first = next(itr)
for index in itr:
if not first.equals(index):
return False
return True
return all(first.equals(index) for index in itr)
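
A side note on the equivalence above: all() over a generator expression short-circuits on the first False, just as the removed loop returned early, and all() of an empty iterator is True, matching the old fallthrough return True. A minimal standalone sketch (plain lists with == as a stand-in for Index.equals, not the pandas source):

def all_same_loop(indexes):
    # original form: explicit loop, early return on first mismatch
    itr = iter(indexes)
    first = next(itr)
    for index in itr:
        if first != index:
            return False
    return True

def all_same_expr(indexes):
    # refactored form: all() short-circuits the same way,
    # and all() of an empty generator is True, matching the fallthrough
    itr = iter(indexes)
    first = next(itr)
    return all(first == index for index in itr)

assert all_same_loop([[1, 2], [1, 2]]) and all_same_expr([[1, 2], [1, 2]])
assert not all_same_loop([[1, 2], [3]]) and not all_same_expr([[1, 2], [3]])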
55 changes: 19 additions & 36 deletions pandas/core/indexes/base.py
@@ -114,7 +114,7 @@
unique="Index",
duplicated="np.ndarray",
)
_index_shared_docs = dict()
_index_shared_docs = {}
str_t = str


@@ -901,9 +901,7 @@ def __repr__(self) -> str_t:
if data is None:
data = ""

res = f"{klass_name}({data}{prepr})"

return res
return f"{klass_name}({data}{prepr})"

def _format_space(self) -> str_t:

@@ -982,7 +980,6 @@ def _format_with_header(
if is_object_dtype(values.dtype):
values = lib.maybe_convert_objects(values, safe=1)

if is_object_dtype(values.dtype):
result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values]

# could have nans
@@ -1583,7 +1580,7 @@ def _drop_level_numbers(self, levnums: List[int]):
Drop MultiIndex levels by level _number_, not name.
"""

if len(levnums) == 0:
if not levnums:
return self
if len(levnums) >= self.nlevels:
raise ValueError(
@@ -3120,7 +3117,7 @@ def get_indexer(
"Reindexing only valid with uniquely valued Index objects"
)

if method == "pad" or method == "backfill":
if method in ["pad", "backfill"]:
indexer = self._get_fill_indexer(target, method, limit, tolerance)
elif method == "nearest":
indexer = self._get_nearest_indexer(target, limit, tolerance)
@@ -3233,8 +3230,7 @@ def _filter_indexer_tolerance(
tolerance,
) -> np.ndarray:
distance = abs(self._values[indexer] - target)
indexer = np.where(distance <= tolerance, indexer, -1)
return indexer
return np.where(distance <= tolerance, indexer, -1)
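
For readers unfamiliar with the np.where idiom used here: it selects elementwise between the found position and the -1 "no match" sentinel. A small self-contained sketch with made-up values (hypothetical data, not the pandas internals):

import numpy as np

# hypothetical values: positions found for each target, and how far off each match is
indexer = np.array([0, 2, 1, 3])
distance = np.array([0.0, 0.7, 0.1, 2.5])
tolerance = 0.5

# keep the position where the match is within tolerance, else mark it -1 (no match)
result = np.where(distance <= tolerance, indexer, -1)
print(result)  # [ 0 -1  1 -1]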

# --------------------------------------------------------------------
# Indexer Conversion Methods
@@ -3343,9 +3339,7 @@ def _convert_listlike_indexer(self, keyarr):
keyarr : numpy.ndarray
Return tuple-safe keys.
"""
if isinstance(keyarr, Index):
pass
else:
Member: i think we did it this way to make coverage obvious

if not isinstance(keyarr, Index):
keyarr = self._convert_arr_indexer(keyarr)

indexer = self._convert_list_indexer(keyarr)
@@ -3364,8 +3358,7 @@ def _convert_arr_indexer(self, keyarr):
-------
converted_keyarr : array-like
"""
keyarr = com.asarray_tuplesafe(keyarr)
return keyarr
return com.asarray_tuplesafe(keyarr)

def _convert_list_indexer(self, keyarr):
"""
@@ -3737,9 +3730,8 @@ def _join_multi(self, other, how, return_indexers=True):
other, level, how=how, return_indexers=return_indexers
)

if flip_order:
if isinstance(result, tuple):
return result[0], result[2], result[1]
if flip_order and isinstance(result, tuple):
return result[0], result[2], result[1]
return result

@final
@@ -4284,7 +4276,7 @@ def append(self, other):
to_concat = [self]

if isinstance(other, (list, tuple)):
to_concat = to_concat + list(other)
to_concat += list(other)
else:
to_concat.append(other)

@@ -4773,9 +4765,7 @@ def _should_fallback_to_positional(self) -> bool:
"""
Should an integer key be treated as positional?
"""
if self.holds_integer() or self.is_boolean():
return False
return True
return not self.holds_integer() and not self.is_boolean()
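
The one-liner here is De Morgan's law applied to the old early-return form: "False if either predicate holds" equals "the conjunction of the negations". A quick truth-table check over hypothetical predicate values:

# exhaustive check of the equivalence used above
for holds_integer in (False, True):
    for is_boolean in (False, True):
        old = False if (holds_integer or is_boolean) else True
        new = not holds_integer and not is_boolean
        assert old == new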

def _get_values_for_loc(self, series: "Series", loc, key):
"""
@@ -5168,11 +5158,7 @@ def _validate_indexer(self, form: str_t, key, kind: str_t):
"""
assert kind in ["getitem", "iloc"]

if key is None:
pass
elif is_integer(key):
pass
else:
if key is not None and not is_integer(key):
self._invalid_indexer(form, key)

def _maybe_cast_slice_bound(self, label, side: str_t, kind):
Expand Down Expand Up @@ -5471,9 +5457,10 @@ def _cmp_method(self, other, op):
"""
Wrapper used to dispatch comparison operations.
"""
if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)):
if len(self) != len(other):
raise ValueError("Lengths must match to compare")
if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)) and (
len(self) != len(other)
):
raise ValueError("Lengths must match to compare")

if not isinstance(other, ABCMultiIndex):
other = extract_array(other, extract_numpy=True)
@@ -5795,7 +5782,7 @@ def trim_front(strings: List[str]) -> List[str]:
Trims zeros and decimal points.
"""
trimmed = strings
while len(strings) > 0 and all(x[0] == " " for x in trimmed):
while trimmed and all(x.startswith(" ") for x in trimmed):
Member: this looks like it changes the logic by checking trimmed instead of strings?

perf difference between startswith vs x[0] == " "?

Member Author: True, but AFAICT this condition is only needed if trimmed is an empty list (because all([]) will always be True), so if we do an early return then we can remove it.

Regarding perf differences:

In [15]: %timeit 'foobarfdsfsdfsdafsdafhdlsafhgsdlafhsdlafhsda'.startswith('f')
110 ns ± 3.31 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

In [16]: %timeit 'foobarfdsfsdfsdafsdafhdlsafhgsdlafhsdlafhsda'[0] == 'f'
24.1 ns ± 1.05 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

so I'll go back to [0] == ' ' (cc @ivanovmg)

trimmed = [x[1:] for x in trimmed]
return trimmed
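
To make the all([]) point from the thread concrete: all() over an empty iterable is vacuously True, so with an empty input the emptiness guard is the only thing preventing an infinite loop. A standalone sketch mirroring the shape of the function above (not the exact pandas source at this commit):

def trim_front(strings):
    trimmed = strings
    # without the `trimmed` emptiness check, an empty input would loop
    # forever: all() over an empty generator is vacuously True, and the
    # body would keep producing [] unchanged
    while trimmed and all(x[0] == " " for x in trimmed):
        trimmed = [x[1:] for x in trimmed]
    return trimmed

assert all([]) is True                           # vacuous truth
assert trim_front([]) == []                      # terminates via the emptiness check
assert trim_front([" a", "  b"]) == ["a", " b"]  # strips one shared leading space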

@@ -5865,15 +5852,11 @@ def _maybe_cast_with_dtype(data: np.ndarray, dtype: np.dtype, copy: bool) -> np.
except ValueError:
data = np.array(data, dtype=np.float64, copy=copy)

elif inferred == "string":
pass
else:
elif inferred != "string":
data = data.astype(dtype)
elif is_float_dtype(dtype):
inferred = lib.infer_dtype(data, skipna=False)
if inferred == "string":
pass
else:
if inferred != "string":
data = data.astype(dtype)
else:
data = np.array(data, dtype=dtype, copy=copy)
2 changes: 1 addition & 1 deletion pandas/core/indexes/category.py
@@ -530,7 +530,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
if self.is_unique and self.equals(target):
return np.arange(len(self), dtype="intp")

if method == "pad" or method == "backfill":
if method in ["pad", "backfill"]:
raise NotImplementedError(
"method='pad' and method='backfill' not "
"implemented yet for CategoricalIndex"
75 changes: 33 additions & 42 deletions pandas/core/indexes/datetimelike.py
@@ -237,22 +237,17 @@ def min(self, axis=None, skipna=True, *args, **kwargs):
return self._na_value

i8 = self.asi8
try:
# quick check
if len(i8) and self.is_monotonic:
if i8[0] != iNaT:
return self._data._box_func(i8[0])

if self.hasnans:
if skipna:
min_stamp = self[~self._isnan].asi8.min()
else:
return self._na_value
else:
min_stamp = i8.min()
return self._data._box_func(min_stamp)
except ValueError:

# quick check
if self.is_monotonic and i8[0] != iNaT:
return self._data._box_func(i8[0])
if self.hasnans and skipna and not self[~self._isnan].empty:
min_stamp = self[~self._isnan].asi8.min()
elif not self.hasnans:
min_stamp = i8.min()
else:
return self._na_value
return self._data._box_func(min_stamp)
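
As a behavioral reference for the branches above, a sketch against the public API (pandas of this vintage; values hand-checked, not taken from the PR):

import pandas as pd

idx = pd.DatetimeIndex(["2020-01-02", pd.NaT, "2020-01-01"])

# skipna=True (default): NaT is dropped, the earliest real timestamp wins
print(idx.min())              # Timestamp('2020-01-01 00:00:00')

# skipna=False with NaT present: the _na_value branch returns NaT
print(idx.min(skipna=False))  # NaT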

def argmin(self, axis=None, skipna=True, *args, **kwargs):
"""
@@ -294,22 +289,17 @@ def max(self, axis=None, skipna=True, *args, **kwargs):
return self._na_value

i8 = self.asi8
try:
# quick check
if len(i8) and self.is_monotonic:
if i8[-1] != iNaT:
return self._data._box_func(i8[-1])

if self.hasnans:
if skipna:
max_stamp = self[~self._isnan].asi8.max()
else:
return self._na_value
else:
max_stamp = i8.max()
return self._data._box_func(max_stamp)
except ValueError:

# quick check
Member: we could just return self._data.max(...). Downside is that it doesn't take advantage of caching

if self.is_monotonic and i8[-1] != iNaT:
return self._data._box_func(i8[-1])
if self.hasnans and skipna and not self[~self._isnan].empty:
max_stamp = self[~self._isnan].asi8.max()
elif not self.hasnans:
max_stamp = i8.max()
else:
return self._na_value
return self._data._box_func(max_stamp)

def argmax(self, axis=None, skipna=True, *args, **kwargs):
"""
@@ -572,9 +562,12 @@ def _get_delete_freq(self, loc: int):
loc = lib.maybe_indices_to_slice(
np.asarray(loc, dtype=np.intp), len(self)
)
if isinstance(loc, slice) and loc.step in (1, None):
if loc.start in (0, None) or loc.stop in (len(self), None):
freq = self.freq
if (
isinstance(loc, slice)
and loc.step in (1, None)
and (loc.start in (0, None) or loc.stop in (len(self), None))
):
freq = self.freq
return freq

def _get_insert_freq(self, loc, item):
Expand All @@ -592,7 +585,7 @@ def _get_insert_freq(self, loc, item):
if self.size:
if item is NaT:
pass
elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]:
elif loc in [0, -len(self)] and item + self.freq == self[0]:
freq = self.freq
elif (loc == len(self)) and item - self.freq == self[-1]:
freq = self.freq
@@ -677,8 +670,7 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default):

@Appender(Index.difference.__doc__)
def difference(self, other, sort=None):
new_idx = super().difference(other, sort=sort)._with_freq(None)
return new_idx
return super().difference(other, sort=sort)._with_freq(None)

def intersection(self, other, sort=False):
"""
@@ -720,10 +712,9 @@ def intersection(self, other, sort=False):

if not isinstance(other, type(self)):
result = Index.intersection(self, other, sort=sort)
if isinstance(result, type(self)):
if result.freq is None:
# TODO: no tests rely on this; needed?
result = result._with_freq("infer")
if isinstance(result, type(self)) and result.freq is None:
# TODO: no tests rely on this; needed?
result = result._with_freq("infer")
return result

elif not self._can_fast_intersect(other):
@@ -869,13 +860,13 @@ def _union(self, other, sort):
elif result.freq is None:
# TODO: no tests rely on this; needed?
result = result._with_freq("infer")
return result
else:
i8self = Int64Index._simple_new(self.asi8)
i8other = Int64Index._simple_new(other.asi8)
i8result = i8self._union(i8other, sort=sort)
result = type(self)(i8result, dtype=self.dtype, freq="infer")
return result

return result

# --------------------------------------------------------------------
# Join Methods
16 changes: 8 additions & 8 deletions pandas/core/indexes/datetimes.py
@@ -425,12 +425,13 @@ def _maybe_utc_convert(self, other: Index) -> Tuple["DatetimeIndex", Index]:
this = self

if isinstance(other, DatetimeIndex):
if self.tz is not None:
if other.tz is None:
raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
elif other.tz is not None:
if (
(self.tz is not None)
and (other.tz is None)
or (self.tz is None)
and (other.tz is not None)
):
Member: (self.tz is None) ^ (other.tz is None)

Member Author: (self.tz is None ^ other.tz is not None)? AFAICT the check is to see if one of them is None but the other one isn't - I've tried regrouping the parens to make it clearer anyway, thanks

Member: was this not viable?

Member Author: as in,

if (self.tz is None) or (other.tz is None):
    raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")

?

I don't think that would work because we don't want to raise if both self.tz and other.tz are None, just if one is but the other isn't

Member: Not (self.tz is None) or (other.tz is None), but (self.tz is None) ^ (other.tz is None)

Member Author: ah sorry, I didn't actually know that was a Python operator - that should work then, thanks so much!

raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")

if not timezones.tz_compare(self.tz, other.tz):
this = self.tz_convert("UTC")
other = other.tz_convert("UTC")
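
For reference, the ^ suggestion from the thread works because Python booleans support bitwise XOR: the condition is True exactly when one side is tz-naive and the other is not. A standalone check of the truth table (the tz values are hypothetical stand-ins):

for self_tz in (None, "UTC"):
    for other_tz in (None, "UTC"):
        verbose = (self_tz is not None and other_tz is None) or (
            self_tz is None and other_tz is not None
        )
        xor = (self_tz is None) ^ (other_tz is None)
        assert verbose == xor  # True only when exactly one side is tz-naive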
@@ -758,8 +759,7 @@ def _get_string_slice(self, key: str):
freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None))
parsed, reso = parsing.parse_time_string(key, freq)
reso = Resolution.from_attrname(reso)
loc = self._partial_date_slice(reso, parsed)
return loc
return self._partial_date_slice(reso, parsed)

def slice_indexer(self, start=None, end=None, step=None, kind=None):
"""
@@ -822,7 +822,7 @@
# --------------------------------------------------------------------

def is_type_compatible(self, typ) -> bool:
return typ == self.inferred_type or typ == "datetime"
return typ in [self.inferred_type, "datetime"]

@property
def inferred_type(self) -> str: