-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
CLN: annotation in reshape.merge #29490
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
8384e5f
bc15a94
6b3f6cb
a58d81e
de5fe3b
731cea7
ed3c56c
c33a91b
b34a9ac
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
import datetime | ||
from functools import partial | ||
import string | ||
from typing import TYPE_CHECKING, Optional, Tuple | ||
import warnings | ||
|
||
import numpy as np | ||
|
@@ -39,29 +40,33 @@ | |
from pandas.core.dtypes.missing import isna, na_value_for_dtype | ||
|
||
from pandas import Categorical, Index, MultiIndex | ||
from pandas._typing import FrameOrSeries | ||
import pandas.core.algorithms as algos | ||
from pandas.core.arrays.categorical import _recode_for_categories | ||
import pandas.core.common as com | ||
from pandas.core.frame import _merge_doc | ||
from pandas.core.internals import _transform_index, concatenate_block_managers | ||
from pandas.core.sorting import is_int64_overflow_possible | ||
|
||
if TYPE_CHECKING: | ||
from pandas import DataFrame | ||
|
||
|
||
@Substitution("\nleft : DataFrame") | ||
@Appender(_merge_doc, indents=0) | ||
def merge( | ||
left, | ||
right, | ||
how="inner", | ||
how: str = "inner", | ||
on=None, | ||
left_on=None, | ||
right_on=None, | ||
left_index=False, | ||
right_index=False, | ||
sort=False, | ||
left_index: bool = False, | ||
right_index: bool = False, | ||
sort: bool = False, | ||
suffixes=("_x", "_y"), | ||
copy=True, | ||
indicator=False, | ||
copy: bool = True, | ||
indicator: bool = False, | ||
validate=None, | ||
): | ||
op = _MergeOperation( | ||
|
@@ -86,7 +91,9 @@ def merge( | |
merge.__doc__ = _merge_doc % "\nleft : DataFrame" | ||
|
||
|
||
def _groupby_and_merge(by, on, left, right, _merge_pieces, check_duplicates=True): | ||
def _groupby_and_merge( | ||
by, on, left, right, _merge_pieces, check_duplicates: bool = True | ||
): | ||
""" | ||
groupby & merge; we are always performing a left-by type operation | ||
|
||
|
@@ -172,7 +179,7 @@ def merge_ordered( | |
right_by=None, | ||
fill_method=None, | ||
suffixes=("_x", "_y"), | ||
how="outer", | ||
how: str = "outer", | ||
): | ||
""" | ||
Perform merge with optional filling/interpolation. | ||
|
@@ -298,14 +305,14 @@ def merge_asof( | |
on=None, | ||
left_on=None, | ||
right_on=None, | ||
left_index=False, | ||
right_index=False, | ||
left_index: bool = False, | ||
right_index: bool = False, | ||
by=None, | ||
left_by=None, | ||
right_by=None, | ||
suffixes=("_x", "_y"), | ||
tolerance=None, | ||
allow_exact_matches=True, | ||
allow_exact_matches: bool = True, | ||
direction="backward", | ||
): | ||
""" | ||
|
@@ -533,33 +540,37 @@ def merge_asof( | |
# TODO: only copy DataFrames when modification necessary | ||
class _MergeOperation: | ||
""" | ||
Perform a database (SQL) merge operation between two DataFrame objects | ||
using either columns as keys or their row indexes | ||
Perform a database (SQL) merge operation between two DataFrame or Series | ||
objects using either columns as keys or their row indexes | ||
""" | ||
|
||
_merge_type = "merge" | ||
|
||
indicator_name: Optional[str] | ||
left: "DataFrame" | ||
right: "DataFrame" | ||
|
||
def __init__( | ||
self, | ||
left, | ||
right, | ||
how="inner", | ||
left: FrameOrSeries, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Haven't debugged to verify but we can mix and match objects here right? So something as follows: >>> df = pd.DataFrame([[1]])
>>> ser = pd.Series([1], name="a")
>>> pd.merge(df, ser, left_index=True, right_index=True)
0 a
0 1 1 If that's the case then these actually should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct, left and right can each be either DataFrame or Series, i.e. 4 valid combinations. Does |
||
right: FrameOrSeries, | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
how: str = "inner", | ||
on=None, | ||
left_on=None, | ||
right_on=None, | ||
axis=1, | ||
left_index=False, | ||
right_index=False, | ||
sort=True, | ||
left_index: bool = False, | ||
right_index: bool = False, | ||
sort: bool = True, | ||
suffixes=("_x", "_y"), | ||
copy=True, | ||
indicator=False, | ||
copy: bool = True, | ||
indicator: bool = False, | ||
validate=None, | ||
): | ||
left = validate_operand(left) | ||
right = validate_operand(right) | ||
self.left = self.orig_left = left | ||
self.right = self.orig_right = right | ||
_left = _validate_operand(left) | ||
_right = _validate_operand(right) | ||
self.left = self.orig_left = _validate_operand(_left) | ||
self.right = self.orig_right = _validate_operand(_right) | ||
self.how = how | ||
self.axis = axis | ||
|
||
|
@@ -597,11 +608,11 @@ def __init__( | |
) | ||
|
||
# warn user when merging between different levels | ||
if left.columns.nlevels != right.columns.nlevels: | ||
if _left.columns.nlevels != _right.columns.nlevels: | ||
msg = ( | ||
"merging between different levels can give an unintended " | ||
"result ({left} levels on the left, {right} on the right)" | ||
).format(left=left.columns.nlevels, right=right.columns.nlevels) | ||
).format(left=_left.columns.nlevels, right=_right.columns.nlevels) | ||
warnings.warn(msg, UserWarning) | ||
|
||
self._validate_specification() | ||
|
@@ -658,7 +669,9 @@ def get_result(self): | |
|
||
return result | ||
|
||
def _indicator_pre_merge(self, left, right): | ||
def _indicator_pre_merge( | ||
self, left: "DataFrame", right: "DataFrame" | ||
) -> Tuple["DataFrame", "DataFrame"]: | ||
|
||
columns = left.columns.union(right.columns) | ||
|
||
|
@@ -878,7 +891,12 @@ def _get_join_info(self): | |
return join_index, left_indexer, right_indexer | ||
|
||
def _create_join_index( | ||
self, index, other_index, indexer, other_indexer, how="left" | ||
self, | ||
index: Index, | ||
other_index: Index, | ||
indexer, | ||
other_indexer, | ||
how: str = "left", | ||
): | ||
""" | ||
Create a join index by rearranging one index to match another | ||
|
@@ -1263,7 +1281,9 @@ def _validate(self, validate: str): | |
raise ValueError("Not a valid argument for validate") | ||
|
||
|
||
def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs): | ||
def _get_join_indexers( | ||
left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs | ||
): | ||
""" | ||
|
||
Parameters | ||
|
@@ -1410,13 +1430,13 @@ def __init__( | |
on=None, | ||
left_on=None, | ||
right_on=None, | ||
left_index=False, | ||
right_index=False, | ||
left_index: bool = False, | ||
right_index: bool = False, | ||
axis=1, | ||
suffixes=("_x", "_y"), | ||
copy=True, | ||
copy: bool = True, | ||
fill_method=None, | ||
how="outer", | ||
how: str = "outer", | ||
): | ||
|
||
self.fill_method = fill_method | ||
|
@@ -1508,18 +1528,18 @@ def __init__( | |
on=None, | ||
left_on=None, | ||
right_on=None, | ||
left_index=False, | ||
right_index=False, | ||
left_index: bool = False, | ||
right_index: bool = False, | ||
by=None, | ||
left_by=None, | ||
right_by=None, | ||
axis=1, | ||
suffixes=("_x", "_y"), | ||
copy=True, | ||
copy: bool = True, | ||
fill_method=None, | ||
how="asof", | ||
how: str = "asof", | ||
tolerance=None, | ||
allow_exact_matches=True, | ||
allow_exact_matches: bool = True, | ||
direction="backward", | ||
): | ||
|
||
|
@@ -1757,13 +1777,15 @@ def flip(xs): | |
return func(left_values, right_values, self.allow_exact_matches, tolerance) | ||
|
||
|
||
def _get_multiindex_indexer(join_keys, index, sort): | ||
def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool): | ||
|
||
# bind `sort` argument | ||
fkeys = partial(_factorize_keys, sort=sort) | ||
|
||
# left & right join labels and num. of levels at each location | ||
rcodes, lcodes, shape = map(list, zip(*map(fkeys, index.levels, join_keys))) | ||
mapped = (fkeys(index.levels[n], join_keys[n]) for n in range(len(index.levels))) | ||
zipped = zip(*mapped) | ||
rcodes, lcodes, shape = [list(x) for x in zipped] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. was this cleaning or silencing mypy? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this was necessary to silence mypy. i also dont like the pattern that was used before (and am not wild about the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yep. fine with this. just curious. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK with splitting this out but should probably keep |
||
if sort: | ||
rcodes = list(map(np.take, rcodes, index.codes)) | ||
else: | ||
|
@@ -1791,7 +1813,7 @@ def _get_multiindex_indexer(join_keys, index, sort): | |
return libjoin.left_outer_join(lkey, rkey, count, sort=sort) | ||
|
||
|
||
def _get_single_indexer(join_key, index, sort=False): | ||
def _get_single_indexer(join_key, index, sort: bool = False): | ||
left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) | ||
|
||
left_indexer, right_indexer = libjoin.left_outer_join( | ||
|
@@ -1801,7 +1823,7 @@ def _get_single_indexer(join_key, index, sort=False): | |
return left_indexer, right_indexer | ||
|
||
|
||
def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): | ||
def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False): | ||
if len(join_keys) > 1: | ||
if not ( | ||
(isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels) | ||
|
@@ -1915,7 +1937,7 @@ def _factorize_keys(lk, rk, sort=True): | |
return llab, rlab, count | ||
|
||
|
||
def _sort_labels(uniques, left, right): | ||
def _sort_labels(uniques: np.ndarray, left, right): | ||
if not isinstance(uniques, np.ndarray): | ||
# tuplesafe | ||
uniques = Index(uniques).values | ||
|
@@ -1930,7 +1952,7 @@ def _sort_labels(uniques, left, right): | |
return new_left, new_right | ||
|
||
|
||
def _get_join_keys(llab, rlab, shape, sort): | ||
def _get_join_keys(llab, rlab, shape, sort: bool): | ||
|
||
# how many levels can be done without overflow | ||
pred = lambda i: not is_int64_overflow_possible(shape[:i]) | ||
|
@@ -1970,7 +1992,7 @@ def _any(x) -> bool: | |
return x is not None and com.any_not_none(*x) | ||
|
||
|
||
def validate_operand(obj): | ||
def _validate_operand(obj: FrameOrSeries) -> "DataFrame": | ||
if isinstance(obj, ABCDataFrame): | ||
return obj | ||
elif isinstance(obj, ABCSeries): | ||
|
@@ -1985,7 +2007,7 @@ def validate_operand(obj): | |
) | ||
|
||
|
||
def _items_overlap_with_suffix(left, lsuffix, right, rsuffix): | ||
def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix): | ||
""" | ||
If two indices overlap, add suffixes to overlapping entries. | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason for adding these definitions at the class level? Would prefer not to do that
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there was a request to use py36 style annotations
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can these be done in the init? These are instance variables shouldn't need to expose as class variables now for annotations
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i guess so, sure. is this the wrong use case for putting these a the class level? or is it a preference/policy thing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
since these are type declarations and not variable initialisations, I think that these are a noop at runtime and therefore not exposed as class variables.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there consensus on the desired usage?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If it can be done in the init I still think better. Should keep things localized and not push to higher namespaces unless really necessary
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
agreed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated