Skip to content

CLN: annotation in reshape.merge #29490

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 12, 2019
116 changes: 69 additions & 47 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import datetime
from functools import partial
import string
from typing import TYPE_CHECKING, Optional, Tuple
import warnings

import numpy as np
Expand Down Expand Up @@ -39,29 +40,33 @@
from pandas.core.dtypes.missing import isna, na_value_for_dtype

from pandas import Categorical, Index, MultiIndex
from pandas._typing import FrameOrSeries
import pandas.core.algorithms as algos
from pandas.core.arrays.categorical import _recode_for_categories
import pandas.core.common as com
from pandas.core.frame import _merge_doc
from pandas.core.internals import _transform_index, concatenate_block_managers
from pandas.core.sorting import is_int64_overflow_possible

if TYPE_CHECKING:
from pandas import DataFrame


@Substitution("\nleft : DataFrame")
@Appender(_merge_doc, indents=0)
def merge(
left,
right,
how="inner",
how: str = "inner",
on=None,
left_on=None,
right_on=None,
left_index=False,
right_index=False,
sort=False,
left_index: bool = False,
right_index: bool = False,
sort: bool = False,
suffixes=("_x", "_y"),
copy=True,
indicator=False,
copy: bool = True,
indicator: bool = False,
validate=None,
):
op = _MergeOperation(
Expand All @@ -86,7 +91,9 @@ def merge(
merge.__doc__ = _merge_doc % "\nleft : DataFrame"


def _groupby_and_merge(by, on, left, right, _merge_pieces, check_duplicates=True):
def _groupby_and_merge(
by, on, left, right, _merge_pieces, check_duplicates: bool = True
):
"""
groupby & merge; we are always performing a left-by type operation

Expand Down Expand Up @@ -172,7 +179,7 @@ def merge_ordered(
right_by=None,
fill_method=None,
suffixes=("_x", "_y"),
how="outer",
how: str = "outer",
):
"""
Perform merge with optional filling/interpolation.
Expand Down Expand Up @@ -298,14 +305,14 @@ def merge_asof(
on=None,
left_on=None,
right_on=None,
left_index=False,
right_index=False,
left_index: bool = False,
right_index: bool = False,
by=None,
left_by=None,
right_by=None,
suffixes=("_x", "_y"),
tolerance=None,
allow_exact_matches=True,
allow_exact_matches: bool = True,
direction="backward",
):
"""
Expand Down Expand Up @@ -533,33 +540,37 @@ def merge_asof(
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
"""
Perform a database (SQL) merge operation between two DataFrame objects
using either columns as keys or their row indexes
Perform a database (SQL) merge operation between two DataFrame or Series
objects using either columns as keys or their row indexes
"""

_merge_type = "merge"

indicator_name: Optional[str]
left: "DataFrame"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason for adding these definitions at the class level? Would prefer not to do that

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there was a request to use py36 style annotations

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can these be done in the init? These are instance variables shouldn't need to expose as class variables now for annotations

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i guess so, sure. is this the wrong use case for putting these a the class level? or is it a preference/policy thing?

Copy link
Member

@simonjayhawkins simonjayhawkins Nov 11, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can these be done in the init? These are instance variables shouldn't need to expose as class variables now for annotations

since these are type declarations and not variable initialisations, I think that these are a noop at runtime and therefore not exposed as class variables.

>>> class foo():
...     bar: str
...
>>> foo().bar
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'foo' object has no attribute 'bar'
>>> foo.bar
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: type object 'foo' has no attribute 'bar'

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there consensus on the desired usage?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it can be done in the init I still think better. Should keep things localized and not push to higher namespaces unless really necessary

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated

right: "DataFrame"

def __init__(
self,
left,
right,
how="inner",
left: FrameOrSeries,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Haven't debugged to verify but we can mix and match objects here right? So something as follows:

>>> df = pd.DataFrame([[1]])
>>> ser = pd.Series([1], name="a")
>>> pd.merge(df, ser, left_index=True, right_index=True)
   0  a
0  1  1

If that's the case then these actually should be Union[DataFrame, Series] instead of FrameOrSeries. Pretty tricky

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct, left and right can each be either DataFrame or Series, i.e. 4 valid combinations. Does FrameOrSeries assume you're not mix-and-matching? will update

right: FrameOrSeries,
how: str = "inner",
on=None,
left_on=None,
right_on=None,
axis=1,
left_index=False,
right_index=False,
sort=True,
left_index: bool = False,
right_index: bool = False,
sort: bool = True,
suffixes=("_x", "_y"),
copy=True,
indicator=False,
copy: bool = True,
indicator: bool = False,
validate=None,
):
left = validate_operand(left)
right = validate_operand(right)
self.left = self.orig_left = left
self.right = self.orig_right = right
_left = _validate_operand(left)
_right = _validate_operand(right)
self.left = self.orig_left = _validate_operand(_left)
self.right = self.orig_right = _validate_operand(_right)
self.how = how
self.axis = axis

Expand Down Expand Up @@ -597,11 +608,11 @@ def __init__(
)

# warn user when merging between different levels
if left.columns.nlevels != right.columns.nlevels:
if _left.columns.nlevels != _right.columns.nlevels:
msg = (
"merging between different levels can give an unintended "
"result ({left} levels on the left, {right} on the right)"
).format(left=left.columns.nlevels, right=right.columns.nlevels)
).format(left=_left.columns.nlevels, right=_right.columns.nlevels)
warnings.warn(msg, UserWarning)

self._validate_specification()
Expand Down Expand Up @@ -658,7 +669,9 @@ def get_result(self):

return result

def _indicator_pre_merge(self, left, right):
def _indicator_pre_merge(
self, left: "DataFrame", right: "DataFrame"
) -> Tuple["DataFrame", "DataFrame"]:

columns = left.columns.union(right.columns)

Expand Down Expand Up @@ -878,7 +891,12 @@ def _get_join_info(self):
return join_index, left_indexer, right_indexer

def _create_join_index(
self, index, other_index, indexer, other_indexer, how="left"
self,
index: Index,
other_index: Index,
indexer,
other_indexer,
how: str = "left",
):
"""
Create a join index by rearranging one index to match another
Expand Down Expand Up @@ -1263,7 +1281,9 @@ def _validate(self, validate: str):
raise ValueError("Not a valid argument for validate")


def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs):
def _get_join_indexers(
left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs
):
"""

Parameters
Expand Down Expand Up @@ -1410,13 +1430,13 @@ def __init__(
on=None,
left_on=None,
right_on=None,
left_index=False,
right_index=False,
left_index: bool = False,
right_index: bool = False,
axis=1,
suffixes=("_x", "_y"),
copy=True,
copy: bool = True,
fill_method=None,
how="outer",
how: str = "outer",
):

self.fill_method = fill_method
Expand Down Expand Up @@ -1508,18 +1528,18 @@ def __init__(
on=None,
left_on=None,
right_on=None,
left_index=False,
right_index=False,
left_index: bool = False,
right_index: bool = False,
by=None,
left_by=None,
right_by=None,
axis=1,
suffixes=("_x", "_y"),
copy=True,
copy: bool = True,
fill_method=None,
how="asof",
how: str = "asof",
tolerance=None,
allow_exact_matches=True,
allow_exact_matches: bool = True,
direction="backward",
):

Expand Down Expand Up @@ -1757,13 +1777,15 @@ def flip(xs):
return func(left_values, right_values, self.allow_exact_matches, tolerance)


def _get_multiindex_indexer(join_keys, index, sort):
def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool):

# bind `sort` argument
fkeys = partial(_factorize_keys, sort=sort)

# left & right join labels and num. of levels at each location
rcodes, lcodes, shape = map(list, zip(*map(fkeys, index.levels, join_keys)))
mapped = (fkeys(index.levels[n], join_keys[n]) for n in range(len(index.levels)))
zipped = zip(*mapped)
rcodes, lcodes, shape = [list(x) for x in zipped]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was this cleaning or silencing mypy?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was necessary to silence mypy. i also dont like the pattern that was used before (and am not wild about the zip still here, open to suggestions)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep. fine with this. just curious.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK with splitting this out but should probably keep mapped as a generator; coercing to list may have some overhead

if sort:
rcodes = list(map(np.take, rcodes, index.codes))
else:
Expand Down Expand Up @@ -1791,7 +1813,7 @@ def _get_multiindex_indexer(join_keys, index, sort):
return libjoin.left_outer_join(lkey, rkey, count, sort=sort)


def _get_single_indexer(join_key, index, sort=False):
def _get_single_indexer(join_key, index, sort: bool = False):
left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)

left_indexer, right_indexer = libjoin.left_outer_join(
Expand All @@ -1801,7 +1823,7 @@ def _get_single_indexer(join_key, index, sort=False):
return left_indexer, right_indexer


def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False):
if len(join_keys) > 1:
if not (
(isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels)
Expand Down Expand Up @@ -1915,7 +1937,7 @@ def _factorize_keys(lk, rk, sort=True):
return llab, rlab, count


def _sort_labels(uniques, left, right):
def _sort_labels(uniques: np.ndarray, left, right):
if not isinstance(uniques, np.ndarray):
# tuplesafe
uniques = Index(uniques).values
Expand All @@ -1930,7 +1952,7 @@ def _sort_labels(uniques, left, right):
return new_left, new_right


def _get_join_keys(llab, rlab, shape, sort):
def _get_join_keys(llab, rlab, shape, sort: bool):

# how many levels can be done without overflow
pred = lambda i: not is_int64_overflow_possible(shape[:i])
Expand Down Expand Up @@ -1970,7 +1992,7 @@ def _any(x) -> bool:
return x is not None and com.any_not_none(*x)


def validate_operand(obj):
def _validate_operand(obj: FrameOrSeries) -> "DataFrame":
if isinstance(obj, ABCDataFrame):
return obj
elif isinstance(obj, ABCSeries):
Expand All @@ -1985,7 +2007,7 @@ def validate_operand(obj):
)


def _items_overlap_with_suffix(left, lsuffix, right, rsuffix):
def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix):
"""
If two indices overlap, add suffixes to overlapping entries.

Expand Down