
TYP: pandas/core/missing.py #38339


Closed · wants to merge 31 commits

Changes from 5 commits

Commits (31)
830fa00
add type hints
arw2019 Dec 5, 2020
605dc3c
review: remove assert
arw2019 Dec 7, 2020
e77c940
merge master
arw2019 Dec 7, 2020
f2d5ec4
typo
arw2019 Dec 7, 2020
e83904f
add isna check
arw2019 Dec 7, 2020
71caeeb
better error msg when interp method not string
arw2019 Dec 7, 2020
8fbbd47
improve docstring
arw2019 Dec 7, 2020
4474ada
Merge branch 'master' of https://github.com/pandas-dev/pandas into ty…
arw2019 Dec 7, 2020
575c227
remove Optional
arw2019 Dec 7, 2020
b19896b
use Axis TypeVar
arw2019 Dec 7, 2020
5036ee1
more hints
arw2019 Dec 8, 2020
c0c4338
Merge branch 'master' of https://github.com/pandas-dev/pandas into ty…
arw2019 Dec 8, 2020
2a31823
review comments
arw2019 Dec 9, 2020
4fb893b
Merge branch 'master' of https://github.com/pandas-dev/pandas into ty…
arw2019 Dec 9, 2020
95a734b
Merge branch 'master' of https://github.com/pandas-dev/pandas into ty…
arw2019 Dec 10, 2020
4aeec70
review comment
arw2019 Dec 10, 2020
d67977d
review comment: values_to_mask
arw2019 Dec 10, 2020
24f418a
review comments: mask_missing/infer_dtype_from_array
arw2019 Dec 11, 2020
aeb0b82
typo
arw2019 Dec 11, 2020
25d0051
typo
arw2019 Dec 11, 2020
bbd25ed
review comment
arw2019 Dec 11, 2020
785d27c
Merge branch 'master' of https://github.com/pandas-dev/pandas into ty…
arw2019 Dec 11, 2020
c2d6467
Merge branch 'master' of https://github.com/pandas-dev/pandas into ty…
arw2019 Dec 14, 2020
b505de5
review comment
arw2019 Dec 14, 2020
e39c152
docstring fix
arw2019 Dec 14, 2020
cb82c9a
review comments
arw2019 Dec 14, 2020
65effed
Merge branch 'master' of https://github.com/pandas-dev/pandas into ty…
arw2019 Dec 15, 2020
2fa64bd
Merge branch 'master' of https://github.com/pandas-dev/pandas into ty…
arw2019 Jan 5, 2021
a54a02f
merge master
arw2019 Feb 21, 2021
315822c
TYP: infer_dtype_from_array
arw2019 Feb 21, 2021
df4b70a
minimize diff
arw2019 Feb 21, 2021
136 changes: 95 additions & 41 deletions pandas/core/missing.py
@@ -2,12 +2,12 @@
Routines for filling missing data.
"""
from functools import partial
from typing import TYPE_CHECKING, Any, List, Optional, Set, Union
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Set, Tuple, Union

import numpy as np

from pandas._libs import algos, lib
from pandas._typing import ArrayLike, Axis, DtypeObj
from pandas._typing import ArrayLike, Axis, DtypeObj, IndexLabel, Scalar
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.cast import infer_dtype_from_array
@@ -23,7 +23,9 @@
from pandas import Index


def mask_missing(arr: ArrayLike, values_to_mask) -> np.ndarray:
def mask_missing(
arr: ArrayLike, values_to_mask: Union[List, Tuple, Scalar]
Member:

values_to_mask is passed to infer_dtype_from_array, so the type of values_to_mask should match the arr argument of infer_dtype_from_array.

The arr argument of infer_dtype_from_array is not yet typed. (It is sometimes better to type the lower-level functions first, but while np.ndarray resolves to Any, mypy won't report any inconsistencies.)

Looking quickly through the body of infer_dtype_from_array, Series and EA are handled, so passing an array-like to values_to_mask should be fine.

To handle Series, we could either add it to the Union, or maybe (if there are no perf implications) also update infer_dtype_from_array to handle Index as well, and use AnyArrayLike instead.

Member Author:

Adding the Index check to infer_dtype_from_array

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 165e63e23d..676ab572a0 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -83,6 +83,7 @@ from pandas.core.dtypes.generic import (
     ABCDatetimeArray,
     ABCDatetimeIndex,
     ABCExtensionArray,
+    ABCIndexClass,
     ABCPeriodArray,
     ABCPeriodIndex,
     ABCSeries,
@@ -877,7 +878,7 @@ def infer_dtype_from_array(
     if pandas_dtype and is_extension_array_dtype(arr):
         return arr.dtype, arr
 
-    elif isinstance(arr, ABCSeries):
+    elif isinstance(arr, ABCSeries) or isinstance(arr, ABCIndexClass):
         return arr.dtype, np.asarray(arr)
 
     # don't force numpy coerce with nan's

seems to degrade some of the benchmarks that hit this

asv continuous -f 1.1 upstream/master HEAD -b ^algorithms -b ^replace

       before           after         ratio
     [d0db0098]       [9b685722]
     <master~1>       <master>  
+     3.41±0.04ms       5.40±0.7ms     1.59  algorithms.Factorize.time_factorize(False, False, 'boolean')
+     4.50±0.06ms       6.98±0.9ms     1.55  algorithms.Factorize.time_factorize(False, True, 'boolean')
+     3.27±0.01μs       3.89±0.5μs     1.19  algorithms.Duplicated.time_duplicated(True, 'first', 'float')

SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY.
PERFORMANCE DECREASED.

so I guess a Union with Series is the way to go unless there's a more efficient patch for infer_dtype_from_array

Member:

Thanks for looking into this. Looking again, I see that Index is already handled and falls through to the final return after np.asarray(arr), so we could probably use Union[AnyArrayLike, Sequence[Any], Scalar] after all.
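
A sketch of what that could look like (assuming AnyArrayLike is importable from pandas._typing, alongside the aliases already used here):

from typing import Any, Sequence, Union

import numpy as np

from pandas._typing import AnyArrayLike, ArrayLike, Scalar

def mask_missing(
    arr: ArrayLike, values_to_mask: Union[AnyArrayLike, Sequence[Any], Scalar]
) -> np.ndarray:
    # returns a boolean mask over arr, True where an entry equals any
    # member of values_to_mask
    ...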

Member Author:

Right - done

) -> np.ndarray:
"""
Return a masking array of same size/shape as arr
with entries equaling any member of values_to_mask set to True
@@ -61,7 +63,7 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> np.ndarray:
return mask


def clean_fill_method(method, allow_nearest: bool = False):
def clean_fill_method(method: str, allow_nearest: bool = False) -> Optional[str]:
Member:

looks like method can be None?

Member Author:

good catch (mypy missed this)

# asfreq is compat for resampling
if method in [None, "asfreq"]:
return None
@@ -120,7 +122,7 @@ def clean_interp_method(method: str, **kwargs) -> str:
return method


def find_valid_index(values, how: str):
def find_valid_index(values: ArrayLike, how: str) -> Optional[int]:
"""
Retrieves the index of the first valid value.

@@ -168,7 +170,7 @@ def interpolate_1d(
bounds_error: bool = False,
order: Optional[int] = None,
**kwargs,
):
) -> np.ndarray:
"""
Logic for the 1-d interpolation. The result should be 1-d, inputs
xvalues and yvalues will each be 1-d arrays of the same length.
@@ -218,8 +220,13 @@

# These are sets of index pointers to invalid values... i.e. {0, 1, etc...
all_nans = set(np.flatnonzero(invalid))
start_nans = set(range(find_valid_index(yvalues, "first")))
end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))

start_nan_idx = find_valid_index(yvalues, "first")
start_nans = set() if start_nan_idx is None else set(range(start_nan_idx))

end_nan_idx = find_valid_index(yvalues, "last")
end_nans = set() if end_nan_idx is None else set(range(1 + end_nan_idx, len(valid)))
Member:

is this fixing a bug in the case where end_nan_idx is None?

Member Author:

I don't think so, because we special-case all-NaNs/no-NaNs at the top of the method.
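
For context, the special-casing I mean is roughly this (paraphrasing the top of interpolate_1d, not the exact source):

invalid = isna(yvalues)
valid = ~invalid

if not valid.any():
    # all values are missing: return an all-NaN result without interpolating
    result = np.empty(xvalues.shape, dtype=np.float64)
    result.fill(np.nan)
    return result

if valid.all():
    # nothing is missing: return the values unchanged
    return yvalues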


mid_nans = all_nans - start_nans - end_nans

# Like the sets above, preserve_nans contains indices of invalid values,
@@ -292,8 +299,15 @@


def _interpolate_scipy_wrapper(
x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs
):
x,
y,
new_x,
Member:

do we know anything about x, y, new_x?

Member Author:

yes - added

method: Optional[str],
fill_value: Optional[Scalar] = None,
bounds_error: bool = False,
order: Optional[int] = None,
**kwargs,
) -> np.ndarray:
"""
Passed off to scipy.interpolate.interp1d. method is scipy's kind.
Returns an array interpolated at new_x. Add any new methods to
@@ -324,7 +338,7 @@ def _interpolate_scipy_wrapper(
elif method == "cubicspline":
alt_methods["cubicspline"] = _cubicspline_interpolate

interp1d_methods = [
interp1d_methods: List[str] = [
Member:

mypy can't figure this out on its own?

Member Author:

I think I needed this at some point, but yes, mypy gets this without the annotation with the file as-is.
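
A quick way to double-check the inference (illustrative snippet, not part of the diff):

interp1d_methods = ["nearest", "zero", "slinear", "quadratic", "cubic", "polynomial"]
reveal_type(interp1d_methods)  # mypy: Revealed type is "builtins.list[builtins.str]"
# reveal_type is only understood by mypy during checking; remove it afterwards
# (it is not defined at runtime on the Python versions supported here).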

"nearest",
"zero",
"slinear",
@@ -333,15 +347,14 @@
"polynomial",
]
if method in interp1d_methods:
if method == "polynomial":
method = order
kind = order if method == "polynomial" else method
terp = interpolate.interp1d(
x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error
x, y, kind=kind, fill_value=fill_value, bounds_error=bounds_error
)
new_y = terp(new_x)
elif method == "spline":
# GH #10633, #24014
if isna(order) or (order <= 0):
if order is None or isna(order) or order <= 0:
raise ValueError(
f"order needs to be specified and greater than 0; got order: {order}"
)
@@ -356,12 +369,21 @@
y = y.copy()
if not new_x.flags.writeable:
new_x = new_x.copy()
method = alt_methods[method]
new_y = method(x, y, new_x, **kwargs)

assert isinstance(method, str)
Member:

same here: is there any guarantee we already checked this is a string by here?

Member Author:

I'm not sure (I don't think so), but if it isn't a string then we'll raise with or without the assert, because in the line below method is used as a key into a dict whose keys are all strings:

# ignores some kwargs that could be passed along.
alt_methods = {
"barycentric": interpolate.barycentric_interpolate,
"krogh": interpolate.krogh_interpolate,
"from_derivatives": _from_derivatives,
"piecewise_polynomial": _from_derivatives,
}

I can add a more informative error message here, though
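
Something along these lines, say (hypothetical wording, just to illustrate):

if method not in alt_methods:
    raise ValueError(f"interpolation method '{method}' is not recognized")
alt_method = alt_methods[method]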

alt_method = alt_methods[method]
new_y = alt_method(x, y, new_x, **kwargs)
return new_y


def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):
def _from_derivatives(
xi: np.ndarray,
yi: np.ndarray,
x: Union[Scalar, ArrayLike],
order: Optional[Union[int, List[int]]] = None,
der: Union[int, List[int]] = 0,
extrapolate: bool = False,
) -> np.ndarray:
"""
Convenience function for interpolate.BPoly.from_derivatives.

@@ -404,7 +426,13 @@ def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):
return m(x)


def _akima_interpolate(xi, yi, x, der=0, axis=0):
def _akima_interpolate(
xi: ArrayLike,
yi: ArrayLike,
Member:

the docstring says ArrayLike, but I'm guessing this wouldn't work on EAs

Member Author:

I guess not, since this is a wrapper for scipy.interpolate.Akima1DInterpolator (and mypy works with both of these annotated as np.ndarray).
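
For example, a minimal usage sketch of that scipy API with plain ndarrays:

import numpy as np
from scipy import interpolate

xi = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
yi = xi ** 2
P = interpolate.Akima1DInterpolator(xi, yi, axis=0)
P(np.array([0.5, 1.5]), nu=0)  # evaluate the interpolator (nu=0) at new points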

x: Union[Scalar, ArrayLike],
Contributor:

I think these are just np.ndarray, right?

Contributor:

And similarly in almost all places where you are using Union[Scalar, ArrayLike], unless you can reveal_type that this is not the case.

Member Author:

You're right, they're always ndarray except in _interpolate_scipy_wrapper. It's because in _interpolate_scipy_wrapper we cast any scalar input to ndarray and these functions always get called from there:

new_x = np.asarray(new_x)
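
For illustration (plain NumPy behavior, nothing pandas-specific):

import numpy as np

np.asarray(3.5)          # array(3.5): a 0-d ndarray, not a Python float
np.asarray([1.0, 2.0])   # array([1., 2.]): sequence input becomes a 1-d ndarray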

Member Author:

Fixed these, and docstrings too

der: Optional[int] = 0,
Member:

is None really allowed?

Member Author:

No, I don't think so. Changed to plain int.

axis: Optional[int] = 0,
) -> Union[Scalar, ArrayLike]:
"""
Convenience function for akima interpolation.
xi and yi are arrays of values used to approximate some function f,
@@ -447,7 +475,14 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0):
return P(x, nu=der)


def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolate=None):
def _cubicspline_interpolate(
xi: ArrayLike,
Member:

can be np.ndarray here as well, I think?

yi: ArrayLike,
x: Union[ArrayLike, Scalar],
axis: Optional[int] = 0,
Member:

How is axis optional? (it has a default of 0 already, so it's always specified?)

Member Author:

AFAIK Optional means it could be None.

That said, the docstring lists this arg as optional so I'll fix that while I'm here

Member:

Optional[T] in type-hinting is not the same as optional in docstrings. The former is an alias for Union[T, None], while the latter means the argument doesn't need to be specified.
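
A side-by-side illustration (toy functions, hypothetical names):

from typing import Optional

def f(axis: int = 0) -> None:
    # "optional" in the docstring sense: the caller may omit axis,
    # but its value is always an int
    ...

def g(axis: Optional[int] = None) -> None:
    # Optional[int] is Union[int, None]: None itself is a legal value
    ...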

bc_type: Union[str, Tuple] = "not-a-knot",
extrapolate: Optional[Union[bool, str]] = None,
) -> Union[ArrayLike, Scalar]:
"""
Convenience function for cubic spline data interpolator.

@@ -555,6 +590,8 @@ def _interpolate_with_limit_area(
first = find_valid_index(values, "first")
last = find_valid_index(values, "last")

assert first is not None and last is not None

values = interpolate_2d(
values,
method=method,
@@ -572,12 +609,12 @@


def interpolate_2d(
values,
values: np.ndarray,
method: str = "pad",
axis: Axis = 0,
Member:

guessing just int

Member Author:

changed this to int

limit: Optional[int] = None,
limit_area: Optional[str] = None,
):
) -> np.ndarray:
"""
Perform an actual interpolation of values, values will be make 2-d if
needed fills inplace, returns the result.
@@ -623,7 +660,10 @@
raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")
values = values.reshape(tuple((1,) + values.shape))

method = clean_fill_method(method)
method_cleaned = clean_fill_method(method)
assert isinstance(method_cleaned, str)
method = method_cleaned

tvalues = transf(values)
if method == "pad":
result = _pad_2d(tvalues, limit=limit)
@@ -642,7 +682,9 @@
return result


def _cast_values_for_fillna(values, dtype: DtypeObj, has_mask: bool):
def _cast_values_for_fillna(
values: ArrayLike, dtype: DtypeObj, has_mask: bool
) -> ArrayLike:
"""
Cast values to a dtype that algos.pad and algos.backfill can handle.
"""
@@ -661,34 +703,41 @@ def _cast_values_for_fillna(values, dtype: DtypeObj, has_mask: bool):
return values


def _fillna_prep(values, mask=None):
def _fillna_prep(
values: np.ndarray, mask: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, np.ndarray]:
# boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d
dtype = values.dtype

has_mask = mask is not None
if not has_mask:
# This needs to occur before datetime/timedeltas are cast to int64
mask = isna(values)

values = _cast_values_for_fillna(values, dtype, has_mask)
# This needs to occur before datetime/timedeltas are cast to int64
mask = isna(values) if mask is None else mask

values = _cast_values_for_fillna(values, values.dtype, has_mask)
mask = mask.view(np.uint8)

return values, mask


def _pad_1d(values, limit=None, mask=None):
def _pad_1d(
values: np.ndarray, limit: Optional[int] = None, mask: Optional[np.ndarray] = None
) -> np.ndarray:
values, mask = _fillna_prep(values, mask)
algos.pad_inplace(values, mask, limit=limit)
return values


def _backfill_1d(values, limit=None, mask=None):
def _backfill_1d(
values: np.ndarray, limit: Optional[int] = None, mask: Optional[np.ndarray] = None
) -> np.ndarray:
values, mask = _fillna_prep(values, mask)
algos.backfill_inplace(values, mask, limit=limit)
return values


def _pad_2d(values, limit=None, mask=None):
def _pad_2d(
values: np.ndarray, limit: Optional[int] = None, mask: Optional[np.ndarray] = None
) -> np.ndarray:
values, mask = _fillna_prep(values, mask)

if np.all(values.shape):
@@ -699,7 +748,9 @@ def _pad_2d(values, limit=None, mask=None):
return values


def _backfill_2d(values, limit=None, mask=None):
def _backfill_2d(
values: np.ndarray, limit: Optional[int] = None, mask: Optional[np.ndarray] = None
) -> np.ndarray:
values, mask = _fillna_prep(values, mask)

if np.all(values.shape):
@@ -713,16 +764,19 @@ def _backfill_2d(values, limit=None, mask=None):
_fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}


def get_fill_func(method):
method = clean_fill_method(method)
return _fill_methods[method]
def get_fill_func(method: str) -> Callable:
method_cleaned = clean_fill_method(method)
assert isinstance(method_cleaned, str)
return _fill_methods[method_cleaned]


def clean_reindex_fill_method(method):
def clean_reindex_fill_method(method: str) -> Optional[str]:
return clean_fill_method(method, allow_nearest=True)


def _interp_limit(invalid, fw_limit, bw_limit):
def _interp_limit(
invalid: np.ndarray, fw_limit: Optional[int], bw_limit: Optional[int]
) -> Set[IndexLabel]:
"""
Get indexers of values that won't be filled
because they exceed the limits.
@@ -757,7 +811,7 @@ def _interp_limit(invalid, fw_limit, bw_limit):
f_idx = set()
b_idx = set()

def inner(invalid, limit):
def inner(invalid: np.ndarray, limit: int) -> Set[IndexLabel]:
limit = min(limit, N)
windowed = _rolling_window(invalid, limit + 1).all(1)
idx = set(np.where(windowed)[0] + limit) | set(
@@ -787,7 +841,7 @@ def inner(invalid, limit):
return f_idx & b_idx


def _rolling_window(a: np.ndarray, window: int):
def _rolling_window(a: np.ndarray, window: int) -> np.ndarray:
"""
[True, True, False, True, False], 2 ->
