Skip to content

Commit c00967c

Browse files
Merge remote-tracking branch 'upstream/master' into typing
2 parents 43fb342 + df2e081 commit c00967c

31 files changed

+654
-446
lines changed

ci/code_checks.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
122122
MSG='Check for non-standard imports' ; echo $MSG
123123
invgrep -R --include="*.py*" -E "from pandas.core.common import " pandas
124124
invgrep -R --include="*.py*" -E "from collections.abc import " pandas
125-
# invgrep -R --include="*.py*" -E "from numpy import nan " pandas # GH#24822 not yet implemented since the offending imports have not all been removed
125+
invgrep -R --include="*.py*" -E "from numpy import nan " pandas
126126
RET=$(($RET + $?)) ; echo $MSG "DONE"
127127

128128
MSG='Check for use of exec' ; echo $MSG

pandas/core/dtypes/cast.py

+17-7
Original file line numberDiff line numberDiff line change
@@ -360,22 +360,32 @@ def maybe_promote(dtype, fill_value=np.nan):
360360
if isinstance(fill_value, datetime) and fill_value.tzinfo is not None:
361361
# Trying to insert tzaware into tznaive, have to cast to object
362362
dtype = np.dtype(np.object_)
363+
elif is_integer(fill_value) or (is_float(fill_value) and not isna(fill_value)):
364+
dtype = np.dtype(np.object_)
363365
else:
364366
try:
365367
fill_value = tslibs.Timestamp(fill_value).to_datetime64()
366368
except (TypeError, ValueError):
367369
dtype = np.dtype(np.object_)
368370
elif issubclass(dtype.type, np.timedelta64):
369-
try:
370-
fv = tslibs.Timedelta(fill_value)
371-
except ValueError:
371+
if (
372+
is_integer(fill_value)
373+
or (is_float(fill_value) and not np.isnan(fill_value))
374+
or isinstance(fill_value, str)
375+
):
376+
# TODO: What about str that can be a timedelta?
372377
dtype = np.dtype(np.object_)
373378
else:
374-
if fv is NaT:
375-
# NaT has no `to_timedelta64` method
376-
fill_value = np.timedelta64("NaT", "ns")
379+
try:
380+
fv = tslibs.Timedelta(fill_value)
381+
except ValueError:
382+
dtype = np.dtype(np.object_)
377383
else:
378-
fill_value = fv.to_timedelta64()
384+
if fv is NaT:
385+
# NaT has no `to_timedelta64` method
386+
fill_value = np.timedelta64("NaT", "ns")
387+
else:
388+
fill_value = fv.to_timedelta64()
379389
elif is_datetime64tz_dtype(dtype):
380390
if isna(fill_value):
381391
fill_value = NaT

pandas/core/frame.py

-8
Original file line numberDiff line numberDiff line change
@@ -6737,14 +6737,6 @@ def apply(
67376737
DataFrame.aggregate: Only perform aggregating type operations.
67386738
DataFrame.transform: Only perform transforming type operations.
67396739
6740-
Notes
6741-
-----
6742-
In the current implementation apply calls `func` twice on the
6743-
first column/row to decide whether it can take a fast or slow
6744-
code path. This can lead to unexpected behavior if `func` has
6745-
side-effects, as they will take effect twice for the first
6746-
column/row.
6747-
67486740
Examples
67496741
--------
67506742

pandas/core/groupby/generic.py

+13-8
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,9 @@ def pinner(cls):
142142
class SeriesGroupBy(GroupBy):
143143
_apply_whitelist = base.series_apply_whitelist
144144

145+
def _iterate_slices(self):
146+
yield self._selection_name, self._selected_obj
147+
145148
@property
146149
def _selection_name(self):
147150
"""
@@ -333,7 +336,7 @@ def _aggregate_multiple_funcs(self, arg, _level):
333336

334337
return DataFrame(results, columns=columns)
335338

336-
def _wrap_output(self, output, index, names=None):
339+
def _wrap_series_output(self, output, index, names=None):
337340
""" common agg/transform wrapping logic """
338341
output = output[self._selection_name]
339342

@@ -346,13 +349,15 @@ def _wrap_output(self, output, index, names=None):
346349
return Series(output, index=index, name=name)
347350

348351
def _wrap_aggregated_output(self, output, names=None):
349-
result = self._wrap_output(
352+
result = self._wrap_series_output(
350353
output=output, index=self.grouper.result_index, names=names
351354
)
352355
return self._reindex_output(result)._convert(datetime=True)
353356

354357
def _wrap_transformed_output(self, output, names=None):
355-
return self._wrap_output(output=output, index=self.obj.index, names=names)
358+
return self._wrap_series_output(
359+
output=output, index=self.obj.index, names=names
360+
)
356361

357362
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
358363
if len(keys) == 0:
@@ -876,7 +881,7 @@ def aggregate(self, func=None, *args, **kwargs):
876881
if self.grouper.nkeys > 1:
877882
return self._python_agg_general(func, *args, **kwargs)
878883
elif args or kwargs:
879-
result = self._aggregate_generic(func, *args, **kwargs)
884+
result = self._aggregate_frame(func, *args, **kwargs)
880885
else:
881886

882887
# try to treat as if we are passing a list
@@ -885,7 +890,7 @@ def aggregate(self, func=None, *args, **kwargs):
885890
[func], _level=_level, _axis=self.axis
886891
)
887892
except Exception:
888-
result = self._aggregate_generic(func)
893+
result = self._aggregate_frame(func)
889894
else:
890895
result.columns = Index(
891896
result.columns.levels[0], name=self._selected_obj.columns.name
@@ -1009,7 +1014,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1):
10091014

10101015
return new_items, new_blocks
10111016

1012-
def _aggregate_generic(self, func, *args, **kwargs):
1017+
def _aggregate_frame(self, func, *args, **kwargs):
10131018
if self.grouper.nkeys != 1:
10141019
raise AssertionError("Number of keys must be 1")
10151020

@@ -1032,7 +1037,7 @@ def _aggregate_generic(self, func, *args, **kwargs):
10321037
wrapper = lambda x: func(x, *args, **kwargs)
10331038
result[name] = data.apply(wrapper, axis=axis)
10341039

1035-
return self._wrap_generic_output(result, obj)
1040+
return self._wrap_frame_output(result, obj)
10361041

10371042
def _aggregate_item_by_item(self, func, *args, **kwargs):
10381043
# only for axis==0
@@ -1516,7 +1521,7 @@ def _gotitem(self, key, ndim, subset=None):
15161521

15171522
raise AssertionError("invalid ndim for _gotitem")
15181523

1519-
def _wrap_generic_output(self, result, obj):
1524+
def _wrap_frame_output(self, result, obj):
15201525
result_index = self.grouper.levels[0]
15211526

15221527
if self.axis == 0:

pandas/core/groupby/groupby.py

+39-29
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ class providing the base-class of operations.
1111
from contextlib import contextmanager
1212
import datetime
1313
from functools import partial, wraps
14+
import inspect
15+
import re
1416
import types
1517
from typing import Dict, FrozenSet, List, Optional, Tuple, Type, Union
1618

@@ -613,48 +615,53 @@ def _make_wrapper(self, name):
613615
return self.apply(lambda self: getattr(self, name))
614616

615617
f = getattr(type(self._selected_obj), name)
618+
sig = inspect.signature(f)
616619

617620
def wrapper(*args, **kwargs):
618621
# a little trickery for aggregation functions that need an axis
619622
# argument
620-
kwargs_with_axis = kwargs.copy()
621-
if "axis" not in kwargs_with_axis or kwargs_with_axis["axis"] is None:
622-
kwargs_with_axis["axis"] = self.axis
623-
624-
def curried_with_axis(x):
625-
return f(x, *args, **kwargs_with_axis)
623+
if "axis" in sig.parameters:
624+
if kwargs.get("axis", None) is None:
625+
kwargs["axis"] = self.axis
626626

627627
def curried(x):
628628
return f(x, *args, **kwargs)
629629

630630
# preserve the name so we can detect it when calling plot methods,
631631
# to avoid duplicates
632-
curried.__name__ = curried_with_axis.__name__ = name
632+
curried.__name__ = name
633633

634634
# special case otherwise extra plots are created when catching the
635635
# exception below
636636
if name in base.plotting_methods:
637637
return self.apply(curried)
638638

639639
try:
640-
return self.apply(curried_with_axis)
641-
except Exception:
642-
try:
643-
return self.apply(curried)
644-
except Exception:
645-
646-
# related to : GH3688
647-
# try item-by-item
648-
# this can be called recursively, so need to raise
649-
# ValueError
650-
# if we don't have this method to indicate to aggregate to
651-
# mark this column as an error
652-
try:
653-
return self._aggregate_item_by_item(name, *args, **kwargs)
654-
except AttributeError:
655-
# e.g. SparseArray has no flags attr
656-
raise ValueError
657-
640+
return self.apply(curried)
641+
except TypeError as err:
642+
if not re.search(
643+
"reduction operation '.*' not allowed for this dtype", str(err)
644+
):
645+
# We don't have a cython implementation
646+
# TODO: is the above comment accurate?
647+
raise
648+
649+
# related to : GH3688
650+
# try item-by-item
651+
# this can be called recursively, so need to raise
652+
# ValueError
653+
# if we don't have this method to indicate to aggregate to
654+
# mark this column as an error
655+
try:
656+
return self._aggregate_item_by_item(name, *args, **kwargs)
657+
except AttributeError:
658+
# e.g. SparseArray has no flags attr
659+
# FIXME: 'SeriesGroupBy' has no attribute '_aggregate_item_by_item'
660+
# occurs in idxmax() case
661+
# in tests.groupby.test_function.test_non_cython_api
662+
raise ValueError
663+
664+
wrapper.__name__ = name
658665
return wrapper
659666

660667
def get_group(self, name, obj=None):
@@ -747,7 +754,7 @@ def _python_apply_general(self, f):
747754
)
748755

749756
def _iterate_slices(self):
750-
yield self._selection_name, self._selected_obj
757+
raise AbstractMethodError(self)
751758

752759
def transform(self, func, *args, **kwargs):
753760
raise AbstractMethodError(self)
@@ -872,6 +879,12 @@ def _cython_transform(self, how, numeric_only=True, **kwargs):
872879
def _wrap_aggregated_output(self, output, names=None):
873880
raise AbstractMethodError(self)
874881

882+
def _wrap_transformed_output(self, output, names=None):
883+
raise AbstractMethodError(self)
884+
885+
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
886+
raise AbstractMethodError(self)
887+
875888
def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1):
876889
output = {}
877890
for name, obj in self._iterate_slices():
@@ -922,9 +935,6 @@ def _python_agg_general(self, func, *args, **kwargs):
922935

923936
return self._wrap_aggregated_output(output)
924937

925-
def _wrap_applied_output(self, *args, **kwargs):
926-
raise AbstractMethodError(self)
927-
928938
def _concat_objects(self, keys, values, not_indexed_same=False):
929939
from pandas.core.reshape.concat import concat
930940

pandas/core/internals/concat.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ def get_empty_dtype_and_na(join_units):
288288
return np.float64, np.nan
289289

290290
if is_uniform_reindex(join_units):
291-
# XXX: integrate property
291+
# FIXME: integrate property
292292
empty_dtype = join_units[0].block.dtype
293293
upcasted_na = join_units[0].block.fill_value
294294
return empty_dtype, upcasted_na
@@ -339,6 +339,7 @@ def get_empty_dtype_and_na(join_units):
339339
if not upcast_classes:
340340
upcast_classes = null_upcast_classes
341341

342+
# TODO: de-duplicate with maybe_promote?
342343
# create the result
343344
if "object" in upcast_classes:
344345
return np.dtype(np.object_), np.nan
@@ -357,7 +358,7 @@ def get_empty_dtype_and_na(join_units):
357358
elif "datetime" in upcast_classes:
358359
return np.dtype("M8[ns]"), tslibs.iNaT
359360
elif "timedelta" in upcast_classes:
360-
return np.dtype("m8[ns]"), tslibs.iNaT
361+
return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")
361362
else: # pragma
362363
try:
363364
g = np.find_common_type(upcast_classes, [])

pandas/core/internals/managers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2035,7 +2035,7 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
20352035
values = b.values
20362036
if copy:
20372037
values = values.copy()
2038-
elif not copy:
2038+
else:
20392039
values = values.view()
20402040
b = b.make_block_same_class(values, placement=placement)
20412041
elif is_uniform_join_units(join_units):

pandas/core/ops/__init__.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -766,9 +766,9 @@ def f(self, other, axis=default_axis, level=None):
766766
return f
767767

768768

769-
def _comp_method_FRAME(cls, func, special):
770-
str_rep = _get_opstr(func)
771-
op_name = _get_op_name(func, special)
769+
def _comp_method_FRAME(cls, op, special):
770+
str_rep = _get_opstr(op)
771+
op_name = _get_op_name(op, special)
772772

773773
@Appender("Wrapper for comparison method {name}".format(name=op_name))
774774
def f(self, other):
@@ -781,18 +781,18 @@ def f(self, other):
781781
raise ValueError(
782782
"Can only compare identically-labeled DataFrame objects"
783783
)
784-
new_data = dispatch_to_series(self, other, func, str_rep)
784+
new_data = dispatch_to_series(self, other, op, str_rep)
785785
return self._construct_result(new_data)
786786

787787
elif isinstance(other, ABCSeries):
788788
return _combine_series_frame(
789-
self, other, func, fill_value=None, axis=None, level=None
789+
self, other, op, fill_value=None, axis=None, level=None
790790
)
791791
else:
792792

793793
# straight boolean comparisons we want to allow all columns
794794
# (regardless of dtype to pass thru) See #4537 for discussion.
795-
new_data = dispatch_to_series(self, other, func)
795+
new_data = dispatch_to_series(self, other, op)
796796
return self._construct_result(new_data)
797797

798798
f.__name__ = op_name

pandas/core/ops/array_ops.py

+15-6
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,14 @@ def masked_arith_op(x, y, op):
118118
return result
119119

120120

121-
def define_na_arithmetic_op(op, str_rep, eval_kwargs):
121+
def define_na_arithmetic_op(op, str_rep: str, eval_kwargs):
122122
def na_op(x, y):
123123
return na_arithmetic_op(x, y, op, str_rep, eval_kwargs)
124124

125125
return na_op
126126

127127

128-
def na_arithmetic_op(left, right, op, str_rep, eval_kwargs):
128+
def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs):
129129
"""
130130
Return the result of evaluating op on the passed in values.
131131
@@ -173,6 +173,7 @@ def arithmetic_op(
173173
Cannot be a DataFrame or Index. Series is *not* excluded.
174174
op : {operator.add, operator.sub, ...}
175175
Or one of the reversed variants from roperator.
176+
str_rep : str
176177
177178
Returns
178179
-------
@@ -279,8 +280,16 @@ def comparison_op(
279280
return res_values
280281

281282

282-
def na_logical_op(x, y, op):
283+
def na_logical_op(x: np.ndarray, y, op):
283284
try:
285+
# For exposition, write:
286+
# yarr = isinstance(y, np.ndarray)
287+
# yint = is_integer(y) or (yarr and y.dtype.kind == "i")
288+
# ybool = is_bool(y) or (yarr and y.dtype.kind == "b")
289+
# xint = x.dtype.kind == "i"
290+
# xbool = x.dtype.kind == "b"
291+
# Then Cases where this goes through without raising include:
292+
# (xint or xbool) and (yint or ybool)
284293
result = op(x, y)
285294
except TypeError:
286295
if isinstance(y, np.ndarray):
@@ -304,9 +313,9 @@ def na_logical_op(x, y, op):
304313
NotImplementedError,
305314
):
306315
raise TypeError(
307-
"cannot compare a dtyped [{dtype}] array "
308-
"with a scalar of type [{typ}]".format(
309-
dtype=x.dtype, typ=type(y).__name__
316+
"Cannot perform '{op}' with a dtyped [{dtype}] array "
317+
"and scalar of type [{typ}]".format(
318+
op=op.__name__, dtype=x.dtype, typ=type(y).__name__
310319
)
311320
)
312321

0 commit comments

Comments
 (0)