CLN: assorted #51318

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged · 4 commits · Feb 13, 2023
2 changes: 1 addition & 1 deletion pandas/_libs/src/parser/io.c
@@ -67,7 +67,7 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,

func = PyObject_GetAttrString(src->obj, "read");

/* TODO: does this release the GIL? */
/* Note: PyObject_CallObject requires the GIL */
result = PyObject_CallObject(func, args);
Py_XDECREF(args);
Py_XDECREF(func);
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/parsing.pyx
@@ -651,6 +651,7 @@ cdef datetime dateutil_parse(
try:
res, _ = DEFAULTPARSER._parse(timestr, dayfirst=dayfirst, yearfirst=yearfirst)
except InvalidOperation:
# GH#51157 dateutil can raise decimal.InvalidOperation
res = None

if res is None:
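The new comment documents a real failure mode: for some malformed inputs, dateutil's internal parser can surface decimal.InvalidOperation rather than an ordinary parse error (GH#51157). A minimal standalone sketch of the guard — it calls the same dateutil-private _parse the hunk uses, so treat it as illustrative rather than public API:

    from decimal import InvalidOperation

    from dateutil.parser import parser

    DEFAULTPARSER = parser()

    def try_parse(timestr, dayfirst=False, yearfirst=False):
        # Mirror the guard above: decimal.InvalidOperation is handled the
        # same way as a failed parse, leaving res as None.
        try:
            res, _ = DEFAULTPARSER._parse(timestr, dayfirst=dayfirst, yearfirst=yearfirst)
        except InvalidOperation:
            res = None
        return res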
10 changes: 6 additions & 4 deletions pandas/_testing/__init__.py
@@ -14,6 +14,7 @@
ContextManager,
Counter,
Iterable,
cast,
)

import numpy as np
@@ -121,6 +122,7 @@
PeriodIndex,
TimedeltaIndex,
)
from pandas.core.arrays import ArrowExtensionArray

_N = 30
_K = 4
@@ -1019,11 +1021,11 @@ def shares_memory(left, right) -> bool:

if isinstance(left, ExtensionArray) and left.dtype == "string[pyarrow]":
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left = cast("ArrowExtensionArray", left)
if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]":
# error: "ExtensionArray" has no attribute "_data"
left_pa_data = left._data # type: ignore[attr-defined]
# error: "ExtensionArray" has no attribute "_data"
right_pa_data = right._data # type: ignore[attr-defined]
right = cast("ArrowExtensionArray", right)
left_pa_data = left._data
right_pa_data = right._data
left_buf1 = left_pa_data.chunk(0).buffers()[1]
right_buf1 = right_pa_data.chunk(0).buffers()[1]
return left_buf1 == right_buf1
5 changes: 3 additions & 2 deletions pandas/compat/numpy/function.py
@@ -20,6 +20,7 @@
from typing import (
Any,
TypeVar,
cast,
overload,
)

@@ -159,8 +160,8 @@ def validate_argsort_with_ascending(ascending: bool | int | None, args, kwargs)
ascending = True

validate_argsort_kind(args, kwargs, max_fname_arg_count=3)
# error: Incompatible return value type (got "int", expected "bool")
return ascending # type: ignore[return-value]
ascending = cast(bool, ascending)
return ascending


CLIP_DEFAULTS: dict[str, Any] = {"out": None}
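Both the _testing and compat/numpy hunks above replace "# type: ignore" comments with typing.cast, which does nothing at runtime and only narrows the type the checker sees. A minimal sketch of the pattern, with hypothetical Payload classes standing in for the pandas types:

    from typing import cast

    class Payload:
        pass

    class ArrowPayload(Payload):
        @property
        def _data(self) -> str:
            return "pyarrow-backed buffer"

    def read_data(obj: Payload) -> str:
        obj = cast(ArrowPayload, obj)  # runtime no-op; narrows the static type
        return obj._data  # attribute access now type-checks without an ignore

    print(read_data(ArrowPayload()))  # pyarrow-backed buffer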
8 changes: 3 additions & 5 deletions pandas/core/apply.py
@@ -588,13 +588,11 @@ class NDFrameApply(Apply):
not GroupByApply or ResamplerWindowApply
"""

obj: DataFrame | Series

@property
def index(self) -> Index:
# error: Argument 1 to "__get__" of "AxisProperty" has incompatible type
# "Union[Series, DataFrame, GroupBy[Any], SeriesGroupBy,
# DataFrameGroupBy, BaseWindow, Resampler]"; expected "Union[DataFrame,
# Series]"
return self.obj.index # type:ignore[arg-type]
return self.obj.index

@property
def agg_axis(self) -> Index:
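The apply.py hunk takes a different route to the same end: instead of casting at each use, the subclass re-declares obj with a narrower annotation, so self.obj.index type-checks without the old arg-type ignore. A sketch of the idea — class names and the base annotation are simplified here:

    import pandas as pd

    class ApplySketch:
        obj: object  # broad annotation on the base class

    class NDFrameApplySketch(ApplySketch):
        # Narrower re-declaration: every self.obj access is now checked
        # against DataFrame | Series instead of the base class's type.
        obj: "pd.DataFrame | pd.Series"

        @property
        def index(self):
            return self.obj.index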
5 changes: 2 additions & 3 deletions pandas/core/arrays/categorical.py
@@ -88,7 +88,6 @@
from pandas.core.algorithms import (
factorize,
take_nd,
unique1d,
)
from pandas.core.arrays._mixins import (
NDArrayBackedExtensionArray,
@@ -2096,8 +2095,8 @@ def unique(self):
['b', 'a']
Categories (3, object): ['a' < 'b' < 'c']
"""
unique_codes = unique1d(self.codes)
return self._from_backing_data(unique_codes)
# pylint: disable=useless-parent-delegation
return super().unique()

def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
# make sure we have correct itemsize for resulting codes
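The docstring output shown in the hunk is unchanged by delegating to super().unique(); for reference, an input that reproduces it (assuming pandas >= 1.3 behavior, where unique() keeps unused categories):

    import pandas as pd

    cat = pd.Categorical(list("baab"), categories=list("abc"), ordered=True)
    print(cat.unique())
    # ['b', 'a']
    # Categories (3, object): ['a' < 'b' < 'c']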
18 changes: 10 additions & 8 deletions pandas/core/frame.py
@@ -3811,6 +3811,8 @@ def _getitem_multilevel(self, key):
# string in the key. If the result is a Series, exclude the
# implied empty string from its name.
if len(result.columns) == 1:
# e.g. test_frame_getitem_multicolumn_empty_level,
# test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
top = result.columns[0]
if isinstance(top, tuple):
top = top[0]
@@ -7822,13 +7824,13 @@ def combine(
result = {}
for col in new_columns:
series = this[col]
otherSeries = other[col]
other_series = other[col]

this_dtype = series.dtype
other_dtype = otherSeries.dtype
other_dtype = other_series.dtype

this_mask = isna(series)
other_mask = isna(otherSeries)
other_mask = isna(other_series)

# don't overwrite columns unnecessarily
# DO propagate if this column is not in the intersection
@@ -7838,9 +7840,9 @@

if do_fill:
series = series.copy()
otherSeries = otherSeries.copy()
other_series = other_series.copy()
series[this_mask] = fill_value
otherSeries[other_mask] = fill_value
other_series[other_mask] = fill_value

if col not in self.columns:
# If self DataFrame does not have col in other DataFrame,
@@ -7855,9 +7857,9 @@
# if we have different dtypes, possibly promote
new_dtype = find_common_type([this_dtype, other_dtype])
series = series.astype(new_dtype, copy=False)
otherSeries = otherSeries.astype(new_dtype, copy=False)
other_series = other_series.astype(new_dtype, copy=False)

arr = func(series, otherSeries)
arr = func(series, other_series)
if isinstance(new_dtype, np.dtype):
# if new_dtype is an EA Dtype, then `func` is expected to return
# the correct dtype without any additional casting
@@ -9919,7 +9921,7 @@ def _dict_round(df: DataFrame, decimals):
except KeyError:
yield vals

def _series_round(ser: Series, decimals: int):
def _series_round(ser: Series, decimals: int) -> Series:
if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
return ser.round(decimals)
return ser
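The frame.py changes are a rename (otherSeries -> other_series), two clarifying comments, and a return annotation; DataFrame.combine itself behaves as before. A quick usage check:

    import pandas as pd

    df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]})
    df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})

    # func receives one column from each frame -- the series/other_series
    # pair in the loop above -- and returns the combined column.
    out = df1.combine(df2, lambda s1, s2: s1.where(s1 > s2, s2))
    print(out)
    #    A  B
    # 0  1  4
    # 1  1  4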
9 changes: 7 additions & 2 deletions pandas/core/groupby/generic.py
@@ -1265,9 +1265,10 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
result = op.agg()
if not is_dict_like(func) and result is not None:
return result
elif relabeling and result is not None:
elif relabeling:
# this should be the only (non-raising) case with relabeling
# used reordered index of columns
result = cast(DataFrame, result)
result = result.iloc[:, order]
result = cast(DataFrame, result)
# error: Incompatible types in assignment (expression has type
@@ -1336,6 +1337,9 @@ def _iterate_slices(self) -> Iterable[Series]:
else:
for label, values in obj.items():
if label in self.exclusions:
# Note: if we tried to just iterate over _obj_with_exclusions,
# we would break test_wrap_agg_out by yielding a column
# that is skipped here but not dropped from obj_with_exclusions
continue

yield values
@@ -1379,6 +1383,7 @@ def _wrap_applied_output(
return result

# GH12824
# using values[0] here breaks test_groupby_apply_none_first
first_not_none = next(com.not_none(*values), None)

if first_not_none is None:
@@ -1817,7 +1822,7 @@ def _indexed_output_to_ndframe(
def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
return self.obj._constructor(mgr)

def _iterate_column_groupbys(self, obj: DataFrame | Series):
def _iterate_column_groupbys(self, obj: DataFrame):
for i, colname in enumerate(obj.columns):
yield colname, SeriesGroupBy(
obj.iloc[:, i],
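The relabeling branch in aggregate() above is reached through named aggregation, where results are reordered and relabeled to match the keyword order; for example:

    import pandas as pd

    df = pd.DataFrame({"kind": ["a", "a", "b"], "height": [1.0, 2.0, 3.0]})

    # Named aggregation exercises the relabeling path.
    out = df.groupby("kind").agg(max_height=("height", "max"))
    print(out)
    #       max_height
    # kind
    # a            2.0
    # b            3.0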
12 changes: 3 additions & 9 deletions pandas/core/groupby/groupby.py
@@ -89,7 +89,6 @@ class providing the base-class of operations.

from pandas.core import (
algorithms,
nanops,
sample,
)
from pandas.core._numba import executor
@@ -1342,10 +1341,6 @@ def f(g):
with np.errstate(all="ignore"):
return func(g, *args, **kwargs)

elif hasattr(nanops, f"nan{func}"):
# TODO: should we wrap this in to e.g. _is_builtin_func?
f = getattr(nanops, f"nan{func}")

else:
raise ValueError(
"func must be a callable if args or kwargs are supplied"
@@ -1417,6 +1412,8 @@ def _python_apply_general(
is_transform,
)

# TODO: I (jbrockmendel) think this should be equivalent to doing grouped_reduce
# on _agg_py_fallback, but trying that here fails a bunch of tests 2023-02-07.
@final
def _python_agg_general(self, func, *args, **kwargs):
func = com.is_builtin_func(func)
@@ -2902,10 +2899,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
out[i, :] = algorithms.take_nd(value_element, indexer)
return out

obj = self._obj_with_exclusions
if self.axis == 1:
obj = obj.T
mgr = obj._mgr
mgr = self._get_data_to_aggregate()
res_mgr = mgr.apply(blk_func)

new_obj = self._wrap_agged_manager(res_mgr)
1 change: 1 addition & 0 deletions pandas/core/groupby/ops.py
@@ -207,6 +207,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
if how in ["var", "mean"] or (
self.kind == "transform" and self.has_dropped_na
):
# has_dropped_na check needed for test_null_group_str_transformer
# result may still include NaN, so we have to cast
values = ensure_float64(values)

4 changes: 0 additions & 4 deletions pandas/core/internals/blocks.py
@@ -630,7 +630,6 @@ def _replace_regex(
to_replace,
value,
inplace: bool = False,
convert: bool = True,
mask=None,
) -> list[Block]:
"""
@@ -644,8 +643,6 @@
Replacement object.
inplace : bool, default False
Perform inplace modification.
convert : bool, default True
If true, try to coerce any object types to better types.
mask : array-like of bool, optional
True indicate corresponding element is ignored.

@@ -788,7 +785,6 @@ def _replace_coerce(
to_replace,
value,
inplace=inplace,
convert=False,
mask=mask,
)
else:
4 changes: 4 additions & 0 deletions pandas/core/nanops.py
@@ -1512,6 +1512,10 @@ def _maybe_null_out(
Dtype
The product of all elements on a given axis. ( NaNs are treated as 1)
"""
if mask is None and min_count == 0:
# nothing to check; short-circuit
return result

if axis is not None and isinstance(result, np.ndarray):
if mask is not None:
null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
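The early return added to _maybe_null_out short-circuits the default case (no mask, min_count == 0). min_count itself is user-visible through reductions; a small demonstration of the semantics the function implements:

    import pandas as pd

    s = pd.Series([pd.NA], dtype="Float64")
    print(s.sum(min_count=0))  # 0.0 -- an empty sum is allowed
    print(s.sum(min_count=1))  # <NA> -- fewer than 1 valid value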
3 changes: 3 additions & 0 deletions pandas/io/pytables.py
@@ -4045,6 +4045,9 @@ def get_blk_items(mgr):
blocks = list(mgr.blocks)
blk_items = get_blk_items(mgr)
for c in data_columns:
# This reindex would raise ValueError if we had a duplicate
# index, so we can infer that (as long as axis==1) we
# get a single column back, so a single block.
mgr = frame.reindex([c], axis=axis)._mgr
mgr = cast(BlockManager, mgr)
blocks.extend(mgr.blocks)
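The inference in the new pytables comment can be checked directly: reindex raises on a duplicated axis, so when it succeeds with a single label and axis=1, exactly one column (hence one block) comes back:

    import pandas as pd

    df = pd.DataFrame([[1, 2]], columns=["a", "a"])
    try:
        df.reindex(["a"], axis=1)
    except ValueError as err:
        print(err)  # e.g. "cannot reindex on an axis with duplicate labels"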
2 changes: 1 addition & 1 deletion pandas/tests/arithmetic/test_period.py
@@ -1574,7 +1574,7 @@ def test_pi_sub_period(self):
assert result.freq == exp.freq

def test_pi_sub_pdnat(self):
# GH#13071
# GH#13071, GH#19389
idx = PeriodIndex(
["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx"
)
1 change: 0 additions & 1 deletion pandas/tests/dtypes/test_common.py
@@ -496,7 +496,6 @@ def test_is_datetime_or_timedelta_dtype():
assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2]))
assert not com.is_datetime_or_timedelta_dtype(np.array(["a", "b"]))

# TODO(jreback), this is slightly suspect
assert not com.is_datetime_or_timedelta_dtype(DatetimeTZDtype("ns", "US/Eastern"))

assert com.is_datetime_or_timedelta_dtype(np.datetime64)
2 changes: 2 additions & 0 deletions pandas/tests/dtypes/test_missing.py
@@ -568,6 +568,7 @@ def test_array_equivalent_nested(strict_nan):
assert not array_equivalent(left, right, strict_nan=strict_nan)


@pytest.mark.filterwarnings("ignore:elementwise comparison failed:DeprecationWarning")
@pytest.mark.parametrize(
"strict_nan", [pytest.param(True, marks=pytest.mark.xfail), False]
)
@@ -610,6 +611,7 @@ def test_array_equivalent_nested_list(strict_nan):
assert not array_equivalent(left, right, strict_nan=strict_nan)


@pytest.mark.filterwarnings("ignore:elementwise comparison failed:DeprecationWarning")
@pytest.mark.xfail(reason="failing")
@pytest.mark.parametrize("strict_nan", [True, False])
def test_array_equivalent_nested_mixed_list(strict_nan):
2 changes: 0 additions & 2 deletions pandas/tests/groupby/test_allowlist.py
@@ -314,8 +314,6 @@ def test_all_methods_categorized(mframe):

# removed a public method?
all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
print(names)
print(all_categorized)
if names != all_categorized:
msg = f"""
Some methods which are supposed to be on the Grouper class
4 changes: 0 additions & 4 deletions pandas/tests/groupby/test_bin_groupby.py
@@ -63,7 +63,3 @@ def test_generate_bins(binner, closed, expected):
values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
result = lib.generate_bins_dt64(values, binner, closed=closed)
tm.assert_numpy_array_equal(result, expected)


class TestMoments:
pass
15 changes: 5 additions & 10 deletions pandas/tests/groupby/test_filters.py
@@ -369,8 +369,7 @@ def test_filter_and_transform_with_non_unique_int_index():
tm.assert_series_equal(actual, expected)

actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
NA = np.nan
expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)

@@ -412,8 +411,7 @@ def test_filter_and_transform_with_multiple_non_unique_int_index():
tm.assert_series_equal(actual, expected)

actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
NA = np.nan
expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)

@@ -455,8 +453,7 @@ def test_filter_and_transform_with_non_unique_float_index():
tm.assert_series_equal(actual, expected)

actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
NA = np.nan
expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)

@@ -501,8 +498,7 @@ def test_filter_and_transform_with_non_unique_timestamp_index():
tm.assert_series_equal(actual, expected)

actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
NA = np.nan
expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)

@@ -544,8 +540,7 @@ def test_filter_and_transform_with_non_unique_string_index():
tm.assert_series_equal(actual, expected)

actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
NA = np.nan
expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
