Skip to content

Commit e0fddd8

Browse files
committed
Merge branch 'master' of https://github.com/pandas-dev/pandas into depr_fallback_agg_dict
Conflicts: pandas/tests/resample/test_resample_api.py
2 parents a1f7277 + 5441d4e commit e0fddd8

File tree

14 files changed

+180
-26
lines changed

14 files changed

+180
-26
lines changed

.pre-commit-config.yaml

+9
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,15 @@ repos:
8181
- flake8-comprehensions==3.1.0
8282
- flake8-bugbear==21.3.2
8383
- pandas-dev-flaker==0.2.0
84+
- repo: local
85+
hooks:
86+
- id: pyright
87+
name: pyright
88+
entry: pyright
89+
language: node
90+
pass_filenames: false
91+
types: [python]
92+
additional_dependencies: ['[email protected]']
8493
- repo: local
8594
hooks:
8695
- id: flake8-rst

asv_bench/benchmarks/io/style.py

+14
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@ def peakmem_classes_render(self, cols, rows):
3434
self._style_classes()
3535
self.st._render_html(True, True)
3636

37+
def time_tooltips_render(self, cols, rows):
38+
self._style_tooltips()
39+
self.st._render_html(True, True)
40+
41+
def peakmem_tooltips_render(self, cols, rows):
42+
self._style_tooltips()
43+
self.st._render_html(True, True)
44+
3745
def time_format_render(self, cols, rows):
3846
self._style_format()
3947
self.st._render_html(True, True)
@@ -77,3 +85,9 @@ def _style_apply_format_hide(self):
7785
self.st.format("{:.3f}")
7886
self.st.hide_index(self.st.index[1:])
7987
self.st.hide_columns(self.st.columns[1:])
88+
89+
def _style_tooltips(self):
90+
ttips = DataFrame("abc", index=self.df.index[::2], columns=self.df.columns[::2])
91+
self.st = self.df.style.set_tooltips(ttips)
92+
self.st.hide_index(self.st.index[12:])
93+
self.st.hide_columns(self.st.columns[12:])

doc/source/whatsnew/v1.4.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@ Categorical
379379
Datetimelike
380380
^^^^^^^^^^^^
381381
- Bug in :class:`DataFrame` constructor unnecessarily copying non-datetimelike 2D object arrays (:issue:`39272`)
382+
- Bug in :func:`to_datetime` with ``format`` and ``pandas.NA`` was raising ``ValueError`` (:issue:`42957`)
382383
- :func:`to_datetime` would silently swap ``MM/DD/YYYY`` and ``DD/MM/YYYY`` formats if the given ``dayfirst`` option could not be respected - now, a warning is raised in the case of delimited date strings (e.g. ``31-12-2012``) (:issue:`12585`)
383384
-
384385

@@ -427,6 +428,7 @@ Indexing
427428
- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`)
428429
- Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`)
429430
- Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`)
431+
- Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`)
430432
- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)
431433

432434

pandas/_libs/tslibs/strptime.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ from numpy cimport (
2020
ndarray,
2121
)
2222

23+
from pandas._libs.missing cimport checknull_with_nat_and_na
2324
from pandas._libs.tslibs.nattype cimport (
2425
NPY_NAT,
2526
c_nat_strings as nat_strings,
26-
checknull_with_nat,
2727
)
2828
from pandas._libs.tslibs.np_datetime cimport (
2929
check_dts_bounds,
@@ -134,7 +134,7 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='
134134
iresult[i] = NPY_NAT
135135
continue
136136
else:
137-
if checknull_with_nat(val):
137+
if checknull_with_nat_and_na(val):
138138
iresult[i] = NPY_NAT
139139
continue
140140
else:

pandas/core/computation/expr.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,10 @@ def f(self, *args, **kwargs):
265265
return f
266266

267267

268-
_T = TypeVar("_T", bound="BaseExprVisitor")
268+
# should be bound by BaseExprVisitor but that creates a circular dependency:
269+
# _T is used in disallow, but disallow is used to define BaseExprVisitor
270+
# https://github.com/microsoft/pyright/issues/2315
271+
_T = TypeVar("_T")
269272

270273

271274
def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]:
@@ -279,11 +282,13 @@ def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]:
279282
"""
280283

281284
def disallowed(cls: type[_T]) -> type[_T]:
282-
cls.unsupported_nodes = ()
285+
# error: "Type[_T]" has no attribute "unsupported_nodes"
286+
cls.unsupported_nodes = () # type: ignore[attr-defined]
283287
for node in nodes:
284288
new_method = _node_not_implemented(node)
285289
name = f"visit_{node}"
286-
cls.unsupported_nodes += (name,)
290+
# error: "Type[_T]" has no attribute "unsupported_nodes"
291+
cls.unsupported_nodes += (name,) # type: ignore[attr-defined]
287292
setattr(cls, name, new_method)
288293
return cls
289294

@@ -702,7 +707,8 @@ def visit_Call(self, node, side=None, **kwargs):
702707
if key.arg:
703708
kwargs[key.arg] = self.visit(key.value).value
704709

705-
return self.const_type(res(*new_args, **kwargs), self.env)
710+
name = self.env.add_tmp(res(*new_args, **kwargs))
711+
return self.term_type(name=name, env=self.env)
706712

707713
def translate_In(self, op):
708714
return op

pandas/core/groupby/generic.py

+12-8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
Mapping,
2222
TypeVar,
2323
Union,
24+
cast,
2425
)
2526
import warnings
2627

@@ -30,7 +31,9 @@
3031
from pandas._typing import (
3132
ArrayLike,
3233
FrameOrSeries,
34+
Manager,
3335
Manager2D,
36+
SingleManager,
3437
)
3538
from pandas.util._decorators import (
3639
Appender,
@@ -80,7 +83,6 @@
8083
Index,
8184
MultiIndex,
8285
all_indexes_same,
83-
default_index,
8486
)
8587
from pandas.core.series import Series
8688
from pandas.core.util.numba_ import maybe_use_numba
@@ -159,19 +161,21 @@ def pinner(cls):
159161
class SeriesGroupBy(GroupBy[Series]):
160162
_apply_allowlist = base.series_apply_allowlist
161163

162-
def _wrap_agged_manager(self, mgr: Manager2D) -> Series:
163-
single = mgr.iget(0)
164+
def _wrap_agged_manager(self, mgr: Manager) -> Series:
165+
if mgr.ndim == 1:
166+
mgr = cast(SingleManager, mgr)
167+
single = mgr
168+
else:
169+
mgr = cast(Manager2D, mgr)
170+
single = mgr.iget(0)
164171
ser = self.obj._constructor(single, name=self.obj.name)
165172
# NB: caller is responsible for setting ser.index
166173
return ser
167174

168-
def _get_data_to_aggregate(self) -> Manager2D:
175+
def _get_data_to_aggregate(self) -> SingleManager:
169176
ser = self._obj_with_exclusions
170177
single = ser._mgr
171-
columns = default_index(1)
172-
# Much faster than using ser.to_frame() since we avoid inferring columns
173-
# from scalar
174-
return single.to_2d_mgr(columns)
178+
return single
175179

176180
def _iterate_slices(self) -> Iterable[Series]:
177181
yield self._selected_obj

pandas/core/groupby/groupby.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -1745,6 +1745,8 @@ def count(self) -> Series | DataFrame:
17451745
ids, _, ngroups = self.grouper.group_info
17461746
mask = ids != -1
17471747

1748+
is_series = data.ndim == 1
1749+
17481750
def hfunc(bvalues: ArrayLike) -> ArrayLike:
17491751
# TODO(2DEA): reshape would not be necessary with 2D EAs
17501752
if bvalues.ndim == 1:
@@ -1754,6 +1756,10 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
17541756
masked = mask & ~isna(bvalues)
17551757

17561758
counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1)
1759+
if is_series:
1760+
assert counted.ndim == 2
1761+
assert counted.shape[0] == 1
1762+
return counted[0]
17571763
return counted
17581764

17591765
new_mgr = data.grouped_reduce(hfunc)
@@ -2702,7 +2708,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
27022708
mgr = self._get_data_to_aggregate()
27032709

27042710
res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
2705-
if len(res_mgr.items) != len(mgr.items):
2711+
if not is_ser and len(res_mgr.items) != len(mgr.items):
27062712
warnings.warn(
27072713
"Dropping invalid columns in "
27082714
f"{type(self).__name__}.quantile is deprecated. "
@@ -3134,14 +3140,15 @@ def blk_func(values: ArrayLike) -> ArrayLike:
31343140
obj = self._obj_with_exclusions
31353141

31363142
# Operate block-wise instead of column-by-column
3137-
orig_ndim = obj.ndim
3143+
is_ser = obj.ndim == 1
31383144
mgr = self._get_data_to_aggregate()
31393145

31403146
if numeric_only:
31413147
mgr = mgr.get_numeric_data()
31423148

31433149
res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
3144-
if len(res_mgr.items) != len(mgr.items):
3150+
3151+
if not is_ser and len(res_mgr.items) != len(mgr.items):
31453152
howstr = how.replace("group_", "")
31463153
warnings.warn(
31473154
"Dropping invalid columns in "
@@ -3162,7 +3169,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
31623169
# We should never get here
31633170
raise TypeError("All columns were dropped in grouped_reduce")
31643171

3165-
if orig_ndim == 1:
3172+
if is_ser:
31663173
out = self._wrap_agged_manager(res_mgr)
31673174
out.index = self.grouper.result_index
31683175
else:

pandas/core/internals/base.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
)
1111

1212
from pandas._typing import (
13+
ArrayLike,
1314
DtypeObj,
1415
Shape,
1516
)
@@ -18,7 +19,10 @@
1819
from pandas.core.dtypes.cast import find_common_type
1920

2021
from pandas.core.base import PandasObject
21-
from pandas.core.indexes.api import Index
22+
from pandas.core.indexes.api import (
23+
Index,
24+
default_index,
25+
)
2226

2327
T = TypeVar("T", bound="DataManager")
2428

@@ -171,6 +175,23 @@ def setitem_inplace(self, indexer, value) -> None:
171175
"""
172176
self.array[indexer] = value
173177

178+
def grouped_reduce(self, func, ignore_failures: bool = False):
179+
"""
180+
ignore_failures : bool, default False
181+
Not used; for compatibility with ArrayManager/BlockManager.
182+
"""
183+
184+
arr = self.array
185+
res = func(arr)
186+
index = default_index(len(res))
187+
188+
mgr = type(self).from_array(res, index)
189+
return mgr
190+
191+
@classmethod
192+
def from_array(cls, arr: ArrayLike, index: Index):
193+
raise AbstractMethodError(cls)
194+
174195

175196
def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None:
176197
"""

pandas/core/reshape/tile.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""
22
Quantilization functions and related stuff
33
"""
4+
from __future__ import annotations
5+
46
from typing import (
57
Any,
68
Callable,

pandas/core/tools/timedeltas.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
timedelta support tools
33
"""
4+
from __future__ import annotations
45

56
import numpy as np
67

pandas/io/formats/style_render.py

+9-7
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def _translate(
294294
d.update({"table_attributes": table_attr})
295295

296296
if self.tooltips:
297-
d = self.tooltips._translate(self.data, self.uuid, d)
297+
d = self.tooltips._translate(self, d)
298298

299299
return d
300300

@@ -1508,7 +1508,7 @@ def _pseudo_css(self, uuid: str, name: str, row: int, col: int, text: str):
15081508
},
15091509
]
15101510

1511-
def _translate(self, styler_data: DataFrame | Series, uuid: str, d: dict):
1511+
def _translate(self, styler: StylerRenderer, d: dict):
15121512
"""
15131513
Mutate the render dictionary to allow for tooltips:
15141514
@@ -1529,21 +1529,23 @@ def _translate(self, styler_data: DataFrame | Series, uuid: str, d: dict):
15291529
-------
15301530
render_dict : Dict
15311531
"""
1532-
self.tt_data = self.tt_data.reindex_like(styler_data)
1533-
1532+
self.tt_data = self.tt_data.reindex_like(styler.data)
15341533
if self.tt_data.empty:
15351534
return d
15361535

15371536
name = self.class_name
1538-
15391537
mask = (self.tt_data.isna()) | (self.tt_data.eq("")) # empty string = no ttip
15401538
self.table_styles = [
15411539
style
15421540
for sublist in [
1543-
self._pseudo_css(uuid, name, i, j, str(self.tt_data.iloc[i, j]))
1541+
self._pseudo_css(styler.uuid, name, i, j, str(self.tt_data.iloc[i, j]))
15441542
for i in range(len(self.tt_data.index))
15451543
for j in range(len(self.tt_data.columns))
1546-
if not mask.iloc[i, j]
1544+
if not (
1545+
mask.iloc[i, j]
1546+
or i in styler.hidden_rows
1547+
or j in styler.hidden_columns
1548+
)
15471549
]
15481550
for style in sublist
15491551
]

pandas/tests/frame/test_query_eval.py

+20
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,26 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
731731
result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
732732
tm.assert_frame_equal(result, expected)
733733

734+
def test_method_calls_in_query(self):
735+
# https://github.com/pandas-dev/pandas/issues/22435
736+
n = 10
737+
df = DataFrame({"a": 2 * np.random.rand(n), "b": np.random.rand(n)})
738+
expected = df[df["a"].astype("int") == 0]
739+
result = df.query(
740+
"a.astype('int') == 0", engine=self.engine, parser=self.parser
741+
)
742+
tm.assert_frame_equal(result, expected)
743+
744+
df = DataFrame(
745+
{
746+
"a": np.where(np.random.rand(n) < 0.5, np.nan, np.random.randn(n)),
747+
"b": np.random.randn(n),
748+
}
749+
)
750+
expected = df[df["a"].notnull()]
751+
result = df.query("a.notnull()", engine=self.engine, parser=self.parser)
752+
tm.assert_frame_equal(result, expected)
753+
734754

735755
@td.skip_if_no_ne
736756
class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas):

pandas/tests/tools/test_to_datetime.py

+22
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,28 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected):
177177
result = to_datetime(input_s, format="%Y%m%d", errors="coerce")
178178
tm.assert_series_equal(result, expected)
179179

180+
@pytest.mark.parametrize(
181+
"data, format, expected",
182+
[
183+
([pd.NA], "%Y%m%d%H%M%S", DatetimeIndex(["NaT"])),
184+
([pd.NA], None, DatetimeIndex(["NaT"])),
185+
(
186+
[pd.NA, "20210202202020"],
187+
"%Y%m%d%H%M%S",
188+
DatetimeIndex(["NaT", "2021-02-02 20:20:20"]),
189+
),
190+
(["201010", pd.NA], "%y%m%d", DatetimeIndex(["2020-10-10", "NaT"])),
191+
(["201010", pd.NA], "%d%m%y", DatetimeIndex(["2010-10-20", "NaT"])),
192+
(["201010", pd.NA], None, DatetimeIndex(["2010-10-20", "NaT"])),
193+
([None, np.nan, pd.NA], None, DatetimeIndex(["NaT", "NaT", "NaT"])),
194+
([None, np.nan, pd.NA], "%Y%m%d", DatetimeIndex(["NaT", "NaT", "NaT"])),
195+
],
196+
)
197+
def test_to_datetime_with_NA(self, data, format, expected):
198+
# GH#42957
199+
result = to_datetime(data, format=format)
200+
tm.assert_index_equal(result, expected)
201+
180202
@pytest.mark.parametrize("cache", [True, False])
181203
def test_to_datetime_format_integer(self, cache):
182204
# GH 10178

0 commit comments

Comments
 (0)