pandas-dev
diff --git a/‎ci/code_checks.sh
-2 b/‎ci/code_checks.sh
-2
diff --git a/‎doc/source/user_guide/categorical.rst
-8 b/‎doc/source/user_guide/categorical.rst
-8
diff --git a/‎doc/source/whatsnew/v2.0.0.rst
+3 b/‎doc/source/whatsnew/v2.0.0.rst
+3
diff --git a/‎doc/source/whatsnew/v2.1.0.rst
+1 b/‎doc/source/whatsnew/v2.1.0.rst
+1
diff --git a/‎pandas/_libs/tslib.pyx
-4 b/‎pandas/_libs/tslib.pyx
-4
diff --git a/‎pandas/_libs/tslibs/conversion.pyx
+14-30 b/‎pandas/_libs/tslibs/conversion.pyx
+14-30
diff --git a/‎pandas/_libs/tslibs/np_datetime.pyx
+2 b/‎pandas/_libs/tslibs/np_datetime.pyx
+2
diff --git a/‎pandas/_typing.py
+8-1 b/‎pandas/_typing.py
+8-1
diff --git a/‎pandas/core/arrays/string_.py
+12-9 b/‎pandas/core/arrays/string_.py
+12-9
diff --git a/‎pandas/core/dtypes/common.py
+2 b/‎pandas/core/dtypes/common.py
+2
diff --git a/‎pandas/core/frame.py
+14-14 b/‎pandas/core/frame.py
+14-14
diff --git a/‎pandas/core/generic.py
+3-3 b/‎pandas/core/generic.py
+3-3
diff --git a/‎pandas/core/groupby/generic.py
+2-8 b/‎pandas/core/groupby/generic.py
+2-8
diff --git a/‎pandas/core/indexes/multi.py
+2 b/‎pandas/core/indexes/multi.py
+2
@@ -564,8 +564,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.api.types.is_datetime64_any_dtype \
         pandas.api.types.is_datetime64_ns_dtype \
         pandas.api.types.is_datetime64tz_dtype \
-        pandas.api.types.is_integer_dtype \
-        pandas.api.types.is_string_dtype \
         pandas.plotting.andrews_curves \
         pandas.plotting.autocorrelation_plot \
         pandas.plotting.lag_plot \
 
@@ -263,14 +263,6 @@ All instances of ``CategoricalDtype`` compare equal to the string ``'category'``
 
    c1 == "category"
 
-.. warning::
-
-   Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``,
-   and since all instances ``CategoricalDtype`` compare equal to ``'category'``,
-   all instances of ``CategoricalDtype`` compare equal to a
-   ``CategoricalDtype(None, False)``, regardless of ``categories`` or
-   ``ordered``.
-
 Description
 -----------
 
 
@@ -1214,6 +1214,7 @@ Datetimelike
 - Bug in :func:`DataFrame.from_records` when given a :class:`DataFrame` input with timezone-aware datetime64 columns incorrectly dropping the timezone-awareness (:issue:`51162`)
 - Bug in :func:`to_datetime` was raising ``decimal.InvalidOperation`` when parsing date strings with ``errors='coerce'`` (:issue:`51084`)
 - Bug in :func:`to_datetime` with both ``unit`` and ``origin`` specified returning incorrect results (:issue:`42624`)
+- Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` when converting an object-dtype object containing timezone-aware datetimes or strings to ``datetime64[ns]`` incorrectly localizing as UTC instead of raising ``TypeError`` (:issue:`50140`)
 - Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` with datetime or timedelta dtypes giving incorrect results for groups containing ``NaT`` (:issue:`51373`)
 - Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` incorrectly raising with :class:`PeriodDtype` or :class:`DatetimeTZDtype` (:issue:`51373`)
 
@@ -1276,6 +1277,7 @@ Indexing
 - Bug in :meth:`DataFrame.sort_values` where ``None`` was not returned when ``by`` is empty list and ``inplace=True`` (:issue:`50643`)
 - Bug in :meth:`DataFrame.loc` coercing dtypes when setting values with a list indexer (:issue:`49159`)
 - Bug in :meth:`Series.loc` raising error for out of bounds end of slice indexer (:issue:`50161`)
+- Bug in :meth:`DataFrame.loc` raising ``ValueError`` with all ``False`` ``bool`` indexer and empty object (:issue:`51450`)
 - Bug in :meth:`DataFrame.loc` raising ``ValueError`` with ``bool`` indexer and :class:`MultiIndex` (:issue:`47687`)
 - Bug in :meth:`DataFrame.loc` raising ``IndexError`` when setting values for a pyarrow-backed column with a non-scalar indexer (:issue:`50085`)
 - Bug in :meth:`DataFrame.__getitem__`, :meth:`Series.__getitem__`, :meth:`DataFrame.__setitem__` and :meth:`Series.__setitem__`
@@ -1377,6 +1379,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.DataFrameGroupBy.agg` with ``engine="numba"`` failing to respect ``as_index=False`` (:issue:`51228`)
 - Bug in :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, and :meth:`.Resampler.agg` would ignore arguments when passed a list of functions (:issue:`50863`)
 - Bug in :meth:`.DataFrameGroupBy.ohlc` ignoring ``as_index=False`` (:issue:`51413`)
+- Bug in :meth:`.DataFrameGroupBy.agg` after subsetting columns (e.g. ``.groupby(...)[["a", "b"]]``) would not include groupings in the result (:issue:`51186`)
 
 Reshaping
 ^^^^^^^^^
 
@@ -102,6 +102,7 @@ Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`)
 - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
+- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
 -
 
 .. ---------------------------------------------------------------------------
 
@@ -56,7 +56,6 @@ from pandas._libs.tslibs.conversion cimport (
     convert_timezone,
     get_datetime64_nanos,
     parse_pydatetime,
-    precision_from_unit,
 )
 from pandas._libs.tslibs.nattype cimport (
     NPY_NAT,
@@ -258,7 +257,6 @@ def array_with_unit_to_datetime(
     """
     cdef:
         Py_ssize_t i, n=len(values)
-        int64_t mult
         bint is_ignore = errors == "ignore"
         bint is_coerce = errors == "coerce"
         bint is_raise = errors == "raise"
@@ -275,8 +273,6 @@ def array_with_unit_to_datetime(
         )
         return result, tz
 
-    mult, _ = precision_from_unit(unit)
-
     result = np.empty(n, dtype="M8[ns]")
     iresult = result.view("i8")
 
 
@@ -37,6 +37,7 @@ from pandas._libs.tslibs.np_datetime cimport (
     NPY_FR_us,
     check_dts_bounds,
     convert_reso,
+    get_conversion_factor,
     get_datetime64_unit,
     get_datetime64_value,
     get_implementation_bounds,
@@ -83,9 +84,9 @@ TD64NS_DTYPE = np.dtype("m8[ns]")
 # Unit Conversion Helpers
 
 cdef int64_t cast_from_unit(
-        object ts,
-        str unit,
-        NPY_DATETIMEUNIT out_reso=NPY_FR_ns
+    object ts,
+    str unit,
+    NPY_DATETIMEUNIT out_reso=NPY_FR_ns
 ) except? -1:
     """
     Return a casting of the unit represented to nanoseconds
@@ -104,12 +105,6 @@ cdef int64_t cast_from_unit(
         int64_t m
         int p
 
-    m, p = precision_from_unit(unit, out_reso)
-
-    # just give me the unit back
-    if ts is None:
-        return m
-
     if unit in ["Y", "M"]:
         if is_float_object(ts) and not ts.is_integer():
             # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01,
@@ -126,6 +121,8 @@ cdef int64_t cast_from_unit(
         dt64obj = np.datetime64(ts, unit)
         return get_datetime64_nanos(dt64obj, out_reso)
 
+    m, p = precision_from_unit(unit, out_reso)
+
     # cast the unit, multiply base/frac separately
     # to avoid precision issues from float -> int
     try:
@@ -148,8 +145,8 @@ cdef int64_t cast_from_unit(
 
 
 cpdef inline (int64_t, int) precision_from_unit(
-        str unit,
-        NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
+    str unit,
+    NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
 ):
     """
     Return a casting of the unit represented to nanoseconds + the precision
@@ -166,34 +163,21 @@ cpdef inline (int64_t, int) precision_from_unit(
         int p
         NPY_DATETIMEUNIT reso = abbrev_to_npy_unit(unit)
 
-    multiplier = periods_per_second(out_reso)
-
+    if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
+        reso = NPY_DATETIMEUNIT.NPY_FR_ns
     if reso == NPY_DATETIMEUNIT.NPY_FR_Y:
         # each 400 years we have 97 leap years, for an average of 97/400=.2425
         #  extra days each year. We get 31556952 by writing
         #  3600*24*365.2425=31556952
+        multiplier = periods_per_second(out_reso)
         m = multiplier * 31556952
     elif reso == NPY_DATETIMEUNIT.NPY_FR_M:
         # 2629746 comes from dividing the "Y" case by 12.
+        multiplier = periods_per_second(out_reso)
         m = multiplier * 2629746
-    elif reso == NPY_DATETIMEUNIT.NPY_FR_W:
-        m = multiplier * 3600 * 24 * 7
-    elif reso == NPY_DATETIMEUNIT.NPY_FR_D:
-        m = multiplier * 3600 * 24
-    elif reso == NPY_DATETIMEUNIT.NPY_FR_h:
-        m = multiplier * 3600
-    elif reso == NPY_DATETIMEUNIT.NPY_FR_m:
-        m = multiplier * 60
-    elif reso == NPY_DATETIMEUNIT.NPY_FR_s:
-        m = multiplier
-    elif reso == NPY_DATETIMEUNIT.NPY_FR_ms:
-        m = multiplier // 1_000
-    elif reso == NPY_DATETIMEUNIT.NPY_FR_us:
-        m = multiplier // 1_000_000
-    elif reso == NPY_DATETIMEUNIT.NPY_FR_ns or reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
-        m = multiplier // 1_000_000_000
     else:
-        raise ValueError(f"cannot cast unit {unit}")
+        m = get_conversion_factor(reso, out_reso)
+
     p = <int>log10(m)  # number of digits in 'm' minus 1
     return m, p
 
 
@@ -571,6 +571,8 @@ cdef int64_t get_conversion_factor(
         return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit)
     elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs:
         return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit)
+    else:
+        raise ValueError("Converting from M or Y units is not supported.")
 
 
 cdef int64_t convert_reso(
 
@@ -94,6 +94,13 @@
 AnyArrayLike = Union[ArrayLike, "Index", "Series"]
 TimeArrayLike = Union["DatetimeArray", "TimedeltaArray"]
 
+# list-like
+
+# Cannot use `Sequence` because a string is a sequence, and we don't want to
+# accept that.  Could refine if https://github.com/python/typing/issues/256 is
+# resolved to differentiate between Sequence[str] and str
+ListLike = Union[AnyArrayLike, List, range]
+
 # scalars
 
 PythonScalar = Union[str, float, bool]
@@ -130,7 +137,7 @@
 Ordered = Optional[bool]
 JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
 Frequency = Union[str, "BaseOffset"]
-Axes = Union[AnyArrayLike, List, range]
+Axes = ListLike
 
 RandomState = Union[
     int,
 
@@ -203,16 +203,19 @@ def __from_arrow__(
                 # pyarrow.ChunkedArray
                 chunks = array.chunks
 
-            results = []
-            for arr in chunks:
-                # using _from_sequence to ensure None is converted to NA
-                str_arr = StringArray._from_sequence(np.array(arr))
-                results.append(str_arr)
-
-        if results:
-            return StringArray._concat_same_type(results)
+        if len(chunks) == 0:
+            arr = np.array([], dtype=object)
         else:
-            return StringArray(np.array([], dtype="object"))
+            arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
+            arr = lib.convert_nans_to_NA(arr)
+        # Bypass validation inside StringArray constructor, see GH#47781
+        new_string_array = StringArray.__new__(StringArray)
+        NDArrayBacked.__init__(
+            new_string_array,
+            arr,
+            StringDtype(storage="python"),
+        )
+        return new_string_array
 
 
 class BaseStringArray(ExtensionArray):
 
@@ -523,6 +523,7 @@ def is_string_dtype(arr_or_dtype) -> bool:
 
     Examples
     --------
+    >>> from pandas.api.types import is_string_dtype
     >>> is_string_dtype(str)
     True
     >>> is_string_dtype(object)
@@ -674,6 +675,7 @@ def is_integer_dtype(arr_or_dtype) -> bool:
 
     Examples
     --------
+    >>> from pandas.api.types import is_integer_dtype
     >>> is_integer_dtype(str)
     False
     >>> is_integer_dtype(int)
 
@@ -1134,9 +1134,9 @@ def _repr_html_(self) -> str | None:
     def to_string(
         self,
         buf: None = ...,
-        columns: Sequence[str] | None = ...,
+        columns: Axes | None = ...,
         col_space: int | list[int] | dict[Hashable, int] | None = ...,
-        header: bool | Sequence[str] = ...,
+        header: bool | list[str] = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: fmt.FormattersType | None = ...,
@@ -1159,9 +1159,9 @@ def to_string(
     def to_string(
         self,
         buf: FilePath | WriteBuffer[str],
-        columns: Sequence[str] | None = ...,
+        columns: Axes | None = ...,
         col_space: int | list[int] | dict[Hashable, int] | None = ...,
-        header: bool | Sequence[str] = ...,
+        header: bool | list[str] = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: fmt.FormattersType | None = ...,
@@ -1181,8 +1181,8 @@ def to_string(
         ...
 
     @Substitution(
-        header_type="bool or sequence of str",
-        header="Write out the column names. If a list of strings "
+        header_type="bool or list of str",
+        header="Write out the column names. If a list of columns "
         "is given, it is assumed to be aliases for the "
         "column names",
         col_space_type="int, list or dict of int",
@@ -1194,9 +1194,9 @@ def to_string(
     def to_string(
         self,
         buf: FilePath | WriteBuffer[str] | None = None,
-        columns: Sequence[str] | None = None,
+        columns: Axes | None = None,
         col_space: int | list[int] | dict[Hashable, int] | None = None,
-        header: bool | Sequence[str] = True,
+        header: bool | list[str] = True,
         index: bool = True,
         na_rep: str = "NaN",
         formatters: fmt.FormattersType | None = None,
@@ -2965,9 +2965,9 @@ def to_orc(
     def to_html(
         self,
         buf: FilePath | WriteBuffer[str],
-        columns: Sequence[Level] | None = ...,
+        columns: Axes | None = ...,
         col_space: ColspaceArgType | None = ...,
-        header: bool | Sequence[str] = ...,
+        header: bool = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: FormattersType | None = ...,
@@ -2994,9 +2994,9 @@ def to_html(
     def to_html(
         self,
         buf: None = ...,
-        columns: Sequence[Level] | None = ...,
+        columns: Axes | None = ...,
         col_space: ColspaceArgType | None = ...,
-        header: bool | Sequence[str] = ...,
+        header: bool = ...,
         index: bool = ...,
         na_rep: str = ...,
         formatters: FormattersType | None = ...,
@@ -3030,9 +3030,9 @@ def to_html(
     def to_html(
         self,
         buf: FilePath | WriteBuffer[str] | None = None,
-        columns: Sequence[Level] | None = None,
+        columns: Axes | None = None,
         col_space: ColspaceArgType | None = None,
-        header: bool | Sequence[str] = True,
+        header: bool = True,
         index: bool = True,
         na_rep: str = "NaN",
         formatters: FormattersType | None = None,
 
@@ -3101,7 +3101,7 @@ def to_latex(
         self,
         buf: None = ...,
         columns: Sequence[Hashable] | None = ...,
-        header: bool_t | Sequence[str] = ...,
+        header: bool_t | list[str] = ...,
         index: bool_t = ...,
         na_rep: str = ...,
         formatters: FormattersType | None = ...,
@@ -3128,7 +3128,7 @@ def to_latex(
         self,
         buf: FilePath | WriteBuffer[str],
         columns: Sequence[Hashable] | None = ...,
-        header: bool_t | Sequence[str] = ...,
+        header: bool_t | list[str] = ...,
         index: bool_t = ...,
         na_rep: str = ...,
         formatters: FormattersType | None = ...,
@@ -3155,7 +3155,7 @@ def to_latex(
         self,
         buf: FilePath | WriteBuffer[str] | None = None,
         columns: Sequence[Hashable] | None = None,
-        header: bool_t | Sequence[str] = True,
+        header: bool_t | list[str] = True,
         index: bool_t = True,
         na_rep: str = "NaN",
         formatters: FormattersType | None = None,
 
@@ -1359,21 +1359,15 @@ def _python_agg_general(self, func, *args, **kwargs):
         return self._wrap_aggregated_output(res)
 
     def _iterate_slices(self) -> Iterable[Series]:
-        obj = self._selected_obj
+        obj = self._obj_with_exclusions
         if self.axis == 1:
             obj = obj.T
 
-        if isinstance(obj, Series) and obj.name not in self.exclusions:
+        if isinstance(obj, Series):
             # Occurs when doing DataFrameGroupBy(...)["X"]
             yield obj
         else:
             for label, values in obj.items():
-                if label in self.exclusions:
-                    # Note: if we tried to just iterate over _obj_with_exclusions,
-                    #  we would break test_wrap_agg_out by yielding a column
-                    #  that is skipped here but not dropped from obj_with_exclusions
-                    continue
-
                 yield values
 
     def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
 
@@ -3748,6 +3748,8 @@ def delete(self, loc) -> MultiIndex:
     @doc(Index.isin)
     def isin(self, values, level=None) -> npt.NDArray[np.bool_]:
         if level is None:
+            if len(values) == 0:
+                return np.zeros((len(self),), dtype=np.bool_)
             if not isinstance(values, MultiIndex):
                 values = MultiIndex.from_tuples(values)
             return values.unique().get_indexer_for(self) != -1
Original file line number	Diff line number	Diff line change
`@@ -102,6 +102,7 @@ Performance improvements`
`102`	`102`	`~~~~~~~~~~~~~~~~~~~~~~~~`
`103`	`103`	- Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`)
`104`	`104`	- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
	`105`	+- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
`105`	`106`	`-`
`106`	`107`
`107`	`108`	`.. ---------------------------------------------------------------------------`