
Commit a23f101

Author: MarcoGorelli (committed)
Merge remote-tracking branch 'upstream/main' into deprecate-date-parser
2 parents 39cd663 + 10c51ba, commit a23f101

Note: large commits have some content hidden by default, so a few changed lines below are not shown.

59 files changed: +650, -432 lines

.pre-commit-config.yaml (+1, -1)

@@ -135,7 +135,7 @@ repos:
       types: [python]
       stages: [manual]
       additional_dependencies: &pyright_dependencies
-(changed line hidden in this view)
+(changed line hidden in this view)
     - id: pyright_reportGeneralTypeIssues
       # note: assumes python env is setup and activated
       name: pyright reportGeneralTypeIssues

ci/code_checks.sh (-2)

@@ -578,13 +578,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then

     MSG='Partially validate docstrings (EX02)' ; echo $MSG
     $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX02 --ignore_functions \
-        pandas.DataFrame.copy \
         pandas.DataFrame.plot.line \
         pandas.DataFrame.std \
         pandas.DataFrame.var \
         pandas.Index.factorize \
         pandas.Period.strftime \
-        pandas.Series.copy \
         pandas.Series.factorize \
         pandas.Series.floordiv \
         pandas.Series.plot.line \

doc/source/development/internals.rst (+23, -23)

@@ -15,24 +15,24 @@ Indexing
 In pandas there are a few objects implemented which can serve as valid
 containers for the axis labels:

-* ``Index``: the generic "ordered set" object, an ndarray of object dtype
+* :class:`Index`: the generic "ordered set" object, an ndarray of object dtype
   assuming nothing about its contents. The labels must be hashable (and
   likely immutable) and unique. Populates a dict of label to location in
   Cython to do ``O(1)`` lookups.
 * ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer
   data, such as time stamps
 * ``Float64Index``: a version of ``Index`` highly optimized for 64-bit float data
-* ``MultiIndex``: the standard hierarchical index object
-* ``DatetimeIndex``: An Index object with ``Timestamp`` boxed elements (impl are the int64 values)
-* ``TimedeltaIndex``: An Index object with ``Timedelta`` boxed elements (impl are the in64 values)
-* ``PeriodIndex``: An Index object with Period elements
+* :class:`MultiIndex`: the standard hierarchical index object
+* :class:`DatetimeIndex`: An Index object with :class:`Timestamp` boxed elements (impl are the int64 values)
+* :class:`TimedeltaIndex`: An Index object with :class:`Timedelta` boxed elements (impl are the in64 values)
+* :class:`PeriodIndex`: An Index object with Period elements

 There are functions that make the creation of a regular index easy:

-* ``date_range``: fixed frequency date range generated from a time rule or
+* :func:`date_range`: fixed frequency date range generated from a time rule or
   DateOffset. An ndarray of Python datetime objects
-* ``period_range``: fixed frequency date range generated from a time rule or
-  DateOffset. An ndarray of ``Period`` objects, representing timespans
+* :func:`period_range`: fixed frequency date range generated from a time rule or
+  DateOffset. An ndarray of :class:`Period` objects, representing timespans

 The motivation for having an ``Index`` class in the first place was to enable
 different implementations of indexing. This means that it's possible for you,
@@ -43,28 +43,28 @@ From an internal implementation point of view, the relevant methods that an
 ``Index`` must define are one or more of the following (depending on how
 incompatible the new object internals are with the ``Index`` functions):

-* ``get_loc``: returns an "indexer" (an integer, or in some cases a
+* :meth:`~Index.get_loc`: returns an "indexer" (an integer, or in some cases a
   slice object) for a label
-* ``slice_locs``: returns the "range" to slice between two labels
-* ``get_indexer``: Computes the indexing vector for reindexing / data
+* :meth:`~Index.slice_locs`: returns the "range" to slice between two labels
+* :meth:`~Index.get_indexer`: Computes the indexing vector for reindexing / data
   alignment purposes. See the source / docstrings for more on this
-* ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data
+* :meth:`~Index.get_indexer_non_unique`: Computes the indexing vector for reindexing / data
   alignment purposes when the index is non-unique. See the source / docstrings
   for more on this
-* ``reindex``: Does any pre-conversion of the input index then calls
+* :meth:`~Index.reindex`: Does any pre-conversion of the input index then calls
   ``get_indexer``
-* ``union``, ``intersection``: computes the union or intersection of two
+* :meth:`~Index.union`, :meth:`~Index.intersection`: computes the union or intersection of two
   Index objects
-* ``insert``: Inserts a new label into an Index, yielding a new object
-* ``delete``: Delete a label, yielding a new object
-* ``drop``: Deletes a set of labels
-* ``take``: Analogous to ndarray.take
+* :meth:`~Index.insert`: Inserts a new label into an Index, yielding a new object
+* :meth:`~Index.delete`: Delete a label, yielding a new object
+* :meth:`~Index.drop`: Deletes a set of labels
+* :meth:`~Index.take`: Analogous to ndarray.take

 MultiIndex
 ~~~~~~~~~~

-Internally, the ``MultiIndex`` consists of a few things: the **levels**, the
-integer **codes** (until version 0.24 named *labels*), and the level **names**:
+Internally, the :class:`MultiIndex` consists of a few things: the **levels**, the
+integer **codes**, and the level **names**:

 .. ipython:: python

@@ -80,13 +80,13 @@ You can probably guess that the codes determine which unique element is
 identified with that location at each layer of the index. It's important to
 note that sortedness is determined **solely** from the integer codes and does
 not check (or care) whether the levels themselves are sorted. Fortunately, the
-constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
-if you compute the levels and codes yourself, please be careful.
+constructors :meth:`~MultiIndex.from_tuples` and :meth:`~MultiIndex.from_arrays` ensure
+that this is true, but if you compute the levels and codes yourself, please be careful.

 Values
 ~~~~~~

-pandas extends NumPy's type system with custom types, like ``Categorical`` or
+pandas extends NumPy's type system with custom types, like :class:`Categorical` or
 datetimes with a timezone, so we have multiple notions of "values". For 1-D
 containers (``Index`` classes and ``Series``) we have the following convention:

doc/source/user_guide/indexing.rst (+1, -1)

@@ -231,7 +231,7 @@ You can also assign a ``dict`` to a row of a ``DataFrame``:

 You can use attribute access to modify an existing element of a Series or column of a DataFrame, but be careful;
 if you try to use attribute access to create a new column, it creates a new attribute rather than a
-new column. In 0.21.0 and later, this will raise a ``UserWarning``:
+new column and this will raise a ``UserWarning``:

 .. code-block:: ipython
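
The reworded sentence is about attribute assignment silently creating an attribute instead of a column; a small sketch of the difference (the column names are illustrative):

    import pandas as pd

    df = pd.DataFrame({"one": [1.0, 2.0, 3.0]})

    # Attribute assignment to a new name does not create a column;
    # pandas emits a UserWarning and the frame is left unchanged.
    df.two = [4, 5, 6]
    print(df.columns.tolist())  # ['one']

    # Item assignment is the supported way to add a column.
    df["two"] = [4, 5, 6]
    print(df.columns.tolist())  # ['one', 'two']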

doc/source/whatsnew/v2.0.0.rst (+34)

@@ -334,6 +334,36 @@ a supported dtype:

     pd.Series(["2016-01-01"], dtype="datetime64[D]")

+.. _whatsnew_200.api_breaking.value_counts:
+
+Value counts sets the resulting name to ``count``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+In past versions, when running :meth:`Series.value_counts`, the result would inherit
+the original object's name, and the result index would be nameless. This would cause
+confusion when resetting the index, and the column names would not correspond with the
+column values.
+Now, the result name will be ``'count'`` (or ``'proportion'`` if ``normalize=True`` was passed),
+and the index will be named after the original object (:issue:`49497`).
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [8]: pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts()
+
+    Out[2]:
+    quetzal    2
+    elk        1
+    Name: animal, dtype: int64
+
+*New behavior*:
+
+.. ipython:: python
+
+    pd.Series(['quetzal', 'quetzal', 'elk'], name='animal').value_counts()
+
+Likewise for other ``value_counts`` methods (for example, :meth:`DataFrame.value_counts`).
+
 .. _whatsnew_200.api_breaking.astype_to_unsupported_datetimelike:

 Disallow astype conversion to non-supported datetime64/timedelta64 dtypes
@@ -636,6 +666,7 @@ Other API changes

 Deprecations
 ~~~~~~~~~~~~
+- Deprecated parsing datetime strings with system-local timezone to ``tzlocal``, pass a ``tz`` keyword or explicitly call ``tz_localize`` instead (:issue:`50791`)
 - Deprecated argument ``infer_datetime_format`` in :func:`to_datetime` and :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
 - Deprecated behavior of :func:`to_datetime` with ``unit`` when parsing strings, in a future version these will be parsed as datetimes (matching unit-less behavior) instead of cast to floats. To retain the old behavior, cast strings to numeric types before calling :func:`to_datetime` (:issue:`50735`)
 - Deprecated :func:`pandas.io.sql.execute` (:issue:`50185`)
@@ -950,6 +981,8 @@ Performance improvements
 - Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`)
 - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
 - Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
+- Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
+-

 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:
@@ -1028,6 +1061,7 @@ Conversion
 - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`)
 - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`)
 - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`)
+- Bug in :meth:`DataFrame.astype` modifying input array inplace when converting to ``string`` and ``copy=False`` (:issue:`51073`)
 - Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`)
 - Bug in :meth:`DataFrame.astype` not copying data when converting to pyarrow dtype (:issue:`50984`)
 - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)
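
The motivation about resetting the index can be made concrete; a short sketch of how the new naming interacts with ``reset_index`` (output abbreviated):

    import pandas as pd

    animals = pd.Series(["quetzal", "quetzal", "elk"], name="animal")

    # The result is named 'count' and its index is named 'animal',
    # so reset_index yields self-describing columns.
    print(animals.value_counts().reset_index())
    #     animal  count
    # 0  quetzal      2
    # 1      elk      1

    # With normalize=True the values are named 'proportion' instead.
    print(animals.value_counts(normalize=True).name)  # 'proportion'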

pandas/_libs/internals.pyi (+3, -1)

@@ -44,7 +44,9 @@ class BlockPlacement:
     @property
     def is_slice_like(self) -> bool: ...
     @overload
-    def __getitem__(self, loc: slice | Sequence[int]) -> BlockPlacement: ...
+    def __getitem__(
+        self, loc: slice | Sequence[int] | npt.NDArray[np.intp]
+    ) -> BlockPlacement: ...
     @overload
     def __getitem__(self, loc: int) -> int: ...
     def __iter__(self) -> Iterator[int]: ...

pandas/_libs/lib.pyx (+7)

@@ -739,6 +739,7 @@ cpdef ndarray[object] ensure_string_array(
     """
     cdef:
         Py_ssize_t i = 0, n = len(arr)
+        bint already_copied = True

     if hasattr(arr, "to_numpy"):

@@ -757,6 +758,8 @@ cpdef ndarray[object] ensure_string_array(

     if copy and result is arr:
         result = result.copy()
+    elif not copy and result is arr:
+        already_copied = False

     if issubclass(arr.dtype.type, np.str_):
         # short-circuit, all elements are str
@@ -768,6 +771,10 @@ cpdef ndarray[object] ensure_string_array(
         if isinstance(val, str):
             continue

+        elif not already_copied:
+            result = result.copy()
+            already_copied = True
+
         if not checknull(val):
             if not util.is_float_object(val):
                 # f"{val}" is faster than str(val)

pandas/_libs/tslibs/parsing.pyx (+12)

@@ -686,6 +686,18 @@ cdef datetime dateutil_parse(
         ret = ret + relativedelta.relativedelta(weekday=res.weekday)
     if not ignoretz:
         if res.tzname and res.tzname in time.tzname:
+            # GH#50791
+            if res.tzname != "UTC":
+                # If the system is localized in UTC (as many CI runs are)
+                # we get tzlocal, once the deprecation is enforced will get
+                # timezone.utc, not raise.
+                warnings.warn(
+                    "Parsing '{res.tzname}' as tzlocal (dependent on system timezone) "
+                    "is deprecated and will raise in a future version. Pass the 'tz' "
+                    "keyword or call tz_localize after construction instead",
+                    FutureWarning,
+                    stacklevel=find_stack_level()
+                )
             ret = ret.replace(tzinfo=_dateutil_tzlocal())
         elif res.tzoffset == 0:
             ret = ret.replace(tzinfo=_dateutil_tzutc())
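
For users, the deprecation means a datetime string should not rely on the system timezone being attached implicitly via tzlocal; the warning text suggests passing ``tz`` or calling ``tz_localize``. A hedged sketch of those alternatives (the timezone name is illustrative):

    import pandas as pd

    # Parse as naive, then localize explicitly ...
    ts = pd.Timestamp("2023-01-15 10:30").tz_localize("Europe/London")

    # ... or pass tz up front where the API accepts it.
    idx = pd.date_range("2023-01-15", periods=3, freq="D", tz="Europe/London")

    print(ts.tz, idx.tz)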

pandas/_testing/contexts.py (+8, -1)

@@ -11,6 +11,11 @@
 )
 import uuid

+from pandas._typing import (
+    BaseBuffer,
+    CompressionOptions,
+    FilePath,
+)
 from pandas.compat import PYPY
 from pandas.errors import ChainedAssignmentError

@@ -20,7 +25,9 @@


 @contextmanager
-def decompress_file(path, compression) -> Generator[IO[bytes], None, None]:
+def decompress_file(
+    path: FilePath | BaseBuffer, compression: CompressionOptions
+) -> Generator[IO[bytes], None, None]:
     """
     Open a compressed file and return a file object.
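
``decompress_file`` is an internal testing helper; under the assumption that a gzip-compressed file exists at the (illustrative) path below, the newly annotated signature would be used like this in test code:

    from pandas._testing.contexts import decompress_file

    # Path and compression value are illustrative only.
    with decompress_file("data/sample.csv.gz", compression="gzip") as fh:
        raw = fh.read()
    print(raw[:80])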

pandas/core/algorithms.py (+11, -4)

@@ -847,7 +847,8 @@ def value_counts(
         Series,
     )

-    name = getattr(values, "name", None)
+    index_name = getattr(values, "name", None)
+    name = "proportion" if normalize else "count"

     if bins is not None:
         from pandas.core.reshape.tile import cut
@@ -860,6 +861,7 @@ def value_counts(

         # count, remove nulls (from the index), and but the bins
         result = ii.value_counts(dropna=dropna)
+        result.name = name
         result = result[result.index.notna()]
         result.index = result.index.astype("interval")
         result = result.sort_index()
@@ -878,14 +880,18 @@ def value_counts(
         # handle Categorical and sparse,
         result = Series(values)._values.value_counts(dropna=dropna)
         result.name = name
+        result.index.name = index_name
         counts = result._values

     elif isinstance(values, ABCMultiIndex):
         # GH49558
         levels = list(range(values.nlevels))
-        result = Series(index=values).groupby(level=levels, dropna=dropna).size()
-        # TODO: allow index names to remain (see discussion in GH49497)
-        result.index.names = [None] * values.nlevels
+        result = (
+            Series(index=values, name=name)
+            .groupby(level=levels, dropna=dropna)
+            .size()
+        )
+        result.index.names = values.names
         counts = result._values

     else:
@@ -899,6 +905,7 @@ def value_counts(
         idx = Index(keys)
         if idx.dtype == bool and keys.dtype == object:
             idx = idx.astype(object)
+        idx.name = index_name

         result = Series(counts, index=idx, name=name)

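
The ``ABCMultiIndex`` branch now keeps the original level names instead of blanking them, and the result Series is named ``'count'``. A small sketch of the visible effect (the data is illustrative):

    import pandas as pd

    mi = pd.MultiIndex.from_arrays(
        [["quetzal", "quetzal", "elk"], [2, 2, 4]], names=["animal", "legs"]
    )

    counts = mi.value_counts()
    print(counts.name)         # 'count'
    print(counts.index.names)  # ['animal', 'legs'] rather than [None, None]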

pandas/core/arrays/arrow/array.py (+8, -3)

@@ -934,7 +934,7 @@ def value_counts(self, dropna: bool = True) -> Series:

         index = Index(type(self)(values))

-        return Series(counts, index=index).astype("Int64")
+        return Series(counts, index=index, name="count").astype("Int64")

     @classmethod
     def _concat_same_type(
@@ -1255,7 +1255,7 @@ def _quantile(
         pa_dtype = self._data.type

         data = self._data
-        if pa.types.is_temporal(pa_dtype) and interpolation in ["lower", "higher"]:
+        if pa.types.is_temporal(pa_dtype):
             # https://github.com/apache/arrow/issues/33769 in these cases
             # we can cast to ints and back
             nbits = pa_dtype.bit_width
@@ -1266,7 +1266,12 @@ def _quantile(

         result = pc.quantile(data, q=qs, interpolation=interpolation)

-        if pa.types.is_temporal(pa_dtype) and interpolation in ["lower", "higher"]:
+        if pa.types.is_temporal(pa_dtype):
+            nbits = pa_dtype.bit_width
+            if nbits == 32:
+                result = result.cast(pa.int32())
+            else:
+                result = result.cast(pa.int64())
             result = result.cast(pa_dtype)

         return type(self)(result)
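
The quantile change routes temporal Arrow data through an integer cast on the way in and casts the float result back through an integer of matching width on the way out, for every interpolation mode rather than only "lower"/"higher". A rough NumPy sketch of that cast-and-back idea (illustrative only, not the pyarrow code path):

    import numpy as np

    ts = np.array(["2023-01-01", "2023-01-02", "2023-01-05"], dtype="datetime64[ns]")

    as_int = ts.view("int64")     # nanoseconds since the epoch
    q = np.quantile(as_int, 0.5)  # quantile computed on integers, returned as float
    median_ts = np.datetime64(int(round(q)), "ns")  # back through int to datetime64

    print(median_ts)  # 2023-01-02T00:00:00.000000000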

pandas/core/arrays/categorical.py (+11, -2)

@@ -1499,7 +1499,7 @@ def value_counts(self, dropna: bool = True) -> Series:
         ix = coerce_indexer_dtype(ix, self.dtype.categories)
         ix = self._from_backing_data(ix)

-        return Series(count, index=CategoricalIndex(ix), dtype="int64")
+        return Series(count, index=CategoricalIndex(ix), dtype="int64", name="count")

     # error: Argument 2 of "_empty" is incompatible with supertype
     # "NDArrayBackedExtensionArray"; supertype defines the argument type as
@@ -2284,7 +2284,16 @@ def _replace(self, *, to_replace, value, inplace: bool = False):
         ser = ser.replace(to_replace=to_replace, value=value)

         all_values = Index(ser)
-        new_categories = Index(ser.drop_duplicates(keep="first"))
+
+        # GH51016: maintain order of existing categories
+        idxr = cat.categories.get_indexer_for(all_values)
+        locs = np.arange(len(ser))
+        locs = np.where(idxr == -1, locs, idxr)
+        locs = locs.argsort()
+
+        new_categories = ser.take(locs)
+        new_categories = new_categories.drop_duplicates(keep="first")
+        new_categories = Index(new_categories)
         new_codes = recode_for_categories(
             cat._codes, all_values, new_categories, copy=False
         )
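
The new ordering logic in ``_replace`` can be followed on a toy input: values that already exist as categories keep their old position, so the surviving categories retain their original relative order (GH 51016). A standalone walkthrough with illustrative values:

    import numpy as np
    import pandas as pd

    # Replacing category 1 with 3 when the existing categories are [1, 2, 3]:
    categories = pd.Index([1, 2, 3])   # existing category order
    replaced = pd.Series([3, 2, 3])    # category values after the replacement
    all_values = pd.Index(replaced)

    idxr = categories.get_indexer_for(all_values)  # [2, 1, 2]; -1 marks brand-new values
    locs = np.arange(len(replaced))
    locs = np.where(idxr == -1, locs, idxr)        # reuse the old position where one exists
    order = locs.argsort()

    new_categories = pd.Index(replaced.take(order).drop_duplicates(keep="first"))
    print(new_categories)  # Index([2, 3], dtype='int64'): existing order kept, not [3, 2]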

pandas/core/arrays/masked.py (+2, -2)

@@ -996,7 +996,7 @@ def value_counts(self, dropna: bool = True) -> Series:
         )

         if dropna:
-            res = Series(value_counts, index=keys)
+            res = Series(value_counts, index=keys, name="count")
             res.index = res.index.astype(self.dtype)
             res = res.astype("Int64")
             return res
@@ -1012,7 +1012,7 @@ def value_counts(self, dropna: bool = True) -> Series:
         mask = np.zeros(len(counts), dtype="bool")
         counts_array = IntegerArray(counts, mask)

-        return Series(counts_array, index=index)
+        return Series(counts_array, index=index, name="count")

     @doc(ExtensionArray.equals)
     def equals(self, other) -> bool:
