
Commit d864912

Merge branch 'main' into add-character-check-to-excel-cell
2 parents: 74f4f81 + 9008ee5


43 files changed: +257, -276 lines

asv_bench/benchmarks/io/csv.py (+3)

@@ -408,6 +408,9 @@ def time_read_stringcsv(self, engine):
     def time_read_bytescsv(self, engine):
         read_csv(self.data(self.BytesIO_input), engine=engine)
 
+    def peakmem_read_csv(self, engine):
+        read_csv(self.data(self.BytesIO_input), engine=engine)
+
 
 class ReadCSVCategorical(BaseIO):
     fname = "__test__.csv"
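asv discovers benchmarks by method prefix: time_ methods are timed, while peakmem_ methods are run once and the peak memory of the benchmark process is recorded, so this new benchmark can catch read_csv leaks like the one fixed in tokenizer.c below. A rough standalone approximation, as a sketch only (tracemalloc stands in for asv's process-level accounting, and the data is illustrative):

    import tracemalloc
    from io import BytesIO

    import pandas as pd

    data = b"a,b,c\n" + b"1,2,3\n" * 100_000
    tracemalloc.start()
    pd.read_csv(BytesIO(data))
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    print(f"peak traced allocations: {peak / 1e6:.1f} MB")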

ci/code_checks.sh (+2, -10)

@@ -65,16 +65,8 @@ fi
 ### DOCSTRINGS ###
 if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
 
-    MSG='Validate docstrings (EX01, EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT02, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG
-    $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01,EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
-    RET=$(($RET + $?)) ; echo $MSG "DONE"
-
-    MSG='Partially validate docstrings (EX03)' ; echo $MSG
-    $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \
-        pandas.Series.plot.line \
-        pandas.Series.to_sql \
-        pandas.read_json \
-        pandas.DataFrame.to_sql # There should be no backslash in the final line, please keep this comment in the last ignored function
+    MSG='Validate docstrings (EX01, EX03, EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT02, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG
+    $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01,EX03,EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
 
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     MSG='Partially validate docstrings (PR02)' ; echo $MSG

doc/source/conf.py (-1)

@@ -460,7 +460,6 @@
     "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None),
     "matplotlib": ("https://matplotlib.org/stable/", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
-    "py": ("https://pylib.readthedocs.io/en/latest/", None),
     "python": ("https://docs.python.org/3/", None),
     "scipy": ("https://docs.scipy.org/doc/scipy/", None),
     "pyarrow": ("https://arrow.apache.org/docs/", None),

doc/source/getting_started/install.rst (+5, -4)

@@ -277,11 +277,12 @@ Installable with ``pip install "pandas[excel]"``.
 ========================= ================== =============== =============================================================
 Dependency                Minimum Version    pip extra       Notes
 ========================= ================== =============== =============================================================
-xlrd                      2.0.1              excel           Reading Excel
-xlsxwriter                3.0.5              excel           Writing Excel
-openpyxl                  3.1.0              excel           Reading / writing for xlsx files
+xlrd                      2.0.1              excel           Reading for xls files
+xlsxwriter                3.0.5              excel           Writing for xlsx files
+openpyxl                  3.1.0              excel           Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files
 pyxlsb                    1.0.10             excel           Reading for xlsb files
-python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsb/ods files
+python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files
+odfpy                     1.4.1              excel           Reading / writing for OpenDocument 1.2 files
 ========================= ================== =============== =============================================================
 
 HTML
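pandas chooses the reader from the file extension, and each backend in the table can also be selected explicitly via the engine argument. A quick sketch (the file names are placeholders):

    import pandas as pd

    df = pd.read_excel("report.xlsx")                        # openpyxl by default for xlsx
    old = pd.read_excel("legacy.xls", engine="xlrd")         # xls goes through xlrd
    alt = pd.read_excel("report.xlsx", engine="calamine")    # python-calamine backend
    df.to_excel("out.xlsx", engine="xlsxwriter", index=False)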

doc/source/reference/frame.rst (+1)

@@ -49,6 +49,7 @@ Conversion
    DataFrame.infer_objects
    DataFrame.copy
    DataFrame.bool
+   DataFrame.to_numpy
 
 Indexing, iteration
 ~~~~~~~~~~~~~~~~~~~
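For reference, the newly listed DataFrame.to_numpy returns a single ndarray whose dtype is wide enough to hold every column:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.5]})
    arr = df.to_numpy()
    print(arr.dtype)  # float64, the common dtype of the int and float columns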

doc/source/user_guide/io.rst (+2, -2)

@@ -61,8 +61,8 @@ Basic
 +++++
 
 filepath_or_buffer : various
-  Either a path to a file (a :class:`python:str`, :class:`python:pathlib.Path`,
-  or :class:`py:py._path.local.LocalPath`), URL (including http, ftp, and S3
+  Either a path to a file (a :class:`python:str`, :class:`python:pathlib.Path`)
+  URL (including http, ftp, and S3
   locations), or any object with a ``read()`` method (such as an open file or
   :class:`~python:io.StringIO`).
 sep : str, defaults to ``','`` for :func:`read_csv`, ``\t`` for :func:`read_table`
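The three accepted input forms, sketched (the path and URL are placeholders):

    from io import StringIO
    from pathlib import Path

    import pandas as pd

    pd.read_csv(Path("data.csv"))                # str or pathlib.Path
    pd.read_csv("https://example.com/data.csv")  # URL (http, ftp, S3, ...)
    pd.read_csv(StringIO("a,b\n1,2\n"))          # any object with a read() method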

doc/source/whatsnew/v2.2.1.rst (+5)

@@ -13,7 +13,12 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+- Fixed memory leak in :func:`read_csv` (:issue:`57039`)
+- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
 - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
+- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
+- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
+- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_221.bug_fixes:
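The second :issue:`57040` entry covers groups whose data genuinely contains the dtype's extreme value. The shape of the failing case, with values chosen for illustration:

    import numpy as np
    import pandas as pd

    vals = np.array([np.iinfo(np.int64).max, 1], dtype=np.int64)
    df = pd.DataFrame({"g": ["a", "a"], "v": vals})
    # The maximum sits at position 0; seeding the running maximum with the
    # dtype's max used to mean the comparison never fired (see groupby.pyx below).
    print(df.groupby("g")["v"].idxmax())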

doc/source/whatsnew/v3.0.0.rst (+3, -1)

@@ -84,7 +84,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more
 
 Other API changes
 ^^^^^^^^^^^^^^^^^
--
+- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
 -
 
 .. ---------------------------------------------------------------------------
@@ -105,8 +105,10 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
 - Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`)
 - Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`)
+- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
 - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
 - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
+- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
 -
 
 .. ---------------------------------------------------------------------------
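Migrating off py.path is mechanical; a sketch (the file name is a placeholder):

    from pathlib import Path

    import pandas as pd

    # before: pd.read_csv(py.path.local("data.csv"))  # no longer explicitly supported
    df = pd.read_csv(Path("data.csv"))                # pathlib.Path works everywhere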

pandas/_libs/groupby.pyx (+10, -7)

@@ -1771,6 +1771,7 @@ def group_idxmin_idxmax(
         Py_ssize_t i, j, N, K, lab
         numeric_object_t val
         numeric_object_t[:, ::1] group_min_or_max
+        uint8_t[:, ::1] seen
         bint uses_mask = mask is not None
         bint isna_entry
         bint compute_max = name == "idxmax"
@@ -1784,13 +1785,10 @@ def group_idxmin_idxmax(
 
     if numeric_object_t is object:
         group_min_or_max = np.empty((<object>out).shape, dtype=object)
+        seen = np.zeros((<object>out).shape, dtype=np.uint8)
     else:
         group_min_or_max = np.empty_like(out, dtype=values.dtype)
-        if N > 0 and K > 0:
-            # When N or K is zero, we never use group_min_or_max
-            group_min_or_max[:] = _get_min_or_max(
-                values[0, 0], compute_max, is_datetimelike
-            )
+        seen = np.zeros_like(out, dtype=np.uint8)
 
     # When using transform, we need a valid value for take in the case
     # a category is not observed; these values will be dropped
@@ -1806,6 +1804,7 @@ def group_idxmin_idxmax(
                 if not skipna and out[lab, j] == -1:
                     # Once we've hit NA there is no going back
                     continue
+
                 val = values[i, j]
 
                 if uses_mask:
@@ -1814,10 +1813,14 @@ def group_idxmin_idxmax(
                     isna_entry = _treat_as_na(val, is_datetimelike)
 
                 if isna_entry:
-                    if not skipna:
+                    if not skipna or not seen[lab, j]:
                         out[lab, j] = -1
                 else:
-                    if compute_max:
+                    if not seen[lab, j]:
+                        seen[lab, j] = True
+                        group_min_or_max[lab, j] = val
+                        out[lab, j] = i
+                    elif compute_max:
                         if val > group_min_or_max[lab, j]:
                             group_min_or_max[lab, j] = val
                             out[lab, j] = i
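Previously group_min_or_max was seeded with a dtype-dependent sentinel via _get_min_or_max(values[0, 0], ...), so a group whose real extreme equaled that sentinel could never win the val > / val < comparison. The fix tracks a per-(group, column) seen flag and accepts the first observed value unconditionally. The same idea in pure Python, for idxmax only and ignoring NA handling (a sketch, not the Cython code):

    def idxmax_seen(labels, values, n_groups):
        best = [None] * n_groups   # running maximum per group
        out = [-1] * n_groups      # positional index of the maximum
        seen = [False] * n_groups  # replaces the old sentinel seeding
        for i, (lab, val) in enumerate(zip(labels, values)):
            if not seen[lab]:      # first value always wins, even when it
                seen[lab] = True   # equals the dtype's extreme
                best[lab] = val
                out[lab] = i
            elif val > best[lab]:
                best[lab] = val
                out[lab] = i
        return out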

pandas/_libs/hashtable_class_helper.pxi.in (+3)

@@ -933,6 +933,9 @@ cdef class StringHashTable(HashTable):
         kh_destroy_str(self.table)
         self.table = NULL
 
+    def __len__(self) -> int:
+        return self.table.size
+
     def sizeof(self, deep: bool = False) -> int:
         overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
         for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)

pandas/_libs/index.pyi (+1)

@@ -50,6 +50,7 @@ class UInt32Engine(IndexEngine): ...
 class UInt16Engine(IndexEngine): ...
 class UInt8Engine(IndexEngine): ...
 class ObjectEngine(IndexEngine): ...
+class StringEngine(IndexEngine): ...
 class DatetimeEngine(Int64Engine): ...
 class TimedeltaEngine(DatetimeEngine): ...
 class PeriodEngine(Int64Engine): ...

pandas/_libs/index.pyx (+11)

@@ -533,6 +533,17 @@ cdef class ObjectEngine(IndexEngine):
         return loc
 
 
+cdef class StringEngine(IndexEngine):
+
+    cdef _make_hash_table(self, Py_ssize_t n):
+        return _hash.StringHashTable(n)
+
+    cdef _check_type(self, object val):
+        if not isinstance(val, str):
+            raise KeyError(val)
+        return str(val)
+
+
 cdef class DatetimeEngine(Int64Engine):
 
     cdef:
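StringEngine backs the "indexing operations for string dtypes" entry in the v3.0.0 whatsnew (:issue:`56997`): lookups go through the string-specialized StringHashTable (which gains __len__ above) rather than the generic object table, and _check_type rejects non-string keys up front. Roughly the observable behavior (the engine selection itself is internal):

    import pandas as pd

    idx = pd.Index(["apple", "banana", "cherry"], dtype="string")
    print(idx.get_loc("banana"))  # 1
    try:
        idx.get_loc(42)           # non-str key fails the type check
    except KeyError as exc:
        print("missing:", exc)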

pandas/_libs/src/parser/tokenizer.c (+9)

@@ -109,6 +109,14 @@ void parser_set_default_options(parser_t *self) {
 
 parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); }
 
+static void parser_clear_data_buffers(parser_t *self) {
+  free_if_not_null((void *)&self->stream);
+  free_if_not_null((void *)&self->words);
+  free_if_not_null((void *)&self->word_starts);
+  free_if_not_null((void *)&self->line_start);
+  free_if_not_null((void *)&self->line_fields);
+}
+
 static void parser_cleanup(parser_t *self) {
   // XXX where to put this
   free_if_not_null((void *)&self->error_msg);
@@ -119,6 +127,7 @@ static void parser_cleanup(parser_t *self) {
     self->skipset = NULL;
   }
 
+  parser_clear_data_buffers(self);
   if (self->cb_cleanup != NULL) {
     self->cb_cleanup(self->source);
     self->cb_cleanup = NULL;
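This is the read_csv memory leak from the v2.2.1 whatsnew (:issue:`57039`): parser_cleanup freed error_msg and the skip set but left the token stream and the word/line bookkeeping buffers allocated. One rough way to watch for such a leak from Python (a sketch; ru_maxrss is POSIX-only, reported in KB on Linux and bytes on macOS, and the sizes are illustrative):

    import resource
    from io import BytesIO

    import pandas as pd

    data = b"a,b\n" + b"1,2\n" * 200_000
    for i in range(10):
        pd.read_csv(BytesIO(data), engine="c")
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print(i, peak)  # a leak shows up as a peak that climbs every iteration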

pandas/_testing/__init__.py (-2)

@@ -34,7 +34,6 @@
     Series,
 )
 from pandas._testing._io import (
-    round_trip_localpath,
     round_trip_pathlib,
     round_trip_pickle,
     write_to_compressed,
@@ -609,7 +608,6 @@ def shares_memory(left, right) -> bool:
     "OBJECT_DTYPES",
     "raise_assert_detail",
     "raises_chained_assignment_error",
-    "round_trip_localpath",
     "round_trip_pathlib",
     "round_trip_pickle",
     "setitem",

pandas/_testing/_io.py (-29)

@@ -89,35 +89,6 @@ def round_trip_pathlib(writer, reader, path: str | None = None):
     return obj
 
 
-def round_trip_localpath(writer, reader, path: str | None = None):
-    """
-    Write an object to file specified by a py.path LocalPath and read it back.
-
-    Parameters
-    ----------
-    writer : callable bound to pandas object
-        IO writing function (e.g. DataFrame.to_csv )
-    reader : callable
-        IO reading function (e.g. pd.read_csv )
-    path : str, default None
-        The path where the object is written and then read.
-
-    Returns
-    -------
-    pandas object
-        The original object that was serialized and then re-read.
-    """
-    import pytest
-
-    LocalPath = pytest.importorskip("py.path").local
-    if path is None:
-        path = "___localpath___"
-    with ensure_clean(path) as path:
-        writer(LocalPath(path))
-        obj = reader(LocalPath(path))
-    return obj
-
-
 def write_to_compressed(compression, path, data, dest: str = "test") -> None:
     """
     Write data to a compressed file.

pandas/_testing/_warnings.py (+6, -1)

@@ -220,7 +220,12 @@ def _assert_raised_with_correct_stacklevel(
    frame = inspect.currentframe()
    for _ in range(4):
        frame = frame.f_back  # type: ignore[union-attr]
-   caller_filename = inspect.getfile(frame)  # type: ignore[arg-type]
+   try:
+       caller_filename = inspect.getfile(frame)  # type: ignore[arg-type]
+   finally:
+       # See note in
+       # https://docs.python.org/3/library/inspect.html#inspect.Traceback
+       del frame
    msg = (
        "Warning not set with correct stacklevel. "
        f"File where warning is raised: {actual_warning.filename} != "

pandas/compat/numpy/function.py (+2)

@@ -138,6 +138,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) ->
 ARGSORT_DEFAULTS["kind"] = "quicksort"
 ARGSORT_DEFAULTS["order"] = None
 ARGSORT_DEFAULTS["kind"] = None
+ARGSORT_DEFAULTS["stable"] = None
 
 
 validate_argsort = CompatValidator(
@@ -149,6 +150,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) ->
 ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {}
 ARGSORT_DEFAULTS_KIND["axis"] = -1
 ARGSORT_DEFAULTS_KIND["order"] = None
+ARGSORT_DEFAULTS_KIND["stable"] = None
 validate_argsort_kind = CompatValidator(
     ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both"
 )
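NumPy 2.0 adds a stable keyword to np.sort/np.argsort; registering stable=None as an accepted default lets argsort calls that forward NumPy's keyword defaults to pandas methods pass the validator unchanged. In plain NumPy (2.0 or newer):

    import numpy as np

    arr = np.array([3, 1, 2])
    print(np.argsort(arr, stable=True))  # [1 2 0], stable sort guaranteed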

pandas/core/frame.py (+5, -7)

@@ -8012,19 +8012,17 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
         left = self
 
         # GH#31623, only operate on shared columns
-        cols, lcols, rcols = left.columns.join(
-            right.columns, how="inner", level=None, return_indexers=True
+        cols, lcol_indexer, rcol_indexer = left.columns.join(
+            right.columns, how="inner", return_indexers=True
         )
 
-        new_left = left.iloc[:, lcols]
-        new_right = right.iloc[:, rcols]
+        new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer]
+        new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer]
         result = op(new_left, new_right)
 
         # Do the join on the columns instead of using left._align_for_op
         # to avoid constructing two potentially large/sparse DataFrames
-        join_columns, _, _ = left.columns.join(
-            right.columns, how="outer", level=None, return_indexers=True
-        )
+        join_columns = left.columns.join(right.columns, how="outer")
 
         if result.columns.has_duplicates:
             # Avoid reindexing with a duplicate axis.
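The rewrite leans on Index.join's contract that, with return_indexers=True, an indexer comes back as None when the joined result is already identical to that input, so the iloc take can be skipped entirely; this pairs with the Index.join cache-propagation entry above (:issue:`57023`). For example:

    import pandas as pd

    left = pd.Index(["a", "b", "c"])
    right = pd.Index(["a", "b", "c"])
    joined, lidx, ridx = left.join(right, how="inner", return_indexers=True)
    print(lidx, ridx)  # None None: both sides already align, nothing to take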
