From 074b8baba988b289049a46d907f60fa5d26cb3e1 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 4 Aug 2021 17:26:07 +0200
Subject: [PATCH 01/12] TST: update message in skip_array_manager mark (#42877)

---
 pandas/util/_test_decorators.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py
index 62e31c0e46715..b78f1652dc419 100644
--- a/pandas/util/_test_decorators.py
+++ b/pandas/util/_test_decorators.py
@@ -286,7 +286,8 @@ def async_mark():
 
 
 skip_array_manager_not_yet_implemented = pytest.mark.skipif(
-    get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks"
+    get_option("mode.data_manager") == "array",
+    reason="Not yet implemented for ArrayManager",
 )
 
 skip_array_manager_invalid_test = pytest.mark.skipif(

From 74e50ec515d668842f6ce55ef4d96a0f6001ccd8 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 4 Aug 2021 17:44:59 +0200
Subject: [PATCH 02/12] TST: remove chained assignment outside indexing tests
 (#42882)

---
 pandas/tests/frame/methods/test_dropna.py     |  2 +-
 .../tests/frame/methods/test_interpolate.py   | 26 +++++++++----------
 pandas/tests/frame/methods/test_isin.py       |  6 ++---
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py
index 76a6f3aa25362..bc2b48d3312d7 100644
--- a/pandas/tests/frame/methods/test_dropna.py
+++ b/pandas/tests/frame/methods/test_dropna.py
@@ -66,7 +66,7 @@ def test_dropIncompleteRows(self, float_frame):
 
     def test_dropna(self):
         df = DataFrame(np.random.randn(6, 4))
-        df[2][:2] = np.nan
+        df.iloc[:2, 2] = np.nan
 
         dropped = df.dropna(axis=1)
         expected = df.loc[:, [0, 1, 3]]
diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py
index d0551ffd5cffe..7a749f3705e35 100644
--- a/pandas/tests/frame/methods/test_interpolate.py
+++ b/pandas/tests/frame/methods/test_interpolate.py
@@ -102,34 +102,34 @@ def test_interp_various(self):
         expected = df.copy()
         result = df.interpolate(method="polynomial", order=1)
 
-        expected.A.loc[3] = 2.66666667
-        expected.A.loc[13] = 5.76923076
+        expected.loc[3, "A"] = 2.66666667
+        expected.loc[13, "A"] = 5.76923076
         tm.assert_frame_equal(result, expected)
 
         result = df.interpolate(method="cubic")
         # GH #15662.
-        expected.A.loc[3] = 2.81547781
-        expected.A.loc[13] = 5.52964175
+        expected.loc[3, "A"] = 2.81547781
+        expected.loc[13, "A"] = 5.52964175
         tm.assert_frame_equal(result, expected)
 
         result = df.interpolate(method="nearest")
-        expected.A.loc[3] = 2
-        expected.A.loc[13] = 5
+        expected.loc[3, "A"] = 2
+        expected.loc[13, "A"] = 5
         tm.assert_frame_equal(result, expected, check_dtype=False)
 
         result = df.interpolate(method="quadratic")
-        expected.A.loc[3] = 2.82150771
-        expected.A.loc[13] = 6.12648668
+        expected.loc[3, "A"] = 2.82150771
+        expected.loc[13, "A"] = 6.12648668
         tm.assert_frame_equal(result, expected)
 
         result = df.interpolate(method="slinear")
-        expected.A.loc[3] = 2.66666667
-        expected.A.loc[13] = 5.76923077
+        expected.loc[3, "A"] = 2.66666667
+        expected.loc[13, "A"] = 5.76923077
         tm.assert_frame_equal(result, expected)
 
         result = df.interpolate(method="zero")
-        expected.A.loc[3] = 2.0
-        expected.A.loc[13] = 5
+        expected.loc[3, "A"] = 2.0
+        expected.loc[13, "A"] = 5
         tm.assert_frame_equal(result, expected, check_dtype=False)
 
     @td.skip_if_no_scipy
@@ -218,7 +218,7 @@ def test_interp_leading_nans(self, check_scipy):
         )
         result = df.interpolate()
         expected = df.copy()
-        expected["B"].loc[3] = -3.75
+        expected.loc[3, "B"] = -3.75
         tm.assert_frame_equal(result, expected)
 
         if check_scipy:
diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py
index d2ebd09c4cc48..e924963f588f3 100644
--- a/pandas/tests/frame/methods/test_isin.py
+++ b/pandas/tests/frame/methods/test_isin.py
@@ -79,8 +79,8 @@ def test_isin_df(self):
         df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]})
         expected = DataFrame(False, df1.index, df1.columns)
         result = df1.isin(df2)
-        expected["A"].loc[[1, 3]] = True
-        expected["B"].loc[[0, 2]] = True
+        expected.loc[[1, 3], "A"] = True
+        expected.loc[[0, 2], "B"] = True
         tm.assert_frame_equal(result, expected)
 
         # partial overlapping columns
@@ -133,7 +133,7 @@ def test_isin_against_series(self):
         )
         s = Series([1, 3, 11, 4], index=["a", "b", "c", "d"])
         expected = DataFrame(False, index=df.index, columns=df.columns)
-        expected["A"].loc["a"] = True
+        expected.loc["a", "A"] = True
         expected.loc["d"] = True
         result = df.isin(s)
         tm.assert_frame_equal(result, expected)

From c182565be618929c17d7afe365ef17cfade5ca89 Mon Sep 17 00:00:00 2001
From: Shoham Debnath <debnathshoham@gmail.com>
Date: Thu, 5 Aug 2021 03:25:14 +0530
Subject: [PATCH 03/12] read_excel() modifies provided types dict when
 accessing file with duplicate column (#42508)

---
 doc/source/whatsnew/v1.3.2.rst                     | 1 +
 pandas/io/parsers/python_parser.py                 | 4 ++--
 pandas/tests/io/excel/test_readers.py              | 6 +++++-
 pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 6 +++++-
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst
index c716460e997d0..4e6ea85e2ff1d 100644
--- a/doc/source/whatsnew/v1.3.2.rst
+++ b/doc/source/whatsnew/v1.3.2.rst
@@ -30,6 +30,7 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
+- Bug in :func:`read_excel` modifying the ``dtype`` dictionary passed by the caller when reading a file with duplicate columns (:issue:`42462`)
 - 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`)
 - :meth:`.Styler.hide_columns` now hides the index name header row as well as column headers (:issue:`42101`)
 
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 7c9fcde08bf24..af253fc062632 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -4,6 +4,7 @@
     abc,
     defaultdict,
 )
+from copy import copy
 import csv
 from io import StringIO
 import re
@@ -81,7 +82,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds):
         self.verbose = kwds["verbose"]
         self.converters = kwds["converters"]
 
-        self.dtype = kwds["dtype"]
+        self.dtype = copy(kwds["dtype"])
         self.thousands = kwds["thousands"]
         self.decimal = kwds["decimal"]
 
@@ -432,7 +433,6 @@ def _infer_columns(self):
                                 and self.dtype.get(col) is None
                             ):
                                 self.dtype.update({col: self.dtype.get(old_col)})
-
                         this_columns[i] = col
                         counts[col] = cur_count + 1
                 elif have_mi_columns:
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index cbd241ceda0b1..f999733192725 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -576,8 +576,12 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
     def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
         # GH#35211
         basename = "df_mangle_dup_col_dtypes"
-        result = pd.read_excel(basename + read_ext, dtype={"a": str, **dtypes})
+        dtype_dict = {"a": str, **dtypes}
+        dtype_dict_copy = dtype_dict.copy()
+        # GH#42462
+        result = pd.read_excel(basename + read_ext, dtype=dtype_dict)
         expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
+        assert dtype_dict == dtype_dict_copy, "dtype dict changed"
         tm.assert_frame_equal(result, expected)
 
     def test_reader_spaces(self, read_ext):
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 6ed52ed86af2a..32a7ac44c0b38 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -245,8 +245,12 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     # GH#35211
     parser = all_parsers
     data = """a,a\n1,1"""
-    result = parser.read_csv(StringIO(data), dtype={"a": str, **dtypes})
+    dtype_dict = {"a": str, **dtypes}
+    # GH#42462
+    dtype_dict_copy = dtype_dict.copy()
+    result = parser.read_csv(StringIO(data), dtype=dtype_dict)
     expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
+    assert dtype_dict == dtype_dict_copy, "dtype dict changed"
     tm.assert_frame_equal(result, expected)
 
 

From 17c6798a5ebadce2ee5ae40963935b562e3287bb Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Wed, 4 Aug 2021 15:18:04 -0700
Subject: [PATCH 04/12] BUG: ArrayManager reindex with copy=True not copying
 (#42647)

---
 pandas/core/generic.py                     |  2 ++
 pandas/core/internals/array_manager.py     |  2 ++
 pandas/tests/frame/methods/test_reindex.py | 14 ++++++++++++++
 3 files changed, 18 insertions(+)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 19dd06074bf78..2eace06c7bd8d 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4840,6 +4840,8 @@ def _reindex_axes(
                 copy=copy,
                 allow_dups=False,
             )
+            # If we've made a copy once, no need to make another one
+            copy = False
 
         return obj
 
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
index 79c0aad66229c..3c429597ea3e2 100644
--- a/pandas/core/internals/array_manager.py
+++ b/pandas/core/internals/array_manager.py
@@ -601,6 +601,8 @@ def _reindex_indexer(
                     )
                 else:
                     arr = self.arrays[i]
+                    if copy:
+                        arr = arr.copy()
                 new_arrays.append(arr)
 
         else:
diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
index d0765084adfa9..6b73c6a662da7 100644
--- a/pandas/tests/frame/methods/test_reindex.py
+++ b/pandas/tests/frame/methods/test_reindex.py
@@ -83,6 +83,20 @@ class TestDataFrameSelectReindex:
     # These are specific reindex-based tests; other indexing tests should go in
     # test_indexing
 
+    def test_reindex_copies(self):
+        # based on asv time_reindex_axis1
+        N = 10
+        df = DataFrame(np.random.randn(N * 10, N))
+        cols = np.arange(N)
+        np.random.shuffle(cols)
+
+        result = df.reindex(columns=cols, copy=True)
+        assert not np.shares_memory(result[0]._values, df[0]._values)
+
+        # pass both columns and index
+        result2 = df.reindex(columns=cols, index=df.index, copy=True)
+        assert not np.shares_memory(result2[0]._values, df[0]._values)
+
     def test_reindex_date_fill_value(self):
         # passing date to dt64 is deprecated
         arr = date_range("2016-01-01", periods=6).values.reshape(3, 2)

From 2d9ca9d91cecc44c80fd89b0c548158804f39348 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com>
Date: Wed, 4 Aug 2021 18:19:13 -0400
Subject: [PATCH 05/12] REGR: sample modifying `weights` inplace (#42843)

---
 pandas/core/sample.py                     |  6 +++++-
 pandas/tests/frame/methods/test_sample.py | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/pandas/core/sample.py b/pandas/core/sample.py
index e4bad22e8e43c..63b8789f3f551 100644
--- a/pandas/core/sample.py
+++ b/pandas/core/sample.py
@@ -63,7 +63,11 @@ def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray:
     if (weights < 0).any():
         raise ValueError("weight vector many not include negative values")
 
-    weights[np.isnan(weights)] = 0
+    missing = np.isnan(weights)
+    if missing.any():
+        # Don't modify weights in place
+        weights = weights.copy()
+        weights[missing] = 0
     return weights
 
 
diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py
index 366722531329a..d5d1f975deefa 100644
--- a/pandas/tests/frame/methods/test_sample.py
+++ b/pandas/tests/frame/methods/test_sample.py
@@ -339,6 +339,24 @@ def test_sample_is_copy(self):
         with tm.assert_produces_warning(None):
             df2["d"] = 1
 
+    def test_sample_does_not_modify_weights(self):
+        # GH-42843
+        result = np.array([np.nan, 1, np.nan])
+        expected = result.copy()
+        ser = Series([1, 2, 3])
+
+        # Test numpy array weights won't be modified in place
+        ser.sample(weights=result)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # Test DataFrame column won't be modified in place
+        df = DataFrame({"values": [1, 1, 1], "weights": [1, np.nan, np.nan]})
+        expected = df["weights"].copy()
+
+        df.sample(frac=1.0, replace=True, weights="weights")
+        result = df["weights"]
+        tm.assert_series_equal(result, expected)
+
     def test_sample_ignore_index(self):
         # GH 38581
         df = DataFrame(

From c760df55e50caeffa128432ac7fd034570c4395f Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Wed, 4 Aug 2021 15:24:51 -0700
Subject: [PATCH 06/12] PERF: Groupby.shift dont re-call
 libgroupby.group_shift_indexer (#42885)

---
 pandas/core/groupby/groupby.py                 | 18 +++++++++++-------
 .../tests/groupby/transform/test_transform.py  |  2 +-
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index e57e48cb3ab11..5f9b1dec062f8 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3031,15 +3031,19 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         if freq is not None or axis != 0:
             return self.apply(lambda x: x.shift(periods, freq, axis, fill_value))
 
-        return self._get_cythonized_result(
-            "group_shift_indexer",
-            numeric_only=False,
-            cython_dtype=np.dtype(np.int64),
-            needs_ngroups=True,
-            result_is_index=True,
-            periods=periods,
+        ids, _, ngroups = self.grouper.group_info
+        res_indexer = np.zeros(len(ids), dtype=np.int64)
+
+        libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods)
+
+        obj = self._obj_with_exclusions
+
+        res = obj._reindex_with_indexers(
+            {self.axis: (obj.axes[self.axis], res_indexer)},
             fill_value=fill_value,
+            allow_dups=True,
         )
+        return res
 
     @final
     @Substitution(name="groupby")
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 6275a5440a0e2..441cbfe66f1d8 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -183,7 +183,7 @@ def test_transform_axis_1(request, transformation_func, using_array_manager):
         result = df.groupby([0, 0, 1], axis=1).transform(transformation_func, *args)
         expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T
 
-    if transformation_func == "diff":
+    if transformation_func in ["diff", "shift"]:
         # Result contains nans, so transpose coerces to float
         expected["b"] = expected["b"].astype("int64")
 

From c5e236b6cc586a627f9364843ce00d2f293d57b4 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 4 Aug 2021 16:46:18 -0700
Subject: [PATCH 07/12] ENH: Add BytesIOWrapper (#42669)

---
 pandas/io/common.py            | 56 ++++++++++++++++++++++++++++++++--
 pandas/tests/io/test_common.py | 42 +++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 06b00a9cbb4eb..4e97eaf8b953c 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -6,11 +6,13 @@
 from collections import abc
 import dataclasses
 import gzip
+import io
 from io import (
     BufferedIOBase,
     BytesIO,
     RawIOBase,
     StringIO,
+    TextIOBase,
     TextIOWrapper,
 )
 import mmap
@@ -50,7 +52,6 @@
 
 lzma = import_lzma()
 
-
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard("")
 
@@ -102,7 +103,7 @@ def close(self) -> None:
         avoid closing the potentially user-created buffer.
         """
         if self.is_wrapped:
-            assert isinstance(self.handle, TextIOWrapper)
+            assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper))
             self.handle.flush()
             self.handle.detach()
             self.created_handles.remove(self.handle)
@@ -712,7 +713,16 @@ def get_handle(
 
     # Convert BytesIO or file objects passed with an encoding
     is_wrapped = False
-    if is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
+    if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
+        handle = BytesIOWrapper(
+            handle,
+            encoding=ioargs.encoding,
+        )
+        handles.append(handle)
+        # the (text) handle is always provided by the caller
+        # since get_handle would have opened it in binary mode
+        is_wrapped = True
+    elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
         handle = TextIOWrapper(
             # error: Argument 1 to "TextIOWrapper" has incompatible type
             # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
@@ -878,6 +888,46 @@ def __next__(self) -> str:
         return newline.lstrip("\n")
 
 
+# Wrapper that wraps a StringIO buffer and reads bytes from it
+# Created for compat with pyarrow read_csv
+class BytesIOWrapper(io.BytesIO):
+    buffer: StringIO | TextIOBase | None
+
+    def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"):
+        self.buffer = buffer
+        self.encoding = encoding
+        # Because a character can be encoded as more than one byte, a read
+        # of n characters can produce more than n bytes. The extra bytes are
+        # kept in this overflow variable and prepended to the bytestring
+        # returned by the next read.
+        self.overflow = b""
+
+    def __getattr__(self, attr: str):
+        return getattr(self.buffer, attr)
+
+    def read(self, n: int | None = -1) -> bytes:
+        assert self.buffer is not None
+        bytestring = self.buffer.read(n).encode(self.encoding)
+        # When n=-1/n greater than remaining bytes: Read entire file/rest of file
+        combined_bytestring = self.overflow + bytestring
+        if n is None or n < 0 or n >= len(combined_bytestring):
+            self.overflow = b""
+            return combined_bytestring
+        else:
+            to_return = combined_bytestring[:n]
+            self.overflow = combined_bytestring[n:]
+            return to_return
+
+    def detach(self):
+        # Slightly modified from Python's TextIOWrapper detach method
+        if self.buffer is None:
+            raise ValueError("buffer is already detached")
+        self.flush()
+        buffer = self.buffer
+        self.buffer = None
+        return buffer
+
+
 def _maybe_memory_map(
     handle: FileOrBuffer,
     memory_map: bool,
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index d52ea01ac35de..b48d676cd0f8a 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -135,6 +135,48 @@ def test_get_handle_with_buffer(self):
         assert not input_buffer.closed
         input_buffer.close()
 
+    # Test that BytesIOWrapper(get_handle) returns correct amount of bytes every time
+    def test_bytesiowrapper_returns_correct_bytes(self):
+        # Test latin1, ucs-2, and ucs-4 chars
+        data = """a,b,c
+1,2,3
+©,®,®
+Look,a snake,🐍"""
+        with icom.get_handle(StringIO(data), "rb", is_text=False) as handles:
+            result = b""
+            chunksize = 5
+            while True:
+                chunk = handles.handle.read(chunksize)
+                # Make sure each chunk has at most chunksize bytes
+                assert len(chunk) <= chunksize
+                if len(chunk) < chunksize:
+                    # A chunk may be shorter than chunksize, but only at EOF,
+                    # in which case the next read returns empty
+                    assert len(handles.handle.read()) == 0
+                    result += chunk
+                    break
+                result += chunk
+            assert result == data.encode("utf-8")
+
+    # Test that pyarrow can handle a file opened with get_handle
+    @td.skip_if_no("pyarrow", min_version="0.15.0")
+    def test_get_handle_pyarrow_compat(self):
+        from pyarrow import csv
+
+        # Test latin1, ucs-2, and ucs-4 chars
+        data = """a,b,c
+1,2,3
+©,®,®
+Look,a snake,🐍"""
+        expected = pd.DataFrame(
+            {"a": ["1", "©", "Look"], "b": ["2", "®", "a snake"], "c": ["3", "®", "🐍"]}
+        )
+        s = StringIO(data)
+        with icom.get_handle(s, "rb", is_text=False) as handles:
+            df = csv.read_csv(handles.handle).to_pandas()
+            tm.assert_frame_equal(df, expected)
+            assert not s.closed
+
     def test_iterator(self):
         with pd.read_csv(StringIO(self.data1), chunksize=1) as reader:
             result = pd.concat(reader, ignore_index=True)

From d68aa43ef65eed66c2965e990f75bd91687b8cec Mon Sep 17 00:00:00 2001
From: Francois Dion <fdion@dionresearch.com>
Date: Wed, 4 Aug 2021 19:51:05 -0400
Subject: [PATCH 08/12] BUG: column names with degree sign make query fail
 (#42826)

---
 doc/source/whatsnew/v1.4.0.rst        |  2 ++
 pandas/core/computation/parsing.py    |  1 +
 pandas/tests/computation/test_eval.py | 10 ++++++++++
 3 files changed, 13 insertions(+)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 6763c3043b102..16474dd83a1f5 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -234,6 +234,8 @@ Indexing
 - Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
 - Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` when passing an ascending value, failed to raise or incorrectly raising ``ValueError`` (:issue:`41634`)
 - Bug in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
+- Bug in :meth:`DataFrame.query` where a degree sign in a backticked column name, such as \`Temp(°C)\`, caused the query expression to fail (:issue:`42826`)
+-
 
 Missing
 ^^^^^^^
diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py
index 5e000116d19f2..89d1f2133f77a 100644
--- a/pandas/core/computation/parsing.py
+++ b/pandas/core/computation/parsing.py
@@ -49,6 +49,7 @@ def create_valid_python_identifier(name: str) -> str:
             "!": "_EXCLAMATIONMARK_",
             "$": "_DOLLARSIGN_",
             "€": "_EUROSIGN_",
+            "°": "_DEGREESIGN_",
             # Including quotes works, but there are exceptions.
             "'": "_SINGLEQUOTE_",
             '"': "_DOUBLEQUOTE_",
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
index f27112dbd3956..99c3cac9ba976 100644
--- a/pandas/tests/computation/test_eval.py
+++ b/pandas/tests/computation/test_eval.py
@@ -2035,6 +2035,16 @@ def test_truediv_deprecated(engine, parser):
     assert match in str(m[0].message)
 
 
+@pytest.mark.parametrize("column", ["Temp(°C)", "Capacitance(μF)"])
+def test_query_token(engine, column):
+    # See: https://github.com/pandas-dev/pandas/pull/42826
+    df = DataFrame(np.random.randn(5, 2), columns=[column, "b"])
+    expected = df[df[column] > 5]
+    query_string = f"`{column}` > 5"
+    result = df.query(query_string, engine=engine)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_negate_lt_eq_le(engine, parser):
     df = DataFrame([[0, 10], [1, 20]], columns=["cat", "count"])
     expected = df[~(df.cat > 0)]

From 7d96743d067ed4cac8173d55baa048eef5f8972a Mon Sep 17 00:00:00 2001
From: Andrew Hawyrluk <50434302+ahawryluk@users.noreply.github.com>
Date: Wed, 4 Aug 2021 17:52:02 -0600
Subject: [PATCH 09/12] DOC: add cookbook link about hidden sheets (#42874)

---
 doc/source/user_guide/cookbook.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
index 5f3da133d9c09..03221e71ea32a 100644
--- a/doc/source/user_guide/cookbook.rst
+++ b/doc/source/user_guide/cookbook.rst
@@ -1211,6 +1211,9 @@ The :ref:`Excel <io.excel>` docs
 `Modifying formatting in XlsxWriter output
 <https://pbpython.com/improve-pandas-excel-output.html>`__
 
+`Loading only visible sheets
+<https://github.com/pandas-dev/pandas/issues/19842#issuecomment-892150745>`__
+
 .. _cookbook.html:
 
 HTML

From 28849e387e63a94e59e986066a13310d96397430 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Wed, 4 Aug 2021 17:05:41 -0700
Subject: [PATCH 10/12] DEPR: unused kind arg in Index methods (#42857)

---
 doc/source/whatsnew/v1.4.0.rst                |  2 ++
 pandas/core/indexes/base.py                   | 20 +++++++++++++++----
 pandas/core/indexes/datetimes.py              |  4 +++-
 pandas/core/indexes/multi.py                  | 11 ++++++++--
 pandas/core/indexes/numeric.py                |  2 +-
 .../tests/indexes/base_class/test_indexing.py |  8 +++++---
 pandas/tests/indexes/multi/test_indexing.py   |  3 ++-
 pandas/tests/indexes/numeric/test_indexing.py |  7 +++++--
 8 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 16474dd83a1f5..7395f9d2dcb9e 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -160,6 +160,8 @@ Deprecations
 - Deprecated treating ``numpy.datetime64`` objects as UTC times when passed to the :class:`Timestamp` constructor along with a timezone. In a future version, these will be treated as wall-times. To retain the old behavior, use ``Timestamp(dt64).tz_localize("UTC").tz_convert(tz)`` (:issue:`24559`)
 - Deprecated ignoring missing labels when indexing with a sequence of labels on a level of a MultiIndex (:issue:`42351`)
 - Creating an empty Series without a dtype will now raise a more visible ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`30017`)
+- Deprecated the ``kind`` argument in :meth:`Index.get_slice_bound`, :meth:`Index.slice_indexer` and :meth:`Index.slice_locs`; in a future version passing ``kind`` will raise (:issue:`42857`)
+-
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 54271f0f9b492..1c94baf74b60b 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5839,7 +5839,7 @@ def slice_indexer(
         start: Hashable | None = None,
         end: Hashable | None = None,
         step: int | None = None,
-        kind: str_t | None = None,
+        kind=no_default,
     ) -> slice:
         """
         Compute the slice indexer for input labels and step.
@@ -5855,6 +5855,8 @@ def slice_indexer(
         step : int, default None
         kind : str, default None
 
+            .. deprecated:: 1.4.0
+
         Returns
         -------
         indexer : slice
@@ -5880,6 +5882,8 @@ def slice_indexer(
         >>> idx.slice_indexer(start='b', end=('c', 'g'))
         slice(1, 3, None)
         """
+        self._deprecated_arg(kind, "kind", "slice_indexer")
+
         start_slice, end_slice = self.slice_locs(start, end, step=step)
 
         # return a slice
@@ -5928,6 +5932,8 @@ def _maybe_cast_slice_bound(self, label, side: str_t, kind=no_default):
         side : {'left', 'right'}
         kind : {'loc', 'getitem'} or None
 
+            .. deprecated:: 1.3.0
+
         Returns
         -------
         label : object
@@ -5962,7 +5968,7 @@ def _searchsorted_monotonic(self, label, side: str_t = "left"):
 
         raise ValueError("index must be monotonic increasing or decreasing")
 
-    def get_slice_bound(self, label, side: str_t, kind=None) -> int:
+    def get_slice_bound(self, label, side: str_t, kind=no_default) -> int:
         """
         Calculate slice bound that corresponds to given label.
 
@@ -5975,12 +5981,15 @@ def get_slice_bound(self, label, side: str_t, kind=None) -> int:
         side : {'left', 'right'}
         kind : {'loc', 'getitem'} or None
 
+            .. deprecated:: 1.4.0
+
         Returns
         -------
         int
             Index of label.
         """
-        assert kind in ["loc", "getitem", None]
+        assert kind in ["loc", "getitem", None, no_default]
+        self._deprecated_arg(kind, "kind", "get_slice_bound")
 
         if side not in ("left", "right"):
             raise ValueError(
@@ -6030,7 +6039,7 @@ def get_slice_bound(self, label, side: str_t, kind=None) -> int:
             else:
                 return slc
 
-    def slice_locs(self, start=None, end=None, step=None, kind=None):
+    def slice_locs(self, start=None, end=None, step=None, kind=no_default):
         """
         Compute slice locations for input labels.
 
@@ -6044,6 +6053,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
             If None, defaults to 1.
         kind : {'loc', 'getitem'} or None
 
+            .. deprecated:: 1.4.0
+
         Returns
         -------
         start, end : int
@@ -6062,6 +6073,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
         >>> idx.slice_locs(start='b', end='c')
         (1, 3)
         """
+        self._deprecated_arg(kind, "kind", "slice_locs")
         inc = step is None or step >= 0
 
         if not inc:
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 9712a5d95a234..348598c1309eb 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -729,7 +729,7 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default):
 
         return self._maybe_cast_for_get_loc(label)
 
-    def slice_indexer(self, start=None, end=None, step=None, kind=None):
+    def slice_indexer(self, start=None, end=None, step=None, kind=lib.no_default):
         """
         Return indexer for specified label slice.
         Index.slice_indexer, customized to handle time slicing.
@@ -743,6 +743,8 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
           value-based selection in non-monotonic cases.
 
         """
+        self._deprecated_arg(kind, "kind", "slice_indexer")
+
         # For historical reasons DatetimeIndex supports slices between two
         # instances of datetime.time as if it were applying a slice mask to
         # an array of (self.hour, self.minute, self.seconds, self.microsecond).
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index b4e8a763d1210..e5aa8e95e23de 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -2586,7 +2586,7 @@ def _get_indexer_level_0(self, target) -> np.ndarray:
         return ci.get_indexer_for(target)
 
     def get_slice_bound(
-        self, label: Hashable | Sequence[Hashable], side: str, kind: str | None = None
+        self, label: Hashable | Sequence[Hashable], side: str, kind=lib.no_default
     ) -> int:
         """
         For an ordered MultiIndex, compute slice bound
@@ -2601,6 +2601,8 @@ def get_slice_bound(
         side : {'left', 'right'}
         kind : {'loc', 'getitem', None}
 
+            .. deprecated:: 1.4.0
+
         Returns
         -------
         int
@@ -2632,11 +2634,13 @@ def get_slice_bound(
         MultiIndex.get_locs : Get location for a label/slice/list/mask or a
                               sequence of such.
         """
+        self._deprecated_arg(kind, "kind", "get_slice_bound")
+
         if not isinstance(label, tuple):
             label = (label,)
         return self._partial_tup_index(label, side=side)
 
-    def slice_locs(self, start=None, end=None, step=None, kind=None):
+    def slice_locs(self, start=None, end=None, step=None, kind=lib.no_default):
         """
         For an ordered MultiIndex, compute the slice locations for input
         labels.
@@ -2655,6 +2659,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
             Slice step
         kind : string, optional, defaults None
 
+            .. deprecated:: 1.4.0
+
         Returns
         -------
         (start, end) : (int, int)
@@ -2688,6 +2694,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
         MultiIndex.get_locs : Get location for a label/slice/list/mask or a
                               sequence of such.
         """
+        self._deprecated_arg(kind, "kind", "slice_locs")
         # This function adds nothing to its parent implementation (the magic
         # happens in get_slice_bound method), but it adds meaningful doc.
         return super().slice_locs(start, end, step)
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index bb9a2688f0485..d31f6d6a252f3 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -244,7 +244,7 @@ def _convert_slice_indexer(self, key: slice, kind: str):
 
             # We always treat __getitem__ slicing as label-based
             # translate to locations
-            return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
+            return self.slice_indexer(key.start, key.stop, key.step)
 
         return super()._convert_slice_indexer(key, kind=kind)
 
diff --git a/pandas/tests/indexes/base_class/test_indexing.py b/pandas/tests/indexes/base_class/test_indexing.py
index fd04a820037b9..654f5a89f1828 100644
--- a/pandas/tests/indexes/base_class/test_indexing.py
+++ b/pandas/tests/indexes/base_class/test_indexing.py
@@ -10,7 +10,8 @@ class TestGetSliceBounds:
     @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)])
     def test_get_slice_bounds_within(self, kind, side, expected):
         index = Index(list("abcdef"))
-        result = index.get_slice_bound("e", kind=kind, side=side)
+        with tm.assert_produces_warning(FutureWarning, match="'kind' argument"):
+            result = index.get_slice_bound("e", kind=kind, side=side)
         assert result == expected
 
     @pytest.mark.parametrize("kind", ["getitem", "loc", None])
@@ -20,12 +21,13 @@ def test_get_slice_bounds_within(self, kind, side, expected):
     )
     def test_get_slice_bounds_outside(self, kind, side, expected, data, bound):
         index = Index(data)
-        result = index.get_slice_bound(bound, kind=kind, side=side)
+        with tm.assert_produces_warning(FutureWarning, match="'kind' argument"):
+            result = index.get_slice_bound(bound, kind=kind, side=side)
         assert result == expected
 
     def test_get_slice_bounds_invalid_side(self):
         with pytest.raises(ValueError, match="Invalid value for side kwarg"):
-            Index([]).get_slice_bound("a", kind=None, side="middle")
+            Index([]).get_slice_bound("a", side="middle")
 
 
 class TestGetIndexerNonUnique:
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
index d2afc76076dc0..e142cbf89f1bd 100644
--- a/pandas/tests/indexes/multi/test_indexing.py
+++ b/pandas/tests/indexes/multi/test_indexing.py
@@ -820,7 +820,8 @@ def test_timestamp_multiindex_indexer():
 def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo):
     # issue 19132
     idx = MultiIndex.from_arrays(index_arr)
-    result = idx.get_slice_bound(target, side=algo, kind="loc")
+    with tm.assert_produces_warning(FutureWarning, match="'kind' argument"):
+        result = idx.get_slice_bound(target, side=algo, kind="loc")
     assert result == expected
 
 
diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
index e6b418868dbeb..8f113491dad60 100644
--- a/pandas/tests/indexes/numeric/test_indexing.py
+++ b/pandas/tests/indexes/numeric/test_indexing.py
@@ -545,7 +545,9 @@ class TestGetSliceBounds:
     @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)])
     def test_get_slice_bounds_within(self, kind, side, expected):
         index = Index(range(6))
-        result = index.get_slice_bound(4, kind=kind, side=side)
+        with tm.assert_produces_warning(FutureWarning, match="'kind' argument"):
+
+            result = index.get_slice_bound(4, kind=kind, side=side)
         assert result == expected
 
     @pytest.mark.parametrize("kind", ["getitem", "loc", None])
@@ -553,5 +555,6 @@ def test_get_slice_bounds_within(self, kind, side, expected):
     @pytest.mark.parametrize("bound, expected", [(-1, 0), (10, 6)])
     def test_get_slice_bounds_outside(self, kind, side, expected, bound):
         index = Index(range(6))
-        result = index.get_slice_bound(bound, kind=kind, side=side)
+        with tm.assert_produces_warning(FutureWarning, match="'kind' argument"):
+            result = index.get_slice_bound(bound, kind=kind, side=side)
         assert result == expected

From 6ef154eb7c44b8f56c8f11a324bb207363c5879e Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Wed, 4 Aug 2021 17:15:25 -0700
Subject: [PATCH 11/12] REF: date arg not reachable in
 DTI._maybe_cast_slice_bound (#42855)

---
 pandas/core/indexes/datetimes.py                | 9 ++++++++-
 pandas/tests/indexes/datetimes/test_indexing.py | 4 ++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 348598c1309eb..97c648013f9d1 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -722,7 +722,7 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default):
             if self._is_strictly_monotonic_decreasing and len(self) > 1:
                 return upper if side == "left" else lower
             return lower if side == "left" else upper
-        elif isinstance(label, (self._data._recognized_scalars, date)):
+        elif isinstance(label, self._data._recognized_scalars):
             self._deprecate_mismatched_indexing(label)
         else:
             raise self._invalid_indexer("slice", label)
@@ -802,6 +802,13 @@ def check_str_or_none(point):
         else:
             return indexer
 
+    @doc(Index.get_slice_bound)
+    def get_slice_bound(self, label, side: str, kind=lib.no_default) -> int:
+        # GH#42855 handle date here instead of _maybe_cast_slice_bound
+        if isinstance(label, date) and not isinstance(label, datetime):
+            label = Timestamp(label).to_pydatetime()
+        return super().get_slice_bound(label, side=side, kind=kind)
+
     # --------------------------------------------------------------------
 
     @property
diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
index d705fa7f0ed2c..6eaf799ae2779 100644
--- a/pandas/tests/indexes/datetimes/test_indexing.py
+++ b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -746,7 +746,7 @@ def test_get_slice_bounds_datetime_within(
             result = index.get_slice_bound(key, kind=kind, side=side)
         assert result == expected
 
-    @pytest.mark.parametrize("box", [date, datetime, Timestamp])
+    @pytest.mark.parametrize("box", [datetime, Timestamp])
     @pytest.mark.parametrize("kind", ["getitem", "loc", None])
     @pytest.mark.parametrize("side", ["left", "right"])
     @pytest.mark.parametrize("year, expected", [(1999, 0), (2020, 30)])
@@ -764,7 +764,7 @@ def test_get_slice_bounds_datetime_outside(
             result = index.get_slice_bound(key, kind=kind, side=side)
         assert result == expected
 
-    @pytest.mark.parametrize("box", [date, datetime, Timestamp])
+    @pytest.mark.parametrize("box", [datetime, Timestamp])
     @pytest.mark.parametrize("kind", ["getitem", "loc", None])
     def test_slice_datetime_locs(self, box, kind, tz_aware_fixture):
         # GH 34077

From e045034e5c89b932a526d6c9e691d3031784c377 Mon Sep 17 00:00:00 2001
From: Shoham Debnath <debnathshoham@gmail.com>
Date: Thu, 5 Aug 2021 11:22:22 +0530
Subject: [PATCH 12/12] TST: raising ValueError when inserting one dataframe in
 another (#42831)

* TST: raising ValueError when inserting one dataframe in another

* added GH issue reference

* rev msg

* included both msgs

* updated

* Update test_insert.py

* Update test_insert.py

* Update test_insert.py
---
 pandas/tests/frame/indexing/test_insert.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py
index 4f5ec8eff29a6..c2c862be42625 100644
--- a/pandas/tests/frame/indexing/test_insert.py
+++ b/pandas/tests/frame/indexing/test_insert.py
@@ -89,3 +89,13 @@ def test_insert_item_cache(self, using_array_manager):
         ser.values[0] = 99
 
         assert df.iloc[0, 0] == df[0][0]
+
+    def test_insert_frame(self):
+        # GH#42403 inserting a whole DataFrame as a single column must raise
+        frame = DataFrame({"col1": [1, 2], "col2": [3, 4]})
+        expected_msg = (
+            "Expected a 1D array, got an array with shape "
+            r"\(2, 2\)|Wrong number of items passed 2, placement implies 1"
+        )
+        with pytest.raises(ValueError, match=expected_msg):
+            frame.insert(1, "newcol", frame)