From 074b8baba988b289049a46d907f60fa5d26cb3e1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Aug 2021 17:26:07 +0200 Subject: [PATCH 01/12] TST: update message in skip_array_manager mark (#42877) --- pandas/util/_test_decorators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 62e31c0e46715..b78f1652dc419 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -286,7 +286,8 @@ def async_mark(): skip_array_manager_not_yet_implemented = pytest.mark.skipif( - get_option("mode.data_manager") == "array", reason="JSON C code relies on Blocks" + get_option("mode.data_manager") == "array", + reason="Not yet implemented for ArrayManager", ) skip_array_manager_invalid_test = pytest.mark.skipif( From 74e50ec515d668842f6ce55ef4d96a0f6001ccd8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Aug 2021 17:44:59 +0200 Subject: [PATCH 02/12] TST: remove chained assignment outside indexing tests (#42882) --- pandas/tests/frame/methods/test_dropna.py | 2 +- .../tests/frame/methods/test_interpolate.py | 26 +++++++++---------- pandas/tests/frame/methods/test_isin.py | 6 ++--- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 76a6f3aa25362..bc2b48d3312d7 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -66,7 +66,7 @@ def test_dropIncompleteRows(self, float_frame): def test_dropna(self): df = DataFrame(np.random.randn(6, 4)) - df[2][:2] = np.nan + df.iloc[:2, 2] = np.nan dropped = df.dropna(axis=1) expected = df.loc[:, [0, 1, 3]] diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index d0551ffd5cffe..7a749f3705e35 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -102,34 +102,34 @@ def test_interp_various(self): expected = df.copy() result = df.interpolate(method="polynomial", order=1) - expected.A.loc[3] = 2.66666667 - expected.A.loc[13] = 5.76923076 + expected.loc[3, "A"] = 2.66666667 + expected.loc[13, "A"] = 5.76923076 tm.assert_frame_equal(result, expected) result = df.interpolate(method="cubic") # GH #15662. - expected.A.loc[3] = 2.81547781 - expected.A.loc[13] = 5.52964175 + expected.loc[3, "A"] = 2.81547781 + expected.loc[13, "A"] = 5.52964175 tm.assert_frame_equal(result, expected) result = df.interpolate(method="nearest") - expected.A.loc[3] = 2 - expected.A.loc[13] = 5 + expected.loc[3, "A"] = 2 + expected.loc[13, "A"] = 5 tm.assert_frame_equal(result, expected, check_dtype=False) result = df.interpolate(method="quadratic") - expected.A.loc[3] = 2.82150771 - expected.A.loc[13] = 6.12648668 + expected.loc[3, "A"] = 2.82150771 + expected.loc[13, "A"] = 6.12648668 tm.assert_frame_equal(result, expected) result = df.interpolate(method="slinear") - expected.A.loc[3] = 2.66666667 - expected.A.loc[13] = 5.76923077 + expected.loc[3, "A"] = 2.66666667 + expected.loc[13, "A"] = 5.76923077 tm.assert_frame_equal(result, expected) result = df.interpolate(method="zero") - expected.A.loc[3] = 2.0 - expected.A.loc[13] = 5 + expected.loc[3, "A"] = 2.0 + expected.loc[13, "A"] = 5 tm.assert_frame_equal(result, expected, check_dtype=False) @td.skip_if_no_scipy @@ -218,7 +218,7 @@ def test_interp_leading_nans(self, check_scipy): ) result = df.interpolate() expected = df.copy() - expected["B"].loc[3] = -3.75 + expected.loc[3, "B"] = -3.75 tm.assert_frame_equal(result, expected) if check_scipy: diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index d2ebd09c4cc48..e924963f588f3 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -79,8 +79,8 @@ def test_isin_df(self): df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) expected = DataFrame(False, df1.index, df1.columns) result = df1.isin(df2) - expected["A"].loc[[1, 3]] = True - expected["B"].loc[[0, 2]] = True + expected.loc[[1, 3], "A"] = True + expected.loc[[0, 2], "B"] = True tm.assert_frame_equal(result, expected) # partial overlapping columns @@ -133,7 +133,7 @@ def test_isin_against_series(self): ) s = Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) expected = DataFrame(False, index=df.index, columns=df.columns) - expected["A"].loc["a"] = True + expected.loc["a", "A"] = True expected.loc["d"] = True result = df.isin(s) tm.assert_frame_equal(result, expected) From c182565be618929c17d7afe365ef17cfade5ca89 Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Thu, 5 Aug 2021 03:25:14 +0530 Subject: [PATCH 03/12] read_excel() modifies provided types dict when accessing file with duplicate column (#42508) --- doc/source/whatsnew/v1.3.2.rst | 1 + pandas/io/parsers/python_parser.py | 4 ++-- pandas/tests/io/excel/test_readers.py | 6 +++++- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 6 +++++- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index c716460e997d0..4e6ea85e2ff1d 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -30,6 +30,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :meth:`pandas.read_excel` modifies the dtypes dictionary when reading a file with duplicate columns (:issue:`42462`) - 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`) - :meth:`.Styler.hide_columns` now hides the index name header row as well as column headers (:issue:`42101`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 7c9fcde08bf24..af253fc062632 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -4,6 +4,7 @@ abc, defaultdict, ) +from copy import copy import csv from io import StringIO import re @@ -81,7 +82,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): self.verbose = kwds["verbose"] self.converters = kwds["converters"] - self.dtype = kwds["dtype"] + self.dtype = copy(kwds["dtype"]) self.thousands = kwds["thousands"] self.decimal = kwds["decimal"] @@ -432,7 +433,6 @@ def _infer_columns(self): and self.dtype.get(col) is None ): self.dtype.update({col: self.dtype.get(old_col)}) - this_columns[i] = col counts[col] = cur_count + 1 elif have_mi_columns: diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index cbd241ceda0b1..f999733192725 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -576,8 +576,12 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): # GH#35211 basename = "df_mangle_dup_col_dtypes" - result = pd.read_excel(basename + read_ext, dtype={"a": str, **dtypes}) + dtype_dict = {"a": str, **dtypes} + dtype_dict_copy = dtype_dict.copy() + # GH#42462 + result = pd.read_excel(basename + read_ext, dtype=dtype_dict) expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + assert dtype_dict == dtype_dict_copy, "dtype dict changed" tm.assert_frame_equal(result, expected) def test_reader_spaces(self, read_ext): diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 6ed52ed86af2a..32a7ac44c0b38 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -245,8 +245,12 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): # GH#35211 parser = all_parsers data = """a,a\n1,1""" - result = parser.read_csv(StringIO(data), dtype={"a": str, **dtypes}) + dtype_dict = {"a": str, **dtypes} + # GH#42462 + dtype_dict_copy = dtype_dict.copy() + result = parser.read_csv(StringIO(data), dtype=dtype_dict) expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + assert dtype_dict == dtype_dict_copy, "dtype dict changed" tm.assert_frame_equal(result, expected) From 17c6798a5ebadce2ee5ae40963935b562e3287bb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Aug 2021 15:18:04 -0700 Subject: [PATCH 04/12] BUG: ArrayManager reindex with copy=True not copying (#42647) --- pandas/core/generic.py | 2 ++ pandas/core/internals/array_manager.py | 2 ++ pandas/tests/frame/methods/test_reindex.py | 14 ++++++++++++++ 3 files changed, 18 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 19dd06074bf78..2eace06c7bd8d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4840,6 +4840,8 @@ def _reindex_axes( copy=copy, allow_dups=False, ) + # If we've made a copy once, no need to make another one + copy = False return obj diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 79c0aad66229c..3c429597ea3e2 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -601,6 +601,8 @@ def _reindex_indexer( ) else: arr = self.arrays[i] + if copy: + arr = arr.copy() new_arrays.append(arr) else: diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index d0765084adfa9..6b73c6a662da7 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -83,6 +83,20 @@ class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in # test_indexing + def test_reindex_copies(self): + # based on asv time_reindex_axis1 + N = 10 + df = DataFrame(np.random.randn(N * 10, N)) + cols = np.arange(N) + np.random.shuffle(cols) + + result = df.reindex(columns=cols, copy=True) + assert not np.shares_memory(result[0]._values, df[0]._values) + + # pass both columns and index + result2 = df.reindex(columns=cols, index=df.index, copy=True) + assert not np.shares_memory(result2[0]._values, df[0]._values) + def test_reindex_date_fill_value(self): # passing date to dt64 is deprecated arr = date_range("2016-01-01", periods=6).values.reshape(3, 2) From 2d9ca9d91cecc44c80fd89b0c548158804f39348 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> Date: Wed, 4 Aug 2021 18:19:13 -0400 Subject: [PATCH 05/12] REGR: sample modifying `weights` inplace (#42843) --- pandas/core/sample.py | 6 +++++- pandas/tests/frame/methods/test_sample.py | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pandas/core/sample.py b/pandas/core/sample.py index e4bad22e8e43c..63b8789f3f551 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -63,7 +63,11 @@ def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray: if (weights < 0).any(): raise ValueError("weight vector many not include negative values") - weights[np.isnan(weights)] = 0 + missing = np.isnan(weights) + if missing.any(): + # Don't modify weights in place + weights = weights.copy() + weights[missing] = 0 return weights diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 366722531329a..d5d1f975deefa 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -339,6 +339,24 @@ def test_sample_is_copy(self): with tm.assert_produces_warning(None): df2["d"] = 1 + def test_sample_does_not_modify_weights(self): + # GH-42843 + result = np.array([np.nan, 1, np.nan]) + expected = result.copy() + ser = Series([1, 2, 3]) + + # Test numpy array weights won't be modified in place + ser.sample(weights=result) + tm.assert_numpy_array_equal(result, expected) + + # Test DataFrame column won't be modified in place + df = DataFrame({"values": [1, 1, 1], "weights": [1, np.nan, np.nan]}) + expected = df["weights"].copy() + + df.sample(frac=1.0, replace=True, weights="weights") + result = df["weights"] + tm.assert_series_equal(result, expected) + def test_sample_ignore_index(self): # GH 38581 df = DataFrame( From c760df55e50caeffa128432ac7fd034570c4395f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Aug 2021 15:24:51 -0700 Subject: [PATCH 06/12] PERF: Groupby.shift dont re-call libgroupby.group_shift_indexer (#42885) --- pandas/core/groupby/groupby.py | 18 +++++++++++------- .../tests/groupby/transform/test_transform.py | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e57e48cb3ab11..5f9b1dec062f8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3031,15 +3031,19 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if freq is not None or axis != 0: return self.apply(lambda x: x.shift(periods, freq, axis, fill_value)) - return self._get_cythonized_result( - "group_shift_indexer", - numeric_only=False, - cython_dtype=np.dtype(np.int64), - needs_ngroups=True, - result_is_index=True, - periods=periods, + ids, _, ngroups = self.grouper.group_info + res_indexer = np.zeros(len(ids), dtype=np.int64) + + libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods) + + obj = self._obj_with_exclusions + + res = obj._reindex_with_indexers( + {self.axis: (obj.axes[self.axis], res_indexer)}, fill_value=fill_value, + allow_dups=True, ) + return res @final @Substitution(name="groupby") diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 6275a5440a0e2..441cbfe66f1d8 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -183,7 +183,7 @@ def test_transform_axis_1(request, transformation_func, using_array_manager): result = df.groupby([0, 0, 1], axis=1).transform(transformation_func, *args) expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T - if transformation_func == "diff": + if transformation_func in ["diff", "shift"]: # Result contains nans, so transpose coerces to float expected["b"] = expected["b"].astype("int64") From c5e236b6cc586a627f9364843ce00d2f293d57b4 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 4 Aug 2021 16:46:18 -0700 Subject: [PATCH 07/12] ENH: Add BytesIOWrapper (#42669) --- pandas/io/common.py | 56 ++++++++++++++++++++++++++++++++-- pandas/tests/io/test_common.py | 42 +++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 06b00a9cbb4eb..4e97eaf8b953c 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -6,11 +6,13 @@ from collections import abc import dataclasses import gzip +import io from io import ( BufferedIOBase, BytesIO, RawIOBase, StringIO, + TextIOBase, TextIOWrapper, ) import mmap @@ -50,7 +52,6 @@ lzma = import_lzma() - _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") @@ -102,7 +103,7 @@ def close(self) -> None: avoid closing the potentially user-created buffer. """ if self.is_wrapped: - assert isinstance(self.handle, TextIOWrapper) + assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper)) self.handle.flush() self.handle.detach() self.created_handles.remove(self.handle) @@ -712,7 +713,16 @@ def get_handle( # Convert BytesIO or file objects passed with an encoding is_wrapped = False - if is_text and (compression or _is_binary_mode(handle, ioargs.mode)): + if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase): + handle = BytesIOWrapper( + handle, + encoding=ioargs.encoding, + ) + handles.append(handle) + # the (text) handle is always provided by the caller + # since get_handle would have opened it in binary mode + is_wrapped = True + elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)): handle = TextIOWrapper( # error: Argument 1 to "TextIOWrapper" has incompatible type # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]"; @@ -878,6 +888,46 @@ def __next__(self) -> str: return newline.lstrip("\n") +# Wrapper that wraps a StringIO buffer and reads bytes from it +# Created for compat with pyarrow read_csv +class BytesIOWrapper(io.BytesIO): + buffer: StringIO | TextIOBase | None + + def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"): + self.buffer = buffer + self.encoding = encoding + # Because a character can be represented by more than 1 byte, + # it is possible that reading will produce more bytes than n + # We store the extra bytes in this overflow variable, and append the + # overflow to the front of the bytestring the next time reading is performed + self.overflow = b"" + + def __getattr__(self, attr: str): + return getattr(self.buffer, attr) + + def read(self, n: int | None = -1) -> bytes: + assert self.buffer is not None + bytestring = self.buffer.read(n).encode(self.encoding) + # When n=-1/n greater than remaining bytes: Read entire file/rest of file + combined_bytestring = self.overflow + bytestring + if n is None or n < 0 or n >= len(combined_bytestring): + self.overflow = b"" + return combined_bytestring + else: + to_return = combined_bytestring[:n] + self.overflow = combined_bytestring[n:] + return to_return + + def detach(self): + # Slightly modified from Python's TextIOWrapper detach method + if self.buffer is None: + raise ValueError("buffer is already detached") + self.flush() + buffer = self.buffer + self.buffer = None + return buffer + + def _maybe_memory_map( handle: FileOrBuffer, memory_map: bool, diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index d52ea01ac35de..b48d676cd0f8a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -135,6 +135,48 @@ def test_get_handle_with_buffer(self): assert not input_buffer.closed input_buffer.close() + # Test that BytesIOWrapper(get_handle) returns correct amount of bytes every time + def test_bytesiowrapper_returns_correct_bytes(self): + # Test latin1, ucs-2, and ucs-4 chars + data = """a,b,c +1,2,3 +©,®,® +Look,a snake,🐍""" + with icom.get_handle(StringIO(data), "rb", is_text=False) as handles: + result = b"" + chunksize = 5 + while True: + chunk = handles.handle.read(chunksize) + # Make sure each chunk is correct amount of bytes + assert len(chunk) <= chunksize + if len(chunk) < chunksize: + # Can be less amount of bytes, but only at EOF + # which happens when read returns empty + assert len(handles.handle.read()) == 0 + result += chunk + break + result += chunk + assert result == data.encode("utf-8") + + # Test that pyarrow can handle a file opened with get_handle + @td.skip_if_no("pyarrow", min_version="0.15.0") + def test_get_handle_pyarrow_compat(self): + from pyarrow import csv + + # Test latin1, ucs-2, and ucs-4 chars + data = """a,b,c +1,2,3 +©,®,® +Look,a snake,🐍""" + expected = pd.DataFrame( + {"a": ["1", "©", "Look"], "b": ["2", "®", "a snake"], "c": ["3", "®", "🐍"]} + ) + s = StringIO(data) + with icom.get_handle(s, "rb", is_text=False) as handles: + df = csv.read_csv(handles.handle).to_pandas() + tm.assert_frame_equal(df, expected) + assert not s.closed + def test_iterator(self): with pd.read_csv(StringIO(self.data1), chunksize=1) as reader: result = pd.concat(reader, ignore_index=True) From d68aa43ef65eed66c2965e990f75bd91687b8cec Mon Sep 17 00:00:00 2001 From: Francois Dion Date: Wed, 4 Aug 2021 19:51:05 -0400 Subject: [PATCH 08/12] BUG: column names with degree sign make query fail (#42826) --- doc/source/whatsnew/v1.4.0.rst | 2 ++ pandas/core/computation/parsing.py | 1 + pandas/tests/computation/test_eval.py | 10 ++++++++++ 3 files changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 6763c3043b102..16474dd83a1f5 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -234,6 +234,8 @@ Indexing - Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`) - Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` when passing an ascending value, failed to raise or incorrectly raising ``ValueError`` (:issue:`41634`) - Bug in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`) +- Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`) +- Missing ^^^^^^^ diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 5e000116d19f2..89d1f2133f77a 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -49,6 +49,7 @@ def create_valid_python_identifier(name: str) -> str: "!": "_EXCLAMATIONMARK_", "$": "_DOLLARSIGN_", "€": "_EUROSIGN_", + "°": "_DEGREESIGN_", # Including quotes works, but there are exceptions. "'": "_SINGLEQUOTE_", '"': "_DOUBLEQUOTE_", diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index f27112dbd3956..99c3cac9ba976 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -2035,6 +2035,16 @@ def test_truediv_deprecated(engine, parser): assert match in str(m[0].message) +@pytest.mark.parametrize("column", ["Temp(°C)", "Capacitance(μF)"]) +def test_query_token(engine, column): + # See: https://github.com/pandas-dev/pandas/pull/42826 + df = DataFrame(np.random.randn(5, 2), columns=[column, "b"]) + expected = df[df[column] > 5] + query_string = f"`{column}` > 5" + result = df.query(query_string, engine=engine) + tm.assert_frame_equal(result, expected) + + def test_negate_lt_eq_le(engine, parser): df = DataFrame([[0, 10], [1, 20]], columns=["cat", "count"]) expected = df[~(df.cat > 0)] From 7d96743d067ed4cac8173d55baa048eef5f8972a Mon Sep 17 00:00:00 2001 From: Andrew Hawyrluk <50434302+ahawryluk@users.noreply.github.com> Date: Wed, 4 Aug 2021 17:52:02 -0600 Subject: [PATCH 09/12] DOC: add cookbook link about hidden sheets (#42874) --- doc/source/user_guide/cookbook.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 5f3da133d9c09..03221e71ea32a 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -1211,6 +1211,9 @@ The :ref:`Excel ` docs `Modifying formatting in XlsxWriter output `__ +`Loading only visible sheets +`__ + .. _cookbook.html: HTML From 28849e387e63a94e59e986066a13310d96397430 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Aug 2021 17:05:41 -0700 Subject: [PATCH 10/12] DEPR: unused kind arg in Index methods (#42857) --- doc/source/whatsnew/v1.4.0.rst | 2 ++ pandas/core/indexes/base.py | 20 +++++++++++++++---- pandas/core/indexes/datetimes.py | 4 +++- pandas/core/indexes/multi.py | 11 ++++++++-- pandas/core/indexes/numeric.py | 2 +- .../tests/indexes/base_class/test_indexing.py | 8 +++++--- pandas/tests/indexes/multi/test_indexing.py | 3 ++- pandas/tests/indexes/numeric/test_indexing.py | 7 +++++-- 8 files changed, 43 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 16474dd83a1f5..7395f9d2dcb9e 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -160,6 +160,8 @@ Deprecations - Deprecated treating ``numpy.datetime64`` objects as UTC times when passed to the :class:`Timestamp` constructor along with a timezone. In a future version, these will be treated as wall-times. To retain the old behavior, use ``Timestamp(dt64).tz_localize("UTC").tz_convert(tz)`` (:issue:`24559`) - Deprecated ignoring missing labels when indexing with a sequence of labels on a level of a MultiIndex (:issue:`42351`) - Creating an empty Series without a dtype will now raise a more visible ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`30017`) +- Deprecated the 'kind' argument in :meth:`Index.get_slice_bound`, :meth:`Index.slice_indexer`, :meth:`Index.slice_locs`; in a future version passing 'kind' will raise (:issue:`42857`) +- .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 54271f0f9b492..1c94baf74b60b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5839,7 +5839,7 @@ def slice_indexer( start: Hashable | None = None, end: Hashable | None = None, step: int | None = None, - kind: str_t | None = None, + kind=no_default, ) -> slice: """ Compute the slice indexer for input labels and step. @@ -5855,6 +5855,8 @@ def slice_indexer( step : int, default None kind : str, default None + .. deprecated:: 1.4.0 + Returns ------- indexer : slice @@ -5880,6 +5882,8 @@ def slice_indexer( >>> idx.slice_indexer(start='b', end=('c', 'g')) slice(1, 3, None) """ + self._deprecated_arg(kind, "kind", "slice_indexer") + start_slice, end_slice = self.slice_locs(start, end, step=step) # return a slice @@ -5928,6 +5932,8 @@ def _maybe_cast_slice_bound(self, label, side: str_t, kind=no_default): side : {'left', 'right'} kind : {'loc', 'getitem'} or None + .. deprecated:: 1.3.0 + Returns ------- label : object @@ -5962,7 +5968,7 @@ def _searchsorted_monotonic(self, label, side: str_t = "left"): raise ValueError("index must be monotonic increasing or decreasing") - def get_slice_bound(self, label, side: str_t, kind=None) -> int: + def get_slice_bound(self, label, side: str_t, kind=no_default) -> int: """ Calculate slice bound that corresponds to given label. @@ -5975,12 +5981,15 @@ def get_slice_bound(self, label, side: str_t, kind=None) -> int: side : {'left', 'right'} kind : {'loc', 'getitem'} or None + .. deprecated:: 1.4.0 + Returns ------- int Index of label. """ - assert kind in ["loc", "getitem", None] + assert kind in ["loc", "getitem", None, no_default] + self._deprecated_arg(kind, "kind", "get_slice_bound") if side not in ("left", "right"): raise ValueError( @@ -6030,7 +6039,7 @@ def get_slice_bound(self, label, side: str_t, kind=None) -> int: else: return slc - def slice_locs(self, start=None, end=None, step=None, kind=None): + def slice_locs(self, start=None, end=None, step=None, kind=no_default): """ Compute slice locations for input labels. @@ -6044,6 +6053,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): If None, defaults to 1. kind : {'loc', 'getitem'} or None + .. deprecated:: 1.4.0 + Returns ------- start, end : int @@ -6062,6 +6073,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): >>> idx.slice_locs(start='b', end='c') (1, 3) """ + self._deprecated_arg(kind, "kind", "slice_locs") inc = step is None or step >= 0 if not inc: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 9712a5d95a234..348598c1309eb 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -729,7 +729,7 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default): return self._maybe_cast_for_get_loc(label) - def slice_indexer(self, start=None, end=None, step=None, kind=None): + def slice_indexer(self, start=None, end=None, step=None, kind=lib.no_default): """ Return indexer for specified label slice. Index.slice_indexer, customized to handle time slicing. @@ -743,6 +743,8 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): value-based selection in non-monotonic cases. """ + self._deprecated_arg(kind, "kind", "slice_indexer") + # For historical reasons DatetimeIndex supports slices between two # instances of datetime.time as if it were applying a slice mask to # an array of (self.hour, self.minute, self.seconds, self.microsecond). diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b4e8a763d1210..e5aa8e95e23de 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2586,7 +2586,7 @@ def _get_indexer_level_0(self, target) -> np.ndarray: return ci.get_indexer_for(target) def get_slice_bound( - self, label: Hashable | Sequence[Hashable], side: str, kind: str | None = None + self, label: Hashable | Sequence[Hashable], side: str, kind=lib.no_default ) -> int: """ For an ordered MultiIndex, compute slice bound @@ -2601,6 +2601,8 @@ def get_slice_bound( side : {'left', 'right'} kind : {'loc', 'getitem', None} + .. deprecated:: 1.4.0 + Returns ------- int @@ -2632,11 +2634,13 @@ def get_slice_bound( MultiIndex.get_locs : Get location for a label/slice/list/mask or a sequence of such. """ + self._deprecated_arg(kind, "kind", "get_slice_bound") + if not isinstance(label, tuple): label = (label,) return self._partial_tup_index(label, side=side) - def slice_locs(self, start=None, end=None, step=None, kind=None): + def slice_locs(self, start=None, end=None, step=None, kind=lib.no_default): """ For an ordered MultiIndex, compute the slice locations for input labels. @@ -2655,6 +2659,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): Slice step kind : string, optional, defaults None + .. deprecated:: 1.4.0 + Returns ------- (start, end) : (int, int) @@ -2688,6 +2694,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): MultiIndex.get_locs : Get location for a label/slice/list/mask or a sequence of such. """ + self._deprecated_arg(kind, "kind", "slice_locs") # This function adds nothing to its parent implementation (the magic # happens in get_slice_bound method), but it adds meaningful doc. return super().slice_locs(start, end, step) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index bb9a2688f0485..d31f6d6a252f3 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -244,7 +244,7 @@ def _convert_slice_indexer(self, key: slice, kind: str): # We always treat __getitem__ slicing as label-based # translate to locations - return self.slice_indexer(key.start, key.stop, key.step, kind=kind) + return self.slice_indexer(key.start, key.stop, key.step) return super()._convert_slice_indexer(key, kind=kind) diff --git a/pandas/tests/indexes/base_class/test_indexing.py b/pandas/tests/indexes/base_class/test_indexing.py index fd04a820037b9..654f5a89f1828 100644 --- a/pandas/tests/indexes/base_class/test_indexing.py +++ b/pandas/tests/indexes/base_class/test_indexing.py @@ -10,7 +10,8 @@ class TestGetSliceBounds: @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)]) def test_get_slice_bounds_within(self, kind, side, expected): index = Index(list("abcdef")) - result = index.get_slice_bound("e", kind=kind, side=side) + with tm.assert_produces_warning(FutureWarning, match="'kind' argument"): + result = index.get_slice_bound("e", kind=kind, side=side) assert result == expected @pytest.mark.parametrize("kind", ["getitem", "loc", None]) @@ -20,12 +21,13 @@ def test_get_slice_bounds_within(self, kind, side, expected): ) def test_get_slice_bounds_outside(self, kind, side, expected, data, bound): index = Index(data) - result = index.get_slice_bound(bound, kind=kind, side=side) + with tm.assert_produces_warning(FutureWarning, match="'kind' argument"): + result = index.get_slice_bound(bound, kind=kind, side=side) assert result == expected def test_get_slice_bounds_invalid_side(self): with pytest.raises(ValueError, match="Invalid value for side kwarg"): - Index([]).get_slice_bound("a", kind=None, side="middle") + Index([]).get_slice_bound("a", side="middle") class TestGetIndexerNonUnique: diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index d2afc76076dc0..e142cbf89f1bd 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -820,7 +820,8 @@ def test_timestamp_multiindex_indexer(): def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo): # issue 19132 idx = MultiIndex.from_arrays(index_arr) - result = idx.get_slice_bound(target, side=algo, kind="loc") + with tm.assert_produces_warning(FutureWarning, match="'kind' argument"): + result = idx.get_slice_bound(target, side=algo, kind="loc") assert result == expected diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index e6b418868dbeb..8f113491dad60 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -545,7 +545,9 @@ class TestGetSliceBounds: @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)]) def test_get_slice_bounds_within(self, kind, side, expected): index = Index(range(6)) - result = index.get_slice_bound(4, kind=kind, side=side) + with tm.assert_produces_warning(FutureWarning, match="'kind' argument"): + + result = index.get_slice_bound(4, kind=kind, side=side) assert result == expected @pytest.mark.parametrize("kind", ["getitem", "loc", None]) @@ -553,5 +555,6 @@ def test_get_slice_bounds_within(self, kind, side, expected): @pytest.mark.parametrize("bound, expected", [(-1, 0), (10, 6)]) def test_get_slice_bounds_outside(self, kind, side, expected, bound): index = Index(range(6)) - result = index.get_slice_bound(bound, kind=kind, side=side) + with tm.assert_produces_warning(FutureWarning, match="'kind' argument"): + result = index.get_slice_bound(bound, kind=kind, side=side) assert result == expected From 6ef154eb7c44b8f56c8f11a324bb207363c5879e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Aug 2021 17:15:25 -0700 Subject: [PATCH 11/12] REF: date arg not reachable in DTI._maybe_cast_slice_bound (#42855) --- pandas/core/indexes/datetimes.py | 9 ++++++++- pandas/tests/indexes/datetimes/test_indexing.py | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 348598c1309eb..97c648013f9d1 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -722,7 +722,7 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default): if self._is_strictly_monotonic_decreasing and len(self) > 1: return upper if side == "left" else lower return lower if side == "left" else upper - elif isinstance(label, (self._data._recognized_scalars, date)): + elif isinstance(label, self._data._recognized_scalars): self._deprecate_mismatched_indexing(label) else: raise self._invalid_indexer("slice", label) @@ -802,6 +802,13 @@ def check_str_or_none(point): else: return indexer + @doc(Index.get_slice_bound) + def get_slice_bound(self, label, side: str, kind=None) -> int: + # GH#42855 handle date here instead of _maybe_cast_slice_bound + if isinstance(label, date) and not isinstance(label, datetime): + label = Timestamp(label).to_pydatetime() + return super().get_slice_bound(label, side=side, kind=kind) + # -------------------------------------------------------------------- @property diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index d705fa7f0ed2c..6eaf799ae2779 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -746,7 +746,7 @@ def test_get_slice_bounds_datetime_within( result = index.get_slice_bound(key, kind=kind, side=side) assert result == expected - @pytest.mark.parametrize("box", [date, datetime, Timestamp]) + @pytest.mark.parametrize("box", [datetime, Timestamp]) @pytest.mark.parametrize("kind", ["getitem", "loc", None]) @pytest.mark.parametrize("side", ["left", "right"]) @pytest.mark.parametrize("year, expected", [(1999, 0), (2020, 30)]) @@ -764,7 +764,7 @@ def test_get_slice_bounds_datetime_outside( result = index.get_slice_bound(key, kind=kind, side=side) assert result == expected - @pytest.mark.parametrize("box", [date, datetime, Timestamp]) + @pytest.mark.parametrize("box", [datetime, Timestamp]) @pytest.mark.parametrize("kind", ["getitem", "loc", None]) def test_slice_datetime_locs(self, box, kind, tz_aware_fixture): # GH 34077 From e045034e5c89b932a526d6c9e691d3031784c377 Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Thu, 5 Aug 2021 11:22:22 +0530 Subject: [PATCH 12/12] TST: raising ValueError when inserting one dataframe in another (#42831) * TST: raising ValueError when inserting one dataframe in another * added GH issue reference * rev msg * included both msgs * updated * Update test_insert.py * Update test_insert.py * Update test_insert.py --- pandas/tests/frame/indexing/test_insert.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 4f5ec8eff29a6..c2c862be42625 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -89,3 +89,13 @@ def test_insert_item_cache(self, using_array_manager): ser.values[0] = 99 assert df.iloc[0, 0] == df[0][0] + + def test_insert_frame(self): + # GH#42403 + df = DataFrame({"col1": [1, 2], "col2": [3, 4]}) + msg = ( + "Expected a 1D array, got an array with shape " + r"\(2, 2\)|Wrong number of items passed 2, placement implies 1" + ) + with pytest.raises(ValueError, match=msg): + df.insert(1, "newcol", df)