From 3ad714870791d45636252016d0704121ee323a4f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 19 Nov 2023 00:59:07 +0100 Subject: [PATCH 01/12] DEP: Deprecate Series.view --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/series.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index a33322aebab34..d46e139866ec4 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -266,6 +266,7 @@ Other Deprecations - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) +- Deprecated :meth:`Series.view` (:issue:`20251`) - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. 
(:issue:`54229`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1bbd10429ea22..c7ebf0f43df88 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -938,6 +938,11 @@ def view(self, dtype: Dtype | None = None) -> Series: 4 2 dtype: int8 """ + warnings.warn( + "Series.view is deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) # self.array instead of self._values so we piggyback on NumpyExtensionArray # implementation res_values = self.array.view(dtype) From f82034dd8119b70b3f3d312f41947bf9a02cc560 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 19 Nov 2023 01:02:27 +0100 Subject: [PATCH 02/12] DEP: Deprecate Series.view --- pandas/tests/series/methods/test_view.py | 5 +++++ pandas/tests/series/test_constructors.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_view.py b/pandas/tests/series/methods/test_view.py index a9b29c9329193..9806d33204ff3 100644 --- a/pandas/tests/series/methods/test_view.py +++ b/pandas/tests/series/methods/test_view.py @@ -9,6 +9,11 @@ ) import pandas._testing as tm +# TODO(CoW-warn) avoid warnings in the stata reader code +pytestmark = pytest.mark.filterwarnings( + "ignore:Series.view is deprecated and will be removed in a future version.:FutureWarning" # noqa: E501 +) + class TestView: def test_view_i8_to_datetimelike(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 57cc674754cc7..661742f4dc1fa 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -966,7 +966,8 @@ def test_constructor_dtype_datetime64_10(self): # GH3414 related expected = Series(pydates, dtype="datetime64[ms]") - result = Series(Series(dates).view(np.int64) / 1000000, dtype="M8[ms]") + with tm.assert_produces_warning(FutureWarning, match="view is deprecated"): + result = 
Series(Series(dates).view(np.int64) / 1000000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ms]") From e33095c8cb1900c7a2d0661ee4ef45219dd0ac2c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 19 Nov 2023 15:02:34 +0100 Subject: [PATCH 03/12] Update test_view.py --- pandas/tests/series/methods/test_view.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/methods/test_view.py b/pandas/tests/series/methods/test_view.py index 9806d33204ff3..7e0ac372cd443 100644 --- a/pandas/tests/series/methods/test_view.py +++ b/pandas/tests/series/methods/test_view.py @@ -9,7 +9,6 @@ ) import pandas._testing as tm -# TODO(CoW-warn) avoid warnings in the stata reader code pytestmark = pytest.mark.filterwarnings( "ignore:Series.view is deprecated and will be removed in a future version.:FutureWarning" # noqa: E501 ) From 38379acec695bfe6bce1f21ff0c0b8452f73665e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 19 Nov 2023 15:14:14 +0100 Subject: [PATCH 04/12] Fix dep warnings --- pandas/core/window/ewm.py | 3 +++ pandas/io/stata.py | 4 ++-- pandas/tests/copy_view/test_methods.py | 3 ++- pandas/tests/frame/indexing/test_where.py | 3 ++- pandas/tests/frame/test_constructors.py | 4 ++-- pandas/tests/generic/test_finalize.py | 5 ----- pandas/tests/groupby/test_cumulative.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 2 +- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/io/test_sql.py | 2 +- pandas/tests/test_algos.py | 4 ++++ pandas/tests/test_nanops.py | 4 ++++ 12 files changed, 23 insertions(+), 15 deletions(-) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 82b4746aa57a5..db659713c6f16 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -15,6 +15,7 @@ is_datetime64_ns_dtype, is_numeric_dtype, ) +from pandas.core.dtypes.generic import ABCSeries from 
pandas.core.dtypes.missing import isna from pandas.core import common @@ -118,6 +119,8 @@ def _calculate_deltas( np.ndarray Diff of the times divided by the half-life """ + if isinstance(times, ABCSeries): + times = times._values _times = np.asarray(times.view(np.int64), dtype=np.float64) # TODO: generalize to non-nano? _halflife = float(Timedelta(halflife).as_unit("ns")._value) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 70294e8a62cca..1eb8f531dc62a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -429,9 +429,9 @@ def parse_dates_safe( d["year"] = date_index._data.year d["month"] = date_index._data.month if days: - days_in_ns = dates.view(np.int64) - to_datetime( + days_in_ns = dates._values.view(np.int64) - to_datetime( d["year"], format="%Y" - ).view(np.int64) + )._values.view(np.int64) d["days"] = days_in_ns // NS_PER_DAY elif infer_dtype(dates, skipna=False) == "datetime": diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 61569a49863cb..4ed493bc04633 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1922,7 +1922,8 @@ def test_series_view(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2, 3]) ser_orig = ser.copy() - ser2 = ser.view() + with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + ser2 = ser.view() assert np.shares_memory(get_array(ser), get_array(ser2)) if using_copy_on_write: assert not ser2._mgr._has_no_reference(0) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 4576a86ad27cd..6b2bf211ab748 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -771,7 +771,8 @@ def test_where_datetimelike_noop(self, dtype): # GH#45135, analogue to GH#44181 for Period don't raise on no-op # For td64/dt64/dt64tz we already don't raise, but also are # checking that we don't unnecessarily upcast to object. 
- ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype) + with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype) df = ser.to_frame() mask = np.array([False, False, False]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index bf17b61b0e3f3..ee9255b732a86 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1101,8 +1101,8 @@ def test_constructor_maskedarray_nonfloat(self): mat2[0, 0] = 1 mat2[1, 2] = 2 frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) - assert 1 == frame["A"].view("i8")[1] - assert 2 == frame["C"].view("i8")[2] + assert 1 == frame["A"].astype("i8")[1] + assert 2 == frame["C"].astype("i8")[2] # masked bool promoted to object mat = ma.masked_all((2, 3), dtype=bool) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 68746b9e9a803..866e9e203ffe3 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -31,11 +31,6 @@ # - Callable: pass the constructed value with attrs set to this. 
_all_methods = [ - ( - pd.Series, - (np.array([0], dtype="float64")), - operator.methodcaller("view", "int64"), - ), (pd.Series, ([0],), operator.methodcaller("take", [])), (pd.Series, ([0],), operator.methodcaller("__getitem__", [True])), (pd.Series, ([0],), operator.methodcaller("repeat", 2)), diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py index 25534865b3486..bf572609f3d37 100644 --- a/pandas/tests/groupby/test_cumulative.py +++ b/pandas/tests/groupby/test_cumulative.py @@ -216,7 +216,7 @@ def test_cummax_i8_at_implementation_bound(): # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT # for int64 dtype GH#46382 ser = Series([pd.NaT._value + n for n in range(5)]) - df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")}) + df = DataFrame({"A": 1, "B": ser, "C": ser._values.view("M8[ns]")}) gb = df.groupby("A") res = gb.cummax() diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index be02c7f79ba01..2bfb06c53ebc2 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -717,7 +717,7 @@ def test_groupby_groups_periods(self): def test_groupby_first_datetime64(self): df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) - df[1] = df[1].view("M8[ns]") + df[1] = df[1]._values.view("M8[ns]") assert issubclass(df[1].dtype.type, np.datetime64) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7312facc44c26..411cc90ba41a7 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -168,7 +168,7 @@ def test_frame_non_unique_columns(self, orient, data): # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need # TODO: a to_epoch method would also solve; see GH 14772 - expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000 + expected.iloc[:, 0] = expected.iloc[:, 
0].astype(np.int64) // 1000000 elif orient == "split": expected = df expected.columns = ["x", "x.1"] diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9ac20774b8c93..9926da27fbae7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1527,7 +1527,7 @@ def test_api_timedelta(conn, request): result_count = df.to_sql(name="test_timedelta", con=conn) assert result_count == 2 result = sql.read_sql_query("SELECT * FROM test_timedelta", conn) - tm.assert_series_equal(result["foo"], df["foo"].view("int64")) + tm.assert_series_equal(result["foo"], df["foo"].astype("int64")) @pytest.mark.parametrize("conn", all_connectable) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 97119127b1665..501d0bfbc4a1e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -47,6 +47,10 @@ ) import pandas.core.common as com +pytestmark = pytest.mark.filterwarnings( + "ignore:Series.view is deprecated and will be removed in a future version.:FutureWarning" # noqa: E501 +) + class TestFactorize: def test_factorize_complex(self): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index a0062d2b6dd44..632d9783c7f81 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1114,12 +1114,16 @@ def test_nanmean(self, unit): expected = dti[1] for obj in [dti, DatetimeArray(dti), Series(dti)]: + if isinstance(obj, Series): + obj = obj._values result = nanops.nanmean(obj) assert result == expected dti2 = dti.insert(1, pd.NaT) for obj in [dti2, DatetimeArray(dti2), Series(dti2)]: + if isinstance(obj, Series): + obj = obj._values result = nanops.nanmean(obj) assert result == expected From 7854092711a456d6016a0a0387c85608ba7ad14e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 19 Nov 2023 17:55:41 +0100 Subject: [PATCH 05/12] Fix view --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/pandas/core/series.py b/pandas/core/series.py index c7ebf0f43df88..0e76d83c8ede4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -918,7 +918,7 @@ def view(self, dtype: Dtype | None = None) -> Series: The 8 bit signed integer representation of `-1` is `0b11111111`, but the same bytes represent 255 if read as an 8 bit unsigned integer: - >>> us = s.view('uint8') + >>> us = s.view('uint8') # doctest: +SKIP >>> us 0 254 1 255 From b3a85f30bbe1e332f20061797b8e2f878918029f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 20 Nov 2023 01:47:52 +0100 Subject: [PATCH 06/12] Remove --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0e76d83c8ede4..c7ebf0f43df88 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -918,7 +918,7 @@ def view(self, dtype: Dtype | None = None) -> Series: The 8 bit signed integer representation of `-1` is `0b11111111`, but the same bytes represent 255 if read as an 8 bit unsigned integer: - >>> us = s.view('uint8') # doctest: +SKIP + >>> us = s.view('uint8') >>> us 0 254 1 255 From e5d525c749323c9537c86bd326702a8de5caf7ff Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 20 Nov 2023 15:34:05 +0100 Subject: [PATCH 07/12] Update pandas/tests/groupby/test_timegrouper.py Co-authored-by: Joris Van den Bossche --- pandas/tests/groupby/test_timegrouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 2bfb06c53ebc2..d8b00b9b9a483 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -717,7 +717,7 @@ def test_groupby_groups_periods(self): def test_groupby_first_datetime64(self): df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) - df[1] = 
df[1]._values.view("M8[ns]") + df[1] = df[1].astype("M8[ns]") assert issubclass(df[1].dtype.type, np.datetime64) From 4e42dd99836bbcd28f6d97dd5b9e2fbb2d3f69fc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 20 Nov 2023 15:34:25 +0100 Subject: [PATCH 08/12] Update pandas/tests/series/test_constructors.py Co-authored-by: Joris Van den Bossche --- pandas/tests/series/test_constructors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 157206f912438..a898e558322ac 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -966,8 +966,7 @@ def test_constructor_dtype_datetime64_10(self): # GH3414 related expected = Series(pydates, dtype="datetime64[ms]") - with tm.assert_produces_warning(FutureWarning, match="view is deprecated"): - result = Series(Series(dates).view(np.int64) / 1000000, dtype="M8[ms]") + result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ms]") From d9d6faa2ce9288318aa66c93219c2d2c8fffa607 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 21 Nov 2023 21:04:05 +0100 Subject: [PATCH 09/12] Fix --- pandas/core/series.py | 39 ++++++-------------------------------- pandas/tests/test_algos.py | 6 +----- 2 files changed, 7 insertions(+), 38 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index c7ebf0f43df88..aeb36676e16f6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -881,6 +881,8 @@ def view(self, dtype: Dtype | None = None) -> Series: type. The new data type must preserve the same size in bytes as to not cause index misalignment. + .. 
deprecated:: 2.2.0 + Parameters ---------- dtype : data type @@ -906,42 +908,13 @@ def view(self, dtype: Dtype | None = None) -> Series: Examples -------- - >>> s = pd.Series([-2, -1, 0, 1, 2], dtype='int8') - >>> s - 0 -2 - 1 -1 - 2 0 - 3 1 - 4 2 - dtype: int8 - - The 8 bit signed integer representation of `-1` is `0b11111111`, but - the same bytes represent 255 if read as an 8 bit unsigned integer: - - >>> us = s.view('uint8') - >>> us - 0 254 - 1 255 - 2 0 - 3 1 - 4 2 - dtype: uint8 - - The views share the same underlying values: - - >>> us[0] = 128 - >>> s - 0 -128 - 1 -1 - 2 0 - 3 1 - 4 2 - dtype: int8 + Use ``astype`` to change the dtype instead. """ warnings.warn( - "Series.view is deprecated and will be removed in a future version.", + "Series.view is deprecated and will be removed in a future version. " + "Use ``astype`` as an alternative to change the dtype.", FutureWarning, - stacklevel=find_stack_level(), + stacklevel=2, ) # self.array instead of self._values so we piggyback on NumpyExtensionArray # implementation diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 501d0bfbc4a1e..35cdd57b9c806 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -47,10 +47,6 @@ ) import pandas.core.common as com -pytestmark = pytest.mark.filterwarnings( - "ignore:Series.view is deprecated and will be removed in a future version.:FutureWarning" # noqa: E501 -) - class TestFactorize: def test_factorize_complex(self): @@ -961,7 +957,7 @@ def test_isin_datetimelike_values_numeric_comps(self, dtype, dtype1): # Anything but object and we get all-False shortcut dta = date_range("2013-01-01", periods=3)._values - arr = Series(dta.view("i8")).view(dtype1)._values + arr = Series(dta.view("i8")).array.view(dtype1) comps = arr.view("i8").astype(dtype) From 81a33561a2232fcfdb49932758f96aa8fbd3dda1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 21 Nov 2023 21:05:10 +0100 Subject: [PATCH 
10/12] Merge remote-tracking branch 'upstream/main' into depview # Conflicts: # doc/source/whatsnew/v2.2.0.rst --- .github/workflows/comment-commands.yml | 2 +- .../workflows/deprecation-tracking-bot.yml | 2 +- asv_bench/benchmarks/io/csv.py | 11 + doc/source/user_guide/copy_on_write.rst | 10 + doc/source/user_guide/indexing.rst | 48 ++ pandas/_libs/tslibs/conversion.pyx | 10 +- pandas/_libs/tslibs/parsing.pxd | 4 +- pandas/_libs/tslibs/parsing.pyx | 58 +- pandas/_libs/tslibs/tzconversion.pyx | 9 +- pandas/conftest.py | 113 +--- pandas/core/arrays/datetimelike.py | 8 + pandas/core/arrays/datetimes.py | 1 + pandas/core/arrays/sparse/array.py | 3 +- pandas/core/generic.py | 17 + pandas/core/indexes/base.py | 10 + pandas/core/indexes/interval.py | 7 +- pandas/core/internals/__init__.py | 44 +- pandas/core/internals/api.py | 45 +- pandas/core/reshape/encoding.py | 12 +- pandas/core/reshape/melt.py | 58 +- pandas/core/reshape/pivot.py | 7 +- pandas/errors/__init__.py | 13 + pandas/io/parsers/arrow_parser_wrapper.py | 9 + pandas/io/parsers/readers.py | 23 +- pandas/tests/apply/conftest.py | 30 -- pandas/tests/apply/test_frame_apply.py | 35 +- pandas/tests/apply/test_invalid_arg.py | 14 +- pandas/tests/apply/test_numba.py | 5 + pandas/tests/arithmetic/conftest.py | 73 +-- pandas/tests/arithmetic/test_numeric.py | 28 + pandas/tests/arithmetic/test_period.py | 75 +++ pandas/tests/arrays/categorical/conftest.py | 6 - pandas/tests/arrays/categorical/test_take.py | 6 + pandas/tests/arrays/interval/test_interval.py | 175 ------ .../arrays/interval/test_interval_pyarrow.py | 160 ++++++ pandas/tests/arrays/sparse/test_indexing.py | 13 +- .../copy_view/test_core_functionalities.py | 3 +- pandas/tests/copy_view/test_indexing.py | 28 +- pandas/tests/copy_view/test_replace.py | 12 + pandas/tests/copy_view/test_setitem.py | 3 - pandas/tests/extension/base/constructors.py | 6 +- pandas/tests/extension/base/getitem.py | 15 +- pandas/tests/extension/base/methods.py | 11 +- 
pandas/tests/extension/base/reshaping.py | 20 +- pandas/tests/extension/base/setitem.py | 3 +- pandas/tests/extension/conftest.py | 10 + pandas/tests/extension/json/test_json.py | 16 +- pandas/tests/extension/test_arrow.py | 4 +- pandas/tests/extension/test_sparse.py | 39 +- pandas/tests/frame/conftest.py | 136 +---- pandas/tests/frame/indexing/test_indexing.py | 9 +- pandas/tests/frame/methods/test_astype.py | 14 +- pandas/tests/frame/methods/test_clip.py | 8 +- pandas/tests/frame/methods/test_set_index.py | 28 + pandas/tests/frame/methods/test_transpose.py | 10 +- pandas/tests/frame/test_arithmetic.py | 17 + pandas/tests/frame/test_reductions.py | 72 +++ .../tests/groupby/aggregate/test_aggregate.py | 4 +- pandas/tests/groupby/conftest.py | 31 +- pandas/tests/groupby/methods/test_describe.py | 4 +- pandas/tests/groupby/methods/test_nth.py | 11 +- pandas/tests/groupby/test_api.py | 14 +- pandas/tests/groupby/test_groupby.py | 49 +- pandas/tests/groupby/test_grouping.py | 38 +- pandas/tests/indexes/conftest.py | 20 - .../indexes/datetimes/test_date_range.py | 7 +- pandas/tests/indexes/interval/test_base.py | 56 -- .../indexes/interval/test_constructors.py | 17 + .../tests/indexes/interval/test_indexing.py | 63 +++ .../tests/indexes/interval/test_interval.py | 20 - .../indexes/interval/test_interval_range.py | 7 +- pandas/tests/indexes/multi/conftest.py | 50 -- pandas/tests/indexes/multi/test_duplicates.py | 19 + pandas/tests/indexes/multi/test_formats.py | 16 +- pandas/tests/indexes/multi/test_get_set.py | 3 +- pandas/tests/indexes/multi/test_names.py | 6 +- .../tests/indexes/period/test_constructors.py | 6 +- pandas/tests/indexes/test_base.py | 6 +- pandas/tests/indexes/test_setops.py | 19 + .../indexes/timedeltas/test_constructors.py | 7 +- pandas/tests/indexing/test_iloc.py | 2 +- pandas/tests/internals/test_api.py | 27 +- pandas/tests/internals/test_internals.py | 1 - pandas/tests/io/conftest.py | 28 +- pandas/tests/io/data/gbq_fake_job.txt | 1 - 
pandas/tests/io/excel/conftest.py | 41 -- pandas/tests/io/excel/test_odswriter.py | 5 +- pandas/tests/io/excel/test_openpyxl.py | 7 +- pandas/tests/io/excel/test_readers.py | 11 + pandas/tests/io/excel/test_writers.py | 30 +- pandas/tests/io/excel/test_xlsxwriter.py | 5 +- pandas/tests/io/json/conftest.py | 7 - pandas/tests/io/json/test_readlines.py | 7 + .../io/parser/common/test_common_basic.py | 26 +- .../io/parser/common/test_file_buffer_url.py | 7 +- pandas/tests/io/parser/test_c_parser_only.py | 19 - pandas/tests/io/parser/test_na_values.py | 144 ++++- pandas/tests/io/parser/test_network.py | 53 +- .../io/parser/usecols/test_usecols_basic.py | 2 +- pandas/tests/io/test_fsspec.py | 22 + pandas/tests/io/test_http_headers.py | 172 ++++++ pandas/tests/io/test_s3.py | 7 +- pandas/tests/io/test_user_agent.py | 403 -------------- pandas/tests/io/xml/conftest.py | 6 +- pandas/tests/plotting/test_misc.py | 9 + pandas/tests/resample/conftest.py | 37 -- pandas/tests/resample/test_datetime_index.py | 21 +- pandas/tests/resample/test_period_index.py | 30 ++ pandas/tests/reshape/test_pivot.py | 15 + .../tests/scalar/interval/test_arithmetic.py | 211 ++++++-- .../scalar/interval/test_constructors.py | 51 ++ pandas/tests/scalar/interval/test_contains.py | 73 +++ pandas/tests/scalar/interval/test_formats.py | 11 + pandas/tests/scalar/interval/test_interval.py | 192 ------- .../{test_ops.py => test_overlaps.py} | 52 -- pandas/tests/scalar/period/test_arithmetic.py | 486 +++++++++++++++++ pandas/tests/scalar/period/test_period.py | 498 +----------------- .../timestamp/methods/test_tz_convert.py | 6 - .../timestamp/methods/test_tz_localize.py | 38 ++ .../scalar/timestamp/test_constructors.py | 12 + .../tests/scalar/timestamp/test_timezones.py | 6 - .../series/accessors/test_dt_accessor.py | 4 +- pandas/tests/series/indexing/test_delitem.py | 15 +- pandas/tests/series/indexing/test_getitem.py | 8 +- pandas/tests/series/indexing/test_setitem.py | 46 +- 
pandas/tests/series/indexing/test_where.py | 3 + pandas/tests/series/methods/test_astype.py | 24 +- .../series/methods/test_combine_first.py | 2 +- .../series/methods/test_convert_dtypes.py | 11 + pandas/tests/series/methods/test_map.py | 28 +- pandas/tests/series/methods/test_reindex.py | 5 + pandas/tests/series/methods/test_rename.py | 17 +- pandas/tests/series/methods/test_replace.py | 10 +- .../tests/series/methods/test_reset_index.py | 24 +- pandas/tests/series/methods/test_to_csv.py | 8 +- pandas/tests/series/methods/test_update.py | 1 + pandas/tests/series/test_arithmetic.py | 23 +- pandas/tests/series/test_constructors.py | 19 +- pandas/tests/series/test_formats.py | 45 +- pandas/tests/series/test_logical_ops.py | 26 +- pandas/tests/series/test_reductions.py | 22 +- pandas/tests/strings/conftest.py | 51 -- pandas/tests/strings/test_api.py | 50 ++ .../tseries/frequencies/test_freq_code.py | 33 -- pandas/tests/tseries/offsets/conftest.py | 29 - pandas/tests/tseries/offsets/test_offsets.py | 27 + pandas/tests/tslibs/test_resolution.py | 33 ++ pandas/tests/window/test_numba.py | 15 +- pandas/tests/window/test_online.py | 16 +- pyproject.toml | 4 + scripts/validate_unwanted_patterns.py | 1 + 151 files changed, 2862 insertions(+), 2600 deletions(-) delete mode 100644 pandas/tests/apply/conftest.py create mode 100644 pandas/tests/arrays/interval/test_interval_pyarrow.py delete mode 100644 pandas/tests/indexes/interval/test_base.py delete mode 100644 pandas/tests/io/data/gbq_fake_job.txt delete mode 100644 pandas/tests/io/excel/conftest.py create mode 100644 pandas/tests/io/test_http_headers.py delete mode 100644 pandas/tests/io/test_user_agent.py create mode 100644 pandas/tests/scalar/interval/test_constructors.py create mode 100644 pandas/tests/scalar/interval/test_contains.py create mode 100644 pandas/tests/scalar/interval/test_formats.py rename pandas/tests/scalar/interval/{test_ops.py => test_overlaps.py} (54%) create mode 100644 
pandas/tests/scalar/period/test_arithmetic.py diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml index 55dd733d25b50..425abef850184 100644 --- a/.github/workflows/comment-commands.yml +++ b/.github/workflows/comment-commands.yml @@ -77,7 +77,7 @@ jobs: echo 'EOF' >> $GITHUB_ENV echo "REGEX=$REGEX" >> $GITHUB_ENV - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 env: BENCH_OUTPUT: ${{env.BENCH_OUTPUT}} REGEX: ${{env.REGEX}} diff --git a/.github/workflows/deprecation-tracking-bot.yml b/.github/workflows/deprecation-tracking-bot.yml index b3f9bcd840c68..ec71daf6f84ab 100644 --- a/.github/workflows/deprecation-tracking-bot.yml +++ b/.github/workflows/deprecation-tracking-bot.yml @@ -21,7 +21,7 @@ jobs: env: DEPRECATION_TRACKER_ISSUE: 50578 steps: - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 id: update-deprecation-issue with: script: | diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 1826291034dee..a45315f63d62e 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -621,4 +621,15 @@ def time_read_csv_index_col(self): ) +class ReadCSVCParserLowMemory: + # GH 16798 + def setup(self): + self.csv = StringIO( + "strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)]) + ) + + def peakmem_over_2gb_input(self): + read_csv(self.csv, engine="c", low_memory=False) + + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst index d0c57b56585db..fc6f62ec2a4bb 100644 --- a/doc/source/user_guide/copy_on_write.rst +++ b/doc/source/user_guide/copy_on_write.rst @@ -6,6 +6,12 @@ Copy-on-Write (CoW) ******************* +.. note:: + + Copy-on-Write will become the default in pandas 3.0. We recommend + :ref:`turning it on now <copy_on_write_enabling>` + to benefit from all improvements. + Copy-on-Write was first introduced in version 1.5.0. 
Starting from version 2.0 most of the optimizations that become possible through CoW are implemented and supported. All possible optimizations are supported starting from pandas 2.1. @@ -123,6 +129,8 @@ CoW triggers a copy when ``df`` is changed to avoid mutating ``view`` as well: df view +.. _copy_on_write_chained_assignment: + Chained Assignment ------------------ @@ -238,6 +246,8 @@ and :meth:`DataFrame.rename`. These methods return views when Copy-on-Write is enabled, which provides a significant performance improvement compared to the regular execution. +.. _copy_on_write_enabling: + How to enable CoW ----------------- diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 7b839d62ddde9..4954ee1538697 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1727,6 +1727,22 @@ You can assign a custom index to the ``index`` attribute: Returning a view versus a copy ------------------------------ +.. warning:: + + :ref:`Copy-on-Write <copy_on_write>` + will become the new default in pandas 3.0. This means that chained indexing will + never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary + anymore. + See :ref:`this section <copy_on_write_chained_assignment>` + for more context. + We recommend turning Copy-on-Write on to leverage the improvements with + + ``` + pd.options.mode.copy_on_write = True + ``` + + even before pandas 3.0 is available. + When setting values in a pandas object, care must be taken to avoid what is called ``chained indexing``. Here is an example. @@ -1765,6 +1781,22 @@ faster, and allows one to index *both* axes if so desired. Why does assignment fail when using chained indexing? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. warning:: + + :ref:`Copy-on-Write <copy_on_write>` + will become the new default in pandas 3.0. This means that chained indexing will + never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary + anymore. + See :ref:`this section <copy_on_write_chained_assignment>` + for more context. 
+ We recommend turning Copy-on-Write on to leverage the improvements with + + ``` + pd.options.mode.copy_on_write = True + ``` + + even before pandas 3.0 is available. + The problem in the previous section is just a performance issue. What's up with the ``SettingWithCopy`` warning? We don't **usually** throw warnings around when you do something that might cost a few extra milliseconds! @@ -1821,6 +1853,22 @@ Yikes! Evaluation order matters ~~~~~~~~~~~~~~~~~~~~~~~~ +.. warning:: + + :ref:`Copy-on-Write <copy_on_write>` + will become the new default in pandas 3.0. This means that chained indexing will + never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary + anymore. + See :ref:`this section <copy_on_write_chained_assignment>` + for more context. + We recommend turning Copy-on-Write on to leverage the improvements with + + ``` + pd.options.mode.copy_on_write = True + ``` + + even before pandas 3.0 is available. + When you use chained indexing, the order and type of the indexing operation partially determine whether the result is a slice into the original object, or a copy of the slice. 
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 84aceecb09a33..222ff2cde0ede 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -508,7 +508,7 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, npy_datetimestruct dts int out_local = 0, out_tzoffset = 0, string_to_dts_failed datetime dt - int64_t ival + int64_t ival, nanos = 0 NPY_DATETIMEUNIT out_bestunit, reso _TSObject obj @@ -560,10 +560,14 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, return obj dt = parse_datetime_string( - ts, dayfirst=dayfirst, yearfirst=yearfirst, out_bestunit=&out_bestunit + ts, + dayfirst=dayfirst, + yearfirst=yearfirst, + out_bestunit=&out_bestunit, + nanos=&nanos, ) reso = get_supported_reso(out_bestunit) - return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=reso) + return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso) return convert_datetime_to_tsobject(dt, tz) diff --git a/pandas/_libs/tslibs/parsing.pxd b/pandas/_libs/tslibs/parsing.pxd index 8809c81b530d0..fbe07e68f5adf 100644 --- a/pandas/_libs/tslibs/parsing.pxd +++ b/pandas/_libs/tslibs/parsing.pxd @@ -1,4 +1,5 @@ from cpython.datetime cimport datetime +from numpy cimport int64_t from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT @@ -10,5 +11,6 @@ cdef datetime parse_datetime_string( str date_string, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 4918de5497c4b..d0872a509c440 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -34,6 +34,7 @@ from numpy cimport ( PyArray_IterNew, flatiter, float64_t, + int64_t, ) cnp.import_array() @@ -272,8 +273,11 @@ def py_parse_datetime_string( # parse_datetime_string cpdef bc it has a pointer argument) cdef: NPY_DATETIMEUNIT out_bestunit + int64_t nanos - return 
parse_datetime_string(date_string, dayfirst, yearfirst, &out_bestunit) + return parse_datetime_string( + date_string, dayfirst, yearfirst, &out_bestunit, &nanos + ) cdef datetime parse_datetime_string( @@ -283,7 +287,8 @@ cdef datetime parse_datetime_string( str date_string, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ): """ Parse datetime string, only returns datetime. @@ -311,7 +316,7 @@ cdef datetime parse_datetime_string( default = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) dt = dateutil_parse(date_string, default=default, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=out_bestunit) + ignoretz=False, out_bestunit=out_bestunit, nanos=nanos) return dt dt = _parse_delimited_date(date_string, dayfirst, out_bestunit) @@ -330,7 +335,7 @@ cdef datetime parse_datetime_string( dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=out_bestunit) + ignoretz=False, out_bestunit=out_bestunit, nanos=nanos) return dt @@ -436,7 +441,7 @@ def parse_datetime_string_with_reso( parsed = dateutil_parse(date_string, _DEFAULT_DATETIME, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=&out_bestunit) + ignoretz=False, out_bestunit=&out_bestunit, nanos=NULL) reso = npy_unit_to_attrname[out_bestunit] return parsed, reso @@ -639,7 +644,8 @@ cdef datetime dateutil_parse( bint ignoretz, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ): """ lifted from dateutil to get resolution""" @@ -671,11 +677,8 @@ cdef datetime dateutil_parse( if reso is None: raise DateParseError(f"Unable to parse datetime string: {timestr}") - if reso == "microsecond": - if repl["microsecond"] == 0: - reso = "second" - elif repl["microsecond"] % 1000 == 0: - reso = "millisecond" + if reso == "microsecond" and repl["microsecond"] % 1000 
== 0: + reso = _find_subsecond_reso(timestr, nanos=nanos) try: ret = default.replace(**repl) @@ -745,6 +748,38 @@ cdef datetime dateutil_parse( return ret +cdef object _reso_pattern = re.compile(r"\d:\d{2}:\d{2}\.(?P\d+)") + +cdef _find_subsecond_reso(str timestr, int64_t* nanos): + # GH#55737 + # Check for trailing zeros in a H:M:S.f pattern + match = _reso_pattern.search(timestr) + if not match: + reso = "second" + else: + frac = match.groupdict()["frac"] + if len(frac) <= 3: + reso = "millisecond" + elif len(frac) > 6: + if frac[6:] == "0" * len(frac[6:]): + # corner case where we haven't lost any data + reso = "nanosecond" + elif len(frac) <= 9: + reso = "nanosecond" + if nanos is not NULL: + if len(frac) < 9: + frac = frac + "0" * (9 - len(frac)) + nanos[0] = int(frac[6:]) + else: + # TODO: should we warn/raise in higher-than-nano cases? + reso = "nanosecond" + if nanos is not NULL: + nanos[0] = int(frac[6:9]) + else: + reso = "microsecond" + return reso + + # ---------------------------------------------------------------------- # Parsing for type-inference @@ -916,6 +951,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: yearfirst=False, ignoretz=False, out_bestunit=&out_bestunit, + nanos=NULL, ) except (ValueError, OverflowError, InvalidOperation): # In case the datetime can't be parsed, its format cannot be guessed diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index e77a385113e93..2c4f0cd14db13 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -416,8 +416,13 @@ timedelta-like} else: delta_idx = bisect_right_i8(info.tdata, new_local, info.ntrans) - - delta_idx = delta_idx - delta_idx_offset + # Logic similar to the precompute section. 
But check the current + # delta in case we are moving between UTC+0 and non-zero timezone + if (shift_forward or shift_delta > 0) and \ + info.deltas[delta_idx - 1] >= 0: + delta_idx = delta_idx - 1 + else: + delta_idx = delta_idx - delta_idx_offset result[i] = new_local - info.deltas[delta_idx] elif fill_nonexist: result[i] = NPY_NAT diff --git a/pandas/conftest.py b/pandas/conftest.py index 7f36252a638ca..b24606f8007d2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -30,7 +30,6 @@ from decimal import Decimal import operator import os -from pathlib import Path from typing import ( TYPE_CHECKING, Callable, @@ -775,23 +774,6 @@ def series_with_simple_index(index) -> Series: return _create_series(index) -@pytest.fixture -def series_with_multilevel_index() -> Series: - """ - Fixture with a Series with a 2-level MultiIndex. - """ - arrays = [ - ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], - ["one", "two", "one", "two", "one", "two", "one", "two"], - ] - tuples = zip(*arrays) - index = MultiIndex.from_tuples(tuples) - data = np.random.default_rng(2).standard_normal(8) - ser = Series(data, index=index) - ser.iloc[3] = np.nan - return ser - - _narrow_series = { f"{dtype.__name__}-series": tm.make_rand_series(name="a", dtype=dtype) for dtype in tm.NARROW_NP_DTYPES @@ -865,35 +847,6 @@ def int_frame() -> DataFrame: return DataFrame(tm.getSeriesData()).astype("int64") -@pytest.fixture -def datetime_frame() -> DataFrame: - """ - Fixture for DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D'] - - A B C D - 2000-01-03 -1.122153 0.468535 0.122226 1.693711 - 2000-01-04 0.189378 0.486100 0.007864 -1.216052 - 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 - 2000-01-06 0.430050 0.894352 0.090719 0.036939 - 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 - 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 - 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 - ... ... ... ... ... 
- 2000-02-03 1.642618 -0.579288 0.046005 1.385249 - 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 - 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 - 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 - 2000-02-09 1.377373 0.398619 1.008453 -0.928207 - 2000-02-10 0.473194 -0.636677 0.984058 0.511519 - 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 - - [30 rows x 4 columns] - """ - return DataFrame(tm.getTimeSeriesData()) - - @pytest.fixture def float_frame() -> DataFrame: """ @@ -923,24 +876,6 @@ def float_frame() -> DataFrame: return DataFrame(tm.getSeriesData()) -@pytest.fixture -def mixed_type_frame() -> DataFrame: - """ - Fixture for DataFrame of float/int/string columns with RangeIndex - Columns are ['a', 'b', 'c', 'float32', 'int32']. - """ - return DataFrame( - { - "a": 1.0, - "b": 2, - "c": "foo", - "float32": np.array([1.0] * 10, dtype="float32"), - "int32": np.array([1] * 10, dtype="int32"), - }, - index=np.arange(10), - ) - - @pytest.fixture def rand_series_with_duplicate_datetimeindex() -> Series: """ @@ -1174,16 +1109,6 @@ def strict_data_files(pytestconfig): return pytestconfig.getoption("--no-strict-data-files") -@pytest.fixture -def tests_path() -> Path: - return Path(__file__).parent / "tests" - - -@pytest.fixture -def tests_io_data_path(tests_path) -> Path: - return tests_path / "io" / "data" - - @pytest.fixture def datapath(strict_data_files: str) -> Callable[..., str]: """ @@ -1218,14 +1143,6 @@ def deco(*args): return deco -@pytest.fixture -def iris(datapath) -> DataFrame: - """ - The iris dataset as a DataFrame. 
- """ - return pd.read_csv(datapath("io", "data", "csv", "iris.csv")) - - # ---------------------------------------------------------------- # Time zones # ---------------------------------------------------------------- @@ -1905,28 +1822,6 @@ def sort_by_key(request): return request.param -@pytest.fixture() -def fsspectest(): - pytest.importorskip("fsspec") - from fsspec import register_implementation - from fsspec.implementations.memory import MemoryFileSystem - from fsspec.registry import _registry as registry - - class TestMemoryFS(MemoryFileSystem): - protocol = "testmem" - test = [None] - - def __init__(self, **kwargs) -> None: - self.test[0] = kwargs.pop("test", None) - super().__init__(**kwargs) - - register_implementation("testmem", TestMemoryFS, clobber=True) - yield TestMemoryFS() - registry.pop("testmem", None) - TestMemoryFS.test[0] = None - TestMemoryFS.store.clear() - - @pytest.fixture( params=[ ("foo", None, None), @@ -2024,6 +1919,14 @@ def warn_copy_on_write() -> bool: ) +@pytest.fixture +def using_infer_string() -> bool: + """ + Fixture to check if infer_string is enabled. 
+ """ + return pd.options.future.infer_string + + warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] if zoneinfo is not None: warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 76dccc49d6620..f0b9219682350 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2454,6 +2454,14 @@ def validate_periods(periods: int | float | None) -> int | None: """ if periods is not None: if lib.is_float(periods): + warnings.warn( + # GH#56036 + "Non-integer 'periods' in pd.date_range, pd.timedelta_range, " + "pd.period_range, and pd.interval_range are deprecated and " + "will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) periods = int(periods) elif not lib.is_integer(periods): raise TypeError(f"periods must be a number, got {periods}") diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 34a6e118733ae..b76ad5268e76e 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2379,6 +2379,7 @@ def objects_to_datetime64( Raises ------ ValueError : if data cannot be converted to datetimes + TypeError : When a type cannot be converted to datetime """ assert errors in ["raise", "ignore", "coerce"] diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index cf349220e4ba7..5db77db2a9c66 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1086,9 +1086,10 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: ) elif self.sp_index.npoints == 0: - # Avoid taking from the empty self.sp_values + # Use the old fill_value unless we took for an index of -1 _dtype = np.result_type(self.dtype.subtype, type(fill_value)) taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype) + taken[old_fill_indices] = self.fill_value else: taken = 
self.sp_values.take(sp_indexer) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 46bcfe0a32210..b23092e6f27e1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -100,6 +100,7 @@ SettingWithCopyError, SettingWithCopyWarning, _chained_assignment_method_msg, + _chained_assignment_warning_method_msg, ) from pandas.util._decorators import ( deprecate_nonkeyword_arguments, @@ -7773,6 +7774,22 @@ def replace( ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # in non-CoW mode, chained Series access will populate the + # `_item_cache` which results in an increased ref count not below + # the threshold, while we still need to warn. We detect this case + # of a Series derived from a DataFrame through the presence of + # `_cacher` + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) if not is_bool(regex) and to_replace is not None: raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1b4e14f075f22..687d6feb74131 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5352,6 +5352,16 @@ def __getitem__(self, key): else: key = np.asarray(key, dtype=bool) + if not isinstance(self.dtype, ExtensionDtype): + if len(key) == 0 and len(key) != len(self): + warnings.warn( + "Using a boolean indexer with length 0 on an Index with " + "length greater than 0 is deprecated and will raise in a " + "future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = getitem(key) # Because we ruled out integer above, we always get an arraylike here if result.ndim > 1: diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 0982b376a27d5..4fcdb87974511 100644 --- 
a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -43,7 +43,6 @@ ) from pandas.core.dtypes.common import ( ensure_platform_int, - is_float, is_float_dtype, is_integer, is_integer_dtype, @@ -60,6 +59,7 @@ from pandas.core.dtypes.missing import is_valid_na_for_dtype from pandas.core.algorithms import unique +from pandas.core.arrays.datetimelike import validate_periods from pandas.core.arrays.interval import ( IntervalArray, _interval_shared_docs, @@ -1076,10 +1076,7 @@ def interval_range( if not _is_valid_endpoint(end): raise ValueError(f"end must be numeric or datetime-like, got {end}") - if is_float(periods): - periods = int(periods) - elif not is_integer(periods) and periods is not None: - raise TypeError(f"periods must be a number, got {periods}") + periods = validate_periods(periods) if freq is not None and not is_number(freq): try: diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 0f8cb9f053174..2eb413440ba9c 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,4 +1,4 @@ -from pandas.core.internals.api import make_block +from pandas.core.internals.api import make_block # 2023-09-18 pyarrow uses this from pandas.core.internals.array_manager import ( ArrayManager, SingleArrayManager, @@ -7,11 +7,6 @@ DataManager, SingleDataManager, ) -from pandas.core.internals.blocks import ( # io.pytables, io.packers - Block, - DatetimeTZBlock, - ExtensionBlock, -) from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( BlockManager, @@ -19,9 +14,9 @@ ) __all__ = [ - "Block", - "DatetimeTZBlock", - "ExtensionBlock", + "Block", # pylint: disable=undefined-all-variable + "DatetimeTZBlock", # pylint: disable=undefined-all-variable + "ExtensionBlock", # pylint: disable=undefined-all-variable "make_block", "DataManager", "ArrayManager", @@ -34,33 +29,54 @@ def __getattr__(name: str): + # GH#55139 import warnings - from 
pandas.util._exceptions import find_stack_level - if name == "create_block_manager_from_blocks": # GH#33892 warnings.warn( f"{name} is deprecated and will be removed in a future version. " "Use public APIs instead.", DeprecationWarning, - stacklevel=find_stack_level(), + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, ) from pandas.core.internals.managers import create_block_manager_from_blocks return create_block_manager_from_blocks - if name in ["NumericBlock", "ObjectBlock"]: + if name in [ + "NumericBlock", + "ObjectBlock", + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + ]: warnings.warn( f"{name} is deprecated and will be removed in a future version. " "Use public APIs instead.", DeprecationWarning, - stacklevel=find_stack_level(), + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, ) if name == "NumericBlock": from pandas.core.internals.blocks import NumericBlock return NumericBlock + elif name == "DatetimeTZBlock": + from pandas.core.internals.blocks import DatetimeTZBlock + + return DatetimeTZBlock + elif name == "ExtensionBlock": + from pandas.core.internals.blocks import ExtensionBlock + + return ExtensionBlock + elif name == "Block": + from pandas.core.internals.blocks import Block + + return Block else: from pandas.core.internals.blocks import ObjectBlock diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index a3fd77fc8d9ea..b0b3937ca47ea 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -23,9 +23,6 @@ from pandas.core.arrays import DatetimeArray from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( - Block, - DatetimeTZBlock, - ExtensionBlock, check_ndim, ensure_block_shape, extract_pandas_array, @@ -36,6 +33,8 @@ if TYPE_CHECKING: from pandas._typing import Dtype + from pandas.core.internals.blocks import 
Block + def make_block( values, placement, klass=None, ndim=None, dtype: Dtype | None = None @@ -56,6 +55,11 @@ def make_block( values, dtype = extract_pandas_array(values, dtype, ndim) + from pandas.core.internals.blocks import ( + DatetimeTZBlock, + ExtensionBlock, + ) + if klass is ExtensionBlock and isinstance(values.dtype, PeriodDtype): # GH-44681 changed PeriodArray to be stored in the 2D # NDArrayBackedExtensionBlock instead of ExtensionBlock @@ -108,21 +112,44 @@ def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int def __getattr__(name: str): + # GH#55139 import warnings - from pandas.util._exceptions import find_stack_level - - if name == "create_block_manager_from_blocks": + if name in [ + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + "create_block_manager_from_blocks", + ]: # GH#33892 warnings.warn( f"{name} is deprecated and will be removed in a future version. " "Use public APIs instead.", DeprecationWarning, - stacklevel=find_stack_level(), + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, ) - from pandas.core.internals.managers import create_block_manager_from_blocks - return create_block_manager_from_blocks + if name == "create_block_manager_from_blocks": + from pandas.core.internals.managers import create_block_manager_from_blocks + + return create_block_manager_from_blocks + + elif name == "Block": + from pandas.core.internals.blocks import Block + + return Block + + elif name == "DatetimeTZBlock": + from pandas.core.internals.blocks import DatetimeTZBlock + + return DatetimeTZBlock + + elif name == "ExtensionBlock": + from pandas.core.internals.blocks import ExtensionBlock + + return ExtensionBlock raise AttributeError( f"module 'pandas.core.internals.api' has no attribute '{name}'" diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 9ebce3a71c966..6963bf677bcfb 100644 --- 
a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -321,13 +321,15 @@ def get_empty_frame(data) -> DataFrame: return concat(sparse_series, axis=1, copy=False) else: - # take on axis=1 + transpose to ensure ndarray layout is column-major - eye_dtype: NpDtype + # ensure ndarray layout is column-major + shape = len(codes), number_of_cols + dummy_dtype: NpDtype if isinstance(_dtype, np.dtype): - eye_dtype = _dtype + dummy_dtype = _dtype else: - eye_dtype = np.bool_ - dummy_mat = np.eye(number_of_cols, dtype=eye_dtype).take(codes, axis=1).T + dummy_dtype = np.bool_ + dummy_mat = np.zeros(shape=shape, dtype=dummy_dtype, order="F") + dummy_mat[np.arange(len(codes)), codes] = 1 if not dummy_na: # reset NaN GH4446 diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index e333d263a6b7a..bb1cd0d738dac 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -12,7 +12,6 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos -from pandas.core.arrays import Categorical from pandas.core.indexes.api import MultiIndex from pandas.core.reshape.concat import concat from pandas.core.reshape.util import tile_compat @@ -139,7 +138,7 @@ def melt( return result -def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame: +def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame: """ Reshape wide-format data to long. Generalized inverse of DataFrame.pivot. 
@@ -192,30 +191,20 @@ def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame: 2 Red Sox 2008 545 3 Yankees 2008 526 """ - if isinstance(groups, dict): - keys = list(groups.keys()) - values = list(groups.values()) - else: - keys, values = zip(*groups) - - all_cols = list(set.union(*(set(x) for x in values))) - id_cols = list(data.columns.difference(all_cols)) - - K = len(values[0]) - - for seq in values: - if len(seq) != K: - raise ValueError("All column lists must be same length") - mdata = {} pivot_cols = [] - - for target, names in zip(keys, values): + all_cols: set[Hashable] = set() + K = len(next(iter(groups.values()))) + for target, names in groups.items(): + if len(names) != K: + raise ValueError("All column lists must be same length") to_concat = [data[col]._values for col in names] mdata[target] = concat_compat(to_concat) pivot_cols.append(target) + all_cols = all_cols.union(names) + id_cols = list(data.columns.difference(all_cols)) for col in id_cols: mdata[col] = np.tile(data[col]._values, K) @@ -467,10 +456,10 @@ def wide_to_long( two 2.9 """ - def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]: + def get_var_names(df, stub: str, sep: str, suffix: str): regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$" pattern = re.compile(regex) - return [col for col in df.columns if pattern.match(col)] + return df.columns[df.columns.str.match(pattern)] def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( @@ -480,7 +469,6 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): value_name=stub.rstrip(sep), var_name=j, ) - newdf[j] = Categorical(newdf[j]) newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True) # GH17627 Cast numerics suffixes to int/float @@ -497,7 +485,7 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): else: stubnames = list(stubnames) - if any(col in stubnames for col in df.columns): + if df.columns.isin(stubnames).any(): raise ValueError("stubname can't be 
identical to a column name") if not is_list_like(i): @@ -508,18 +496,18 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): if df[i].duplicated().any(): raise ValueError("the id variables need to uniquely identify each row") - value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames] - - value_vars_flattened = [e for sublist in value_vars for e in sublist] - id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) + _melted = [] + value_vars_flattened = [] + for stub in stubnames: + value_var = get_var_names(df, stub, sep, suffix) + value_vars_flattened.extend(value_var) + _melted.append(melt_stub(df, stub, i, j, value_var, sep)) - _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] - melted = _melted[0].join(_melted[1:], how="outer") + melted = concat(_melted, axis=1) + id_vars = df.columns.difference(value_vars_flattened) + new = df[id_vars] if len(i) == 1: - new = df[id_vars].set_index(i).join(melted) - return new - - new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) - - return new + return new.set_index(i).join(melted) + else: + return new.merge(melted.reset_index(), on=i).set_index(i + [j]) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 79354fdd12a2d..c39fbfe6b6d33 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -421,9 +421,10 @@ def _all_key(key): row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) row_margin = row_margin.stack(future_stack=True) - # slight hack - new_order = [len(cols)] + list(range(len(cols))) - row_margin.index = row_margin.index.reorder_levels(new_order) + # GH#26568. 
Use names instead of indices in case of numeric names + new_order_indices = [len(cols)] + list(range(len(cols))) + new_order_names = [row_margin.index.names[i] for i in new_order_indices] + row_margin.index = row_margin.index.reorder_levels(new_order_names) else: row_margin = data._constructor_sliced(np.nan, index=result.columns) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 09a612eca0529..e2aa9010dc109 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -503,6 +503,19 @@ class ChainedAssignmentError(Warning): ) +_chained_assignment_warning_method_msg = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment using an inplace method.\n" + "The behavior will change in pandas 3.0. This inplace method will " + "never work because the intermediate object on which we are setting " + "values always behaves as a copy.\n\n" + "For example, when doing 'df[col].method(value, inplace=True)', try " + "using 'df.method({col: value}, inplace=True)' or " + "df[col] = df[col].method(value) instead, to perform " + "the operation inplace on the original object.\n\n" +) + + class NumExprClobberingError(NameError): """ Exception raised when trying to use a built-in numexpr name as a variable name. 
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 2dc88a5701033..5786073c9d9cc 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -250,6 +250,15 @@ def read(self) -> DataFrame: include = self.convert_options.get("include_columns", None) if include is not None: self._validate_usecols(include) + + nulls = self.convert_options.get("null_values", set()) + if not lib.is_list_like(nulls) or not all( + isinstance(x, str) for x in nulls + ): + raise TypeError( + "The 'pyarrow' engine requires all na_values to be strings" + ) + raise try: diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 83d75508920a4..66990de6d3b89 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1716,7 +1716,10 @@ def _clean_options( # Converting values to NA keep_default_na = options["keep_default_na"] - na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) + floatify = engine != "pyarrow" + na_values, na_fvalues = _clean_na_values( + na_values, keep_default_na, floatify=floatify + ) # handle skiprows; this is internally handled by the # c-engine, so only need for python and pyarrow parsers @@ -1928,7 +1931,7 @@ def TextParser(*args, **kwds) -> TextFileReader: return TextFileReader(*args, **kwds) -def _clean_na_values(na_values, keep_default_na: bool = True): +def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True): na_fvalues: set | dict if na_values is None: if keep_default_na: @@ -1956,7 +1959,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True): else: if not is_list_like(na_values): na_values = [na_values] - na_values = _stringify_na_values(na_values) + na_values = _stringify_na_values(na_values, floatify) if keep_default_na: na_values = na_values | STR_NA_VALUES @@ -1978,7 +1981,7 @@ def _floatify_na_values(na_values): return result -def _stringify_na_values(na_values): +def 
_stringify_na_values(na_values, floatify: bool): """return a stringified and numeric for these values""" result: list[str | float] = [] for x in na_values: @@ -1993,13 +1996,15 @@ def _stringify_na_values(na_values): result.append(f"{v}.0") result.append(str(v)) - result.append(v) - except (TypeError, ValueError, OverflowError): - pass - try: - result.append(int(x)) + if floatify: + result.append(v) except (TypeError, ValueError, OverflowError): pass + if floatify: + try: + result.append(int(x)) + except (TypeError, ValueError, OverflowError): + pass return set(result) diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py deleted file mode 100644 index acccdd845b53c..0000000000000 --- a/pandas/tests/apply/conftest.py +++ /dev/null @@ -1,30 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame - - -@pytest.fixture -def int_frame_const_col(): - """ - Fixture for DataFrame of ints which are constant per column - - Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] - """ - df = DataFrame( - np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, - columns=["A", "B", "C"], - ) - return df - - -@pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)]) -def engine(request): - if request.param == "numba": - pytest.importorskip("numba") - return request.param - - -@pytest.fixture(params=[0, 1]) -def apply_axis(request): - return request.param diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 24f8a99235b70..2d7549e09a986 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -18,6 +18,27 @@ from pandas.tests.frame.common import zip_frames +@pytest.fixture +def int_frame_const_col(): + """ + Fixture for DataFrame of ints which are constant per column + + Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] + """ + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, 
-1) + 1, + columns=["A", "B", "C"], + ) + return df + + +@pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)]) +def engine(request): + if request.param == "numba": + pytest.importorskip("numba") + return request.param + + def test_apply(float_frame, engine, request): if engine == "numba": mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet") @@ -269,7 +290,7 @@ def test_apply_raw_float_frame_no_reduction(float_frame, engine): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_mixed_type_frame(mixed_type_frame, axis, engine): +def test_apply_raw_mixed_type_frame(axis, engine): if engine == "numba": pytest.skip("isinstance check doesn't work with numba") @@ -278,7 +299,17 @@ def _assert_raw(x): assert x.ndim == 1 # Mixed dtype (GH-32423) - mixed_type_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) + df.apply(_assert_raw, axis=axis, engine=engine, raw=True) def test_apply_axis1(float_frame): diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 44829c598253d..9f5157181843e 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -24,9 +24,12 @@ @pytest.mark.parametrize("result_type", ["foo", 1]) -def test_result_type_error(result_type, int_frame_const_col): +def test_result_type_error(result_type): # allowed result_type - df = int_frame_const_col + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) msg = ( "invalid value for result_type, must be one of " @@ -282,8 +285,11 @@ def test_transform_none_to_type(): lambda x: Series([1, 2]), ], ) -def test_apply_broadcast_error(int_frame_const_col, func): - df = int_frame_const_col +def 
test_apply_broadcast_error(func): + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) # > 1 ndim msg = "too many dims to broadcast|cannot broadcast result" diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 3924d8e74e156..ee239568d057d 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -12,6 +12,11 @@ pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu] +@pytest.fixture(params=[0, 1]) +def apply_axis(request): + return request.param + + def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x result = float_frame.apply(func, engine="numba", axis=apply_axis) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index f77b81574e1c1..c7703b34a5e38 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -2,11 +2,7 @@ import pytest import pandas as pd -from pandas import ( - Index, - RangeIndex, -) -import pandas._testing as tm +from pandas import Index @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) @@ -63,27 +59,6 @@ def zero(request): return request.param -# ------------------------------------------------------------------ -# Vector Fixtures - - -@pytest.fixture( - params=[ - # TODO: add more dtypes here - Index(np.arange(5, dtype="float64")), - Index(np.arange(5, dtype="int64")), - Index(np.arange(5, dtype="uint64")), - RangeIndex(5), - ], - ids=lambda x: type(x).__name__, -) -def numeric_idx(request): - """ - Several types of numeric-dtypes Index objects - """ - return request.param - - # ------------------------------------------------------------------ # Scalar Fixtures @@ -148,22 +123,6 @@ def two_hours(request): ] -@pytest.fixture( - params=[ - pd.Timedelta(minutes=30).to_pytimedelta(), - np.timedelta64(30, "s"), - pd.Timedelta(seconds=30), - ] - + _common_mismatch -) -def not_hourly(request): - """ - Several timedelta-like 
and DateOffset instances that are _not_ - compatible with Hourly frequencies. - """ - return request.param - - @pytest.fixture( params=[ np.timedelta64(4, "h"), @@ -178,33 +137,3 @@ def not_daily(request): compatible with Daily frequencies. """ return request.param - - -@pytest.fixture( - params=[ - np.timedelta64(365, "D"), - pd.Timedelta(days=365).to_pytimedelta(), - pd.Timedelta(days=365), - ] - + _common_mismatch -) -def mismatched_freq(request): - """ - Several timedelta-like and DateOffset instances that are _not_ - compatible with Monthly or Annual frequencies. - """ - return request.param - - -# ------------------------------------------------------------------ - - -@pytest.fixture( - params=[Index, pd.Series, tm.to_array, np.array, list], ids=lambda x: x.__name__ -) -def box_1d_array(request): - """ - Fixture to test behavior for Index, Series, tm.to_array, numpy Array and list - classes - """ - return request.param diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index c2fba3c775de9..f89711c0edee7 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -44,6 +44,34 @@ def box_pandas_1d_array(request): return request.param +@pytest.fixture( + params=[ + # TODO: add more dtypes here + Index(np.arange(5, dtype="float64")), + Index(np.arange(5, dtype="int64")), + Index(np.arange(5, dtype="uint64")), + RangeIndex(5), + ], + ids=lambda x: type(x).__name__, +) +def numeric_idx(request): + """ + Several types of numeric-dtypes Index objects + """ + return request.param + + +@pytest.fixture( + params=[Index, Series, tm.to_array, np.array, list], ids=lambda x: x.__name__ +) +def box_1d_array(request): + """ + Fixture to test behavior for Index, Series, tm.to_array, numpy Array and list + classes + """ + return request.param + + def adjust_negative_zero(zero, expected): """ Helper to adjust the expected result if we are dividing by -0.0 diff --git 
a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 5af63258921ed..88c633f5e747f 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -31,6 +31,45 @@ get_upcast_box, ) +_common_mismatch = [ + pd.offsets.YearBegin(2), + pd.offsets.MonthBegin(1), + pd.offsets.Minute(), +] + + +@pytest.fixture( + params=[ + Timedelta(minutes=30).to_pytimedelta(), + np.timedelta64(30, "s"), + Timedelta(seconds=30), + ] + + _common_mismatch +) +def not_hourly(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Hourly frequencies. + """ + return request.param + + +@pytest.fixture( + params=[ + np.timedelta64(365, "D"), + Timedelta(days=365).to_pytimedelta(), + Timedelta(days=365), + ] + + _common_mismatch +) +def mismatched_freq(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Monthly or Annual frequencies. + """ + return request.param + + # ------------------------------------------------------------------ # Comparisons @@ -1310,6 +1349,42 @@ def test_parr_add_sub_object_array(self): expected = PeriodIndex(["2000-12-30"] * 3, freq="D")._data.astype(object) tm.assert_equal(result, expected) + def test_period_add_timestamp_raises(self, box_with_array): + # GH#17983 + ts = Timestamp("2017") + per = Period("2017", freq="M") + + arr = pd.Index([per], dtype="Period[M]") + arr = tm.box_expected(arr, box_with_array) + + msg = "cannot add PeriodArray and Timestamp" + with pytest.raises(TypeError, match=msg): + arr + ts + with pytest.raises(TypeError, match=msg): + ts + arr + msg = "cannot add PeriodArray and DatetimeArray" + with pytest.raises(TypeError, match=msg): + arr + Series([ts]) + with pytest.raises(TypeError, match=msg): + Series([ts]) + arr + with pytest.raises(TypeError, match=msg): + arr + pd.Index([ts]) + with pytest.raises(TypeError, match=msg): + pd.Index([ts]) + arr + + if box_with_array is 
pd.DataFrame: + msg = "cannot add PeriodArray and DatetimeArray" + else: + msg = r"unsupported operand type\(s\) for \+: 'Period' and 'DatetimeArray" + with pytest.raises(TypeError, match=msg): + arr + pd.DataFrame([ts]) + if box_with_array is pd.DataFrame: + msg = "cannot add PeriodArray and DatetimeArray" + else: + msg = r"unsupported operand type\(s\) for \+: 'DatetimeArray' and 'Period'" + with pytest.raises(TypeError, match=msg): + pd.DataFrame([ts]) + arr + class TestPeriodSeriesArithmetic: def test_parr_add_timedeltalike_scalar(self, three_days, box_with_array): diff --git a/pandas/tests/arrays/categorical/conftest.py b/pandas/tests/arrays/categorical/conftest.py index d5b49e3e5e8c8..37249210f28f4 100644 --- a/pandas/tests/arrays/categorical/conftest.py +++ b/pandas/tests/arrays/categorical/conftest.py @@ -3,12 +3,6 @@ from pandas import Categorical -@pytest.fixture(params=[True, False]) -def allow_fill(request): - """Boolean 'allow_fill' parameter for Categorical.take""" - return request.param - - @pytest.fixture def factor(): """Fixture returning a Categorical object""" diff --git a/pandas/tests/arrays/categorical/test_take.py b/pandas/tests/arrays/categorical/test_take.py index fb79fe4923522..373f1b30a13c2 100644 --- a/pandas/tests/arrays/categorical/test_take.py +++ b/pandas/tests/arrays/categorical/test_take.py @@ -5,6 +5,12 @@ import pandas._testing as tm +@pytest.fixture(params=[True, False]) +def allow_fill(request): + """Boolean 'allow_fill' parameter for Categorical.take""" + return request.param + + class TestTake: # https://github.com/pandas-dev/pandas/issues/20664 diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 024721896cc58..be4b2c3e7e74c 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -229,178 +229,3 @@ def test_min_max(self, left_right_dtypes, index_or_series_or_array): res = arr_na.max(skipna=True) assert res == 
MAX assert type(res) == type(MAX) - - -# ---------------------------------------------------------------------------- -# Arrow interaction - - -def test_arrow_extension_type(): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - p1 = ArrowIntervalType(pa.int64(), "left") - p2 = ArrowIntervalType(pa.int64(), "left") - p3 = ArrowIntervalType(pa.int64(), "right") - - assert p1.closed == "left" - assert p1 == p2 - assert p1 != p3 - assert hash(p1) == hash(p2) - assert hash(p1) != hash(p3) - - -def test_arrow_array(): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - intervals = pd.interval_range(1, 5, freq=1).array - - result = pa.array(intervals) - assert isinstance(result.type, ArrowIntervalType) - assert result.type.closed == intervals.closed - assert result.type.subtype == pa.int64() - assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) - assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) - - expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) - assert result.storage.equals(expected) - - # convert to its storage type - result = pa.array(intervals, type=expected.type) - assert result.equals(expected) - - # unsupported conversions - with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): - pa.array(intervals, type="float64") - - with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): - pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) - - -def test_arrow_array_missing(): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) - arr[1] = None - - result = pa.array(arr) - assert isinstance(result.type, ArrowIntervalType) - assert result.type.closed == arr.closed - assert 
result.type.subtype == pa.float64() - - # fields have missing values (not NaN) - left = pa.array([0.0, None, 2.0], type="float64") - right = pa.array([1.0, None, 3.0], type="float64") - assert result.storage.field("left").equals(left) - assert result.storage.field("right").equals(right) - - # structarray itself also has missing values on the array level - vals = [ - {"left": 0.0, "right": 1.0}, - {"left": None, "right": None}, - {"left": 2.0, "right": 3.0}, - ] - expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) - assert result.storage.equals(expected) - - -@pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) -@pytest.mark.parametrize( - "breaks", - [[0.0, 1.0, 2.0, 3.0], date_range("2017", periods=4, freq="D")], - ids=["float", "datetime64[ns]"], -) -def test_arrow_table_roundtrip(breaks): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - arr = IntervalArray.from_breaks(breaks) - arr[1] = None - df = pd.DataFrame({"a": arr}) - - table = pa.table(df) - assert isinstance(table.field("a").type, ArrowIntervalType) - result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.IntervalDtype) - tm.assert_frame_equal(result, df) - - table2 = pa.concat_tables([table, table]) - result = table2.to_pandas() - expected = pd.concat([df, df], ignore_index=True) - tm.assert_frame_equal(result, expected) - - # GH-41040 - table = pa.table( - [pa.chunked_array([], type=table.column(0).type)], schema=table.schema - ) - result = table.to_pandas() - tm.assert_frame_equal(result, expected[0:0]) - - -@pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) -@pytest.mark.parametrize( - "breaks", - [[0.0, 1.0, 2.0, 3.0], date_range("2017", periods=4, freq="D")], - ids=["float", "datetime64[ns]"], -) -def test_arrow_table_roundtrip_without_metadata(breaks): - pa = pytest.importorskip("pyarrow") - - 
arr = IntervalArray.from_breaks(breaks) - arr[1] = None - df = pd.DataFrame({"a": arr}) - - table = pa.table(df) - # remove the metadata - table = table.replace_schema_metadata() - assert table.schema.metadata is None - - result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.IntervalDtype) - tm.assert_frame_equal(result, df) - - -def test_from_arrow_from_raw_struct_array(): - # in case pyarrow lost the Interval extension type (eg on parquet roundtrip - # with datetime64[ns] subtype, see GH-45881), still allow conversion - # from arrow to IntervalArray - pa = pytest.importorskip("pyarrow") - - arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}]) - dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither") - - result = dtype.__from_arrow__(arr) - expected = IntervalArray.from_breaks( - np.array([0, 1, 2], dtype="int64"), closed="neither" - ) - tm.assert_extension_array_equal(result, expected) - - result = dtype.__from_arrow__(pa.chunked_array([arr])) - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize("timezone", ["UTC", "US/Pacific", "GMT"]) -def test_interval_index_subtype(timezone, inclusive_endpoints_fixture): - # GH 46999 - dates = date_range("2022", periods=3, tz=timezone) - dtype = f"interval[datetime64[ns, {timezone}], {inclusive_endpoints_fixture}]" - result = IntervalIndex.from_arrays( - ["2022-01-01", "2022-01-02"], - ["2022-01-02", "2022-01-03"], - closed=inclusive_endpoints_fixture, - dtype=dtype, - ) - expected = IntervalIndex.from_arrays( - dates[:-1], dates[1:], closed=inclusive_endpoints_fixture - ) - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py new file mode 100644 index 0000000000000..ef8701be81e2b --- /dev/null +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -0,0 +1,160 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as 
tm +from pandas.core.arrays import IntervalArray + + +def test_arrow_extension_type(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + p1 = ArrowIntervalType(pa.int64(), "left") + p2 = ArrowIntervalType(pa.int64(), "left") + p3 = ArrowIntervalType(pa.int64(), "right") + + assert p1.closed == "left" + assert p1 == p2 + assert p1 != p3 + assert hash(p1) == hash(p2) + assert hash(p1) != hash(p3) + + +def test_arrow_array(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + intervals = pd.interval_range(1, 5, freq=1).array + + result = pa.array(intervals) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == intervals.closed + assert result.type.subtype == pa.int64() + assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) + + expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(intervals, type=expected.type) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): + pa.array(intervals, type="float64") + + with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): + pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) + + +def test_arrow_array_missing(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) + arr[1] = None + + result = pa.array(arr) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == arr.closed + assert result.type.subtype == pa.float64() + + # fields have missing values (not NaN) + left = pa.array([0.0, 
None, 2.0], type="float64") + right = pa.array([1.0, None, 3.0], type="float64") + assert result.storage.field("left").equals(left) + assert result.storage.field("right").equals(right) + + # structarray itself also has missing values on the array level + vals = [ + {"left": 0.0, "right": 1.0}, + {"left": None, "right": None}, + {"left": 2.0, "right": 3.0}, + ] + expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) + assert result.storage.equals(expected) + + +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +@pytest.mark.parametrize( + "breaks", + [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")], + ids=["float", "datetime64[ns]"], +) +def test_arrow_table_roundtrip(breaks): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowIntervalType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) + + # GH#41040 + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + result = table.to_pandas() + tm.assert_frame_equal(result, expected[0:0]) + + +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +@pytest.mark.parametrize( + "breaks", + [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")], + ids=["float", "datetime64[ns]"], +) +def test_arrow_table_roundtrip_without_metadata(breaks): + pa = pytest.importorskip("pyarrow") + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = 
pa.table(df) + # remove the metadata + table = table.replace_schema_metadata() + assert table.schema.metadata is None + + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + +def test_from_arrow_from_raw_struct_array(): + # in case pyarrow lost the Interval extension type (eg on parquet roundtrip + # with datetime64[ns] subtype, see GH-45881), still allow conversion + # from arrow to IntervalArray + pa = pytest.importorskip("pyarrow") + + arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}]) + dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither") + + result = dtype.__from_arrow__(arr) + expected = IntervalArray.from_breaks( + np.array([0, 1, 2], dtype="int64"), closed="neither" + ) + tm.assert_extension_array_equal(result, expected) + + result = dtype.__from_arrow__(pa.chunked_array([arr])) + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/sparse/test_indexing.py b/pandas/tests/arrays/sparse/test_indexing.py index d63d0fb07b404..60029ac06ddb4 100644 --- a/pandas/tests/arrays/sparse/test_indexing.py +++ b/pandas/tests/arrays/sparse/test_indexing.py @@ -166,9 +166,16 @@ def test_take(self, arr_data, arr): tm.assert_sp_array_equal(arr.take([0, 1, 2]), exp) def test_take_all_empty(self): - a = pd.array([0, 0], dtype=SparseDtype("int64")) - result = a.take([0, 1], allow_fill=True, fill_value=np.nan) - tm.assert_sp_array_equal(a, result) + sparse = pd.array([0, 0], dtype=SparseDtype("int64")) + result = sparse.take([0, 1], allow_fill=True, fill_value=np.nan) + tm.assert_sp_array_equal(sparse, result) + + def test_take_different_fill_value(self): + # Take with a different fill value shouldn't overwrite the original + sparse = pd.array([0.0], dtype=SparseDtype("float64", fill_value=0.0)) + result = sparse.take([0, -1], allow_fill=True, fill_value=np.nan) + expected = pd.array([0, np.nan], dtype=sparse.dtype) + tm.assert_sp_array_equal(expected, 
result) def test_take_fill_value(self): data = np.array([1, np.nan, 0, 3, 0]) diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py index 97d77ef33d849..8dc80c5cc0e0e 100644 --- a/pandas/tests/copy_view/test_core_functionalities.py +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -51,7 +51,8 @@ def test_setitem_with_view_invalidated_does_not_copy( df["b"] = 100 arr = get_array(df, "a") view = None # noqa: F841 - # TODO(CoW-warn) false positive? + # TODO(CoW-warn) false positive? -> block gets split because of `df["b"] = 100` + # which introduces additional refs, even when those of `view` go out of scopes with tm.assert_cow_warning(warn_copy_on_write): df.iloc[0, 0] = 100 if using_copy_on_write: diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 5a0ebff64a803..2e623f885b648 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -163,7 +163,6 @@ def test_subset_column_slice( subset.iloc[0, 0] = 0 assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b")) elif warn_copy_on_write: - # TODO(CoW-warn) should warn with tm.assert_cow_warning(single_block): subset.iloc[0, 0] = 0 else: @@ -334,7 +333,6 @@ def test_subset_set_with_row_indexer( ): pytest.skip("setitem with labels selects on columns") - # TODO(CoW-warn) should warn if using_copy_on_write: indexer_si(subset)[indexer] = 0 elif warn_copy_on_write: @@ -369,7 +367,8 @@ def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write): mask = subset > 3 - # TODO(CoW-warn) should warn + # TODO(CoW-warn) should warn -> mask is a DataFrame, which ends up going through + # DataFrame._where(..., inplace=True) if using_copy_on_write or warn_copy_on_write: subset[mask] = 0 else: @@ -403,7 +402,6 @@ def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write): else: arr = pd.array([10, 11], dtype="Int64") - # 
TODO(CoW-warn) should warn if using_copy_on_write or warn_copy_on_write: subset["a"] = arr else: @@ -512,7 +510,6 @@ def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dt df_orig = df.copy() subset = df[1:3] - # TODO(CoW-warn) should warn if using_copy_on_write or warn_copy_on_write: subset[["a", "c"]] = 0 else: @@ -877,6 +874,8 @@ def test_series_subset_set_with_indexer( ) if warn_copy_on_write: # TODO(CoW-warn) should also warn for setting with mask + # -> Series.__setitem__ with boolean mask ends up using Series._set_values + # or Series._where depending on value being set with tm.assert_cow_warning( not is_mask, raise_on_extra_warnings=warn is not None ): @@ -1006,6 +1005,7 @@ def test_column_as_series_set_with_upcast( s[0] = "foo" expected = Series([1, 2, 3], name="a") elif using_copy_on_write or warn_copy_on_write or using_array_manager: + # TODO(CoW-warn) assert the FutureWarning for CoW is also raised with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): s[0] = "foo" expected = Series(["foo", 2, 3], dtype=object, name="a") @@ -1130,6 +1130,7 @@ def test_set_value_copy_only_necessary_column( view = df[:] if val == "a" and indexer[0] != slice(None): + # TODO(CoW-warn) assert the FutureWarning for CoW is also raised with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype is deprecated" ): @@ -1154,6 +1155,8 @@ def test_series_midx_slice(using_copy_on_write): ser = Series([1, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]])) result = ser[1] assert np.shares_memory(get_array(ser), get_array(result)) + # TODO(CoW-warn) should warn -> reference is only tracked in CoW mode, so + # warning is not triggered result.iloc[0] = 100 if using_copy_on_write: expected = Series( @@ -1162,7 +1165,9 @@ def test_series_midx_slice(using_copy_on_write): tm.assert_series_equal(ser, expected) -def test_getitem_midx_slice(using_copy_on_write, using_array_manager): +def 
test_getitem_midx_slice( + using_copy_on_write, warn_copy_on_write, using_array_manager +): df = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2}) df_orig = df.copy() new_df = df[("a",)] @@ -1175,6 +1180,15 @@ def test_getitem_midx_slice(using_copy_on_write, using_array_manager): if using_copy_on_write: new_df.iloc[0, 0] = 100 tm.assert_frame_equal(df_orig, df) + else: + if warn_copy_on_write: + with tm.assert_cow_warning(): + new_df.iloc[0, 0] = 100 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + new_df.iloc[0, 0] = 100 + assert df.iloc[0, 0] == 100 def test_series_midx_tuples_slice(using_copy_on_write): @@ -1184,6 +1198,8 @@ def test_series_midx_tuples_slice(using_copy_on_write): ) result = ser[(1, 2)] assert np.shares_memory(get_array(ser), get_array(result)) + # TODO(CoW-warn) should warn -> reference is only tracked in CoW mode, so + # warning is not triggered result.iloc[0] = 100 if using_copy_on_write: expected = Series( diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 085f355dc4377..d11a2893becdc 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -4,6 +4,7 @@ from pandas import ( Categorical, DataFrame, + option_context, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -395,6 +396,17 @@ def test_replace_chained_assignment(using_copy_on_write): with tm.raises_chained_assignment_error(): df[["a"]].replace(1, 100, inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + df[["a"]].replace(1, 100, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + df[df.a > 5].replace(1, 100, inplace=True) + + with 
tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].replace(1, 100, inplace=True) def test_replace_listlike(using_copy_on_write): diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 4e08e00dac2b2..bc3b939734534 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -1,5 +1,4 @@ import numpy as np -import pytest from pandas import ( DataFrame, @@ -67,8 +66,6 @@ def test_set_column_with_index(using_copy_on_write): assert not np.shares_memory(get_array(df, "d"), arr) -# TODO(CoW-warn) this should NOT warn -@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_set_columns_with_dataframe(using_copy_on_write): # Case: setting a DataFrame as new columns copies that data # (with delayed copy with CoW) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 7215e910365cf..8828f33b7c62c 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -35,8 +35,7 @@ def test_series_constructor(self, data): if hasattr(result._mgr, "blocks"): assert isinstance(result2._mgr.blocks[0], EABackedBlock) - def test_series_constructor_no_data_with_index(self, dtype): - na_value = dtype.na_value + def test_series_constructor_no_data_with_index(self, dtype, na_value): result = pd.Series(index=[1, 2, 3], dtype=dtype) expected = pd.Series([na_value] * 3, index=[1, 2, 3], dtype=dtype) tm.assert_series_equal(result, expected) @@ -46,8 +45,7 @@ def test_series_constructor_no_data_with_index(self, dtype): expected = pd.Series([], index=pd.Index([], dtype="object"), dtype=dtype) tm.assert_series_equal(result, expected) - def test_series_constructor_scalar_na_with_index(self, dtype): - na_value = dtype.na_value + def test_series_constructor_scalar_na_with_index(self, dtype, na_value): result = pd.Series(na_value, index=[1, 2, 3], dtype=dtype) expected 
= pd.Series([na_value] * 3, index=[1, 2, 3], dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index be16a105384c6..5f0c1b960a475 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -148,8 +148,7 @@ def test_getitem_invalid(self, data): with pytest.raises(IndexError, match=msg): data[-ub - 1] - def test_getitem_scalar_na(self, data_missing, na_cmp): - na_value = data_missing.dtype.na_value + def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): result = data_missing[0] assert na_cmp(result, na_value) @@ -349,8 +348,7 @@ def test_take_sequence(self, data): assert result.iloc[1] == data[1] assert result.iloc[2] == data[3] - def test_take(self, data, na_cmp): - na_value = data.dtype.na_value + def test_take(self, data, na_value, na_cmp): result = data.take([0, -1]) assert result.dtype == data.dtype assert result[0] == data[0] @@ -363,8 +361,7 @@ def test_take(self, data, na_cmp): with pytest.raises(IndexError, match="out of bounds"): data.take([len(data) + 1]) - def test_take_empty(self, data, na_cmp): - na_value = data.dtype.na_value + def test_take_empty(self, data, na_value, na_cmp): empty = data[:0] result = empty.take([-1], allow_fill=True) @@ -396,8 +393,7 @@ def test_take_non_na_fill_value(self, data_missing): expected = arr.take([1, 1]) tm.assert_extension_array_equal(result, expected) - def test_take_pandas_style_negative_raises(self, data): - na_value = data.dtype.na_value + def test_take_pandas_style_negative_raises(self, data, na_value): with pytest.raises(ValueError, match=""): data.take([0, -2], fill_value=na_value, allow_fill=True) @@ -417,8 +413,7 @@ def test_take_series(self, data): ) tm.assert_series_equal(result, expected) - def test_reindex(self, data): - na_value = data.dtype.na_value + def test_reindex(self, data, na_value): s = pd.Series(data) result = s.reindex([0, 1, 3]) expected = 
pd.Series(data.take([0, 1, 3]), index=[0, 1, 3]) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index e10c6ef9a7018..b9407c7197f20 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -121,7 +121,7 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.intp)) tm.assert_series_equal(result, expected) - def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting): + def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): # GH 24382 is_bool = data_for_sorting.dtype._is_boolean @@ -154,10 +154,9 @@ def test_argmin_argmax_empty_array(self, method, data): getattr(data[:0], method)() @pytest.mark.parametrize("method", ["argmax", "argmin"]) - def test_argmin_argmax_all_na(self, method, data): + def test_argmin_argmax_all_na(self, method, data, na_value): # all missing with skipna=True is the same as empty err_msg = "attempt to get" - na_value = data.dtype.na_value data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype) with pytest.raises(ValueError, match=err_msg): getattr(data_na, method)() @@ -556,8 +555,7 @@ def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series): sorter = np.array([1, 0]) assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 - def test_where_series(self, data, as_frame): - na_value = data.dtype.na_value + def test_where_series(self, data, na_value, as_frame): assert data[0] != data[1] cls = type(data) a, b = data[:2] @@ -684,8 +682,7 @@ def test_insert_invalid_loc(self, data): data.insert(1.5, data[0]) @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) - def test_equals(self, data, as_series, box): - na_value = data.dtype.na_value + def test_equals(self, data, na_value, as_series, box): data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype) data_na = type(data)._from_sequence([na_value] * 
len(data), dtype=data.dtype) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 227a1b76088cb..4550e3b055cfe 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -72,8 +72,7 @@ def test_concat_mixed_dtypes(self, data): expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")]) tm.assert_series_equal(result, expected) - def test_concat_columns(self, data): - na_value = data.dtype.na_value + def test_concat_columns(self, data, na_value): df1 = pd.DataFrame({"A": data[:3]}) df2 = pd.DataFrame({"B": [1, 2, 3]}) @@ -97,9 +96,8 @@ def test_concat_columns(self, data): result = pd.concat([df1["A"], df2["B"]], axis=1) tm.assert_frame_equal(result, expected) - def test_concat_extension_arrays_copy_false(self, data): + def test_concat_extension_arrays_copy_false(self, data, na_value): # GH 20756 - na_value = data.dtype.na_value df1 = pd.DataFrame({"A": data[:3]}) df2 = pd.DataFrame({"B": data[3:7]}) expected = pd.DataFrame( @@ -124,8 +122,7 @@ def test_concat_with_reindex(self, data): ) tm.assert_frame_equal(result, expected) - def test_align(self, data): - na_value = data.dtype.na_value + def test_align(self, data, na_value): a = data[:3] b = data[2:5] r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) @@ -136,8 +133,7 @@ def test_align(self, data): tm.assert_series_equal(r1, e1) tm.assert_series_equal(r2, e2) - def test_align_frame(self, data): - na_value = data.dtype.na_value + def test_align_frame(self, data, na_value): a = data[:3] b = data[2:5] r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3])) @@ -152,9 +148,8 @@ def test_align_frame(self, data): tm.assert_frame_equal(r1, e1) tm.assert_frame_equal(r2, e2) - def test_align_series_frame(self, data): + def test_align_series_frame(self, data, na_value): # https://github.com/pandas-dev/pandas/issues/20576 - na_value = data.dtype.na_value ser = pd.Series(data, name="a") 
df = pd.DataFrame({"col": np.arange(len(ser) + 1)}) r1, r2 = ser.align(df) @@ -185,7 +180,7 @@ def test_set_frame_overwrite_object(self, data): df["A"] = data assert df.dtypes["A"] == data.dtype - def test_merge(self, data): + def test_merge(self, data, na_value): # GH-20743 df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]}) df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]}) @@ -210,8 +205,7 @@ def test_merge(self, data): "int2": [1, 2, 3, np.nan, 4], "key": [0, 0, 1, 2, 3], "ext": data._from_sequence( - [data[0], data[0], data[1], data[2], data.dtype.na_value], - dtype=data.dtype, + [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype ), } ) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 2e2cbf5bf0d83..067b401ce2f23 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -359,8 +359,7 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): tm.assert_frame_equal(result, expected) - def test_setitem_with_expansion_row(self, data): - na_value = data.dtype.na_value + def test_setitem_with_expansion_row(self, data, na_value): df = pd.DataFrame({"data": data[:1]}) df.loc[1, "data"] = data[1] diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 27f6eabfd126e..c5b1295ee4a7d 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -118,6 +118,16 @@ def na_cmp(): return operator.is_ +@pytest.fixture +def na_value(dtype): + """ + The scalar missing value for this type. Default dtype.na_value. 
+ + TODO: can be removed in 3.x (see https://github.com/pandas-dev/pandas/pull/54930) + """ + return dtype.na_value + + @pytest.fixture def data_for_grouping(): """ diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 71133030a5c18..592b88c4701d2 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -97,24 +97,24 @@ def test_from_dtype(self, data): super().test_from_dtype(data) @pytest.mark.xfail(reason="RecursionError, GH-33900") - def test_series_constructor_no_data_with_index(self, dtype): + def test_series_constructor_no_data_with_index(self, dtype, na_value): # RecursionError: maximum recursion depth exceeded in comparison rec_limit = sys.getrecursionlimit() try: # Limit to avoid stack overflow on Windows CI sys.setrecursionlimit(100) - super().test_series_constructor_no_data_with_index(dtype) + super().test_series_constructor_no_data_with_index(dtype, na_value) finally: sys.setrecursionlimit(rec_limit) @pytest.mark.xfail(reason="RecursionError, GH-33900") - def test_series_constructor_scalar_na_with_index(self, dtype): + def test_series_constructor_scalar_na_with_index(self, dtype, na_value): # RecursionError: maximum recursion depth exceeded in comparison rec_limit = sys.getrecursionlimit() try: # Limit to avoid stack overflow on Windows CI sys.setrecursionlimit(100) - super().test_series_constructor_scalar_na_with_index(dtype) + super().test_series_constructor_scalar_na_with_index(dtype, na_value) finally: sys.setrecursionlimit(rec_limit) @@ -214,19 +214,19 @@ def test_combine_first(self, data): super().test_combine_first(data) @pytest.mark.xfail(reason="broadcasting error") - def test_where_series(self, data): + def test_where_series(self, data, na_value): # Fails with # *** ValueError: operands could not be broadcast together # with shapes (4,) (4,) (0,) - super().test_where_series(data) + super().test_where_series(data, na_value) 
@pytest.mark.xfail(reason="Can't compare dicts.") def test_searchsorted(self, data_for_sorting): super().test_searchsorted(data_for_sorting) @pytest.mark.xfail(reason="Can't compare dicts.") - def test_equals(self, data, as_series): - super().test_equals(data, as_series) + def test_equals(self, data, na_value, as_series): + super().test_equals(data, na_value, as_series) @pytest.mark.skip("fill-value is interpreted as a dict of values") def test_fillna_copy_frame(self, data_missing): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index adbc4a1bb729b..8298d39a5eca9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -503,7 +503,9 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, reque super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna, request): + def test_reduce_series_boolean( + self, data, all_boolean_reductions, skipna, na_value, request + ): pa_dtype = data.dtype.pyarrow_dtype xfail_mark = pytest.mark.xfail( raises=TypeError, diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 5c793e53cf759..8e46b90d4df1e 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -149,29 +149,29 @@ def test_concat_mixed_dtypes(self, data): def test_stack(self, data, columns, future_stack): super().test_stack(data, columns, future_stack) - def test_concat_columns(self, data): + def test_concat_columns(self, data, na_value): self._check_unsupported(data) - super().test_concat_columns(data) + super().test_concat_columns(data, na_value) - def test_concat_extension_arrays_copy_false(self, data): + def test_concat_extension_arrays_copy_false(self, data, na_value): self._check_unsupported(data) - 
super().test_concat_extension_arrays_copy_false(data) + super().test_concat_extension_arrays_copy_false(data, na_value) - def test_align(self, data): + def test_align(self, data, na_value): self._check_unsupported(data) - super().test_align(data) + super().test_align(data, na_value) - def test_align_frame(self, data): + def test_align_frame(self, data, na_value): self._check_unsupported(data) - super().test_align_frame(data) + super().test_align_frame(data, na_value) - def test_align_series_frame(self, data): + def test_align_series_frame(self, data, na_value): self._check_unsupported(data) - super().test_align_series_frame(data) + super().test_align_series_frame(data, na_value) - def test_merge(self, data): + def test_merge(self, data, na_value): self._check_unsupported(data) - super().test_merge(data) + super().test_merge(data, na_value) class TestGetitem(BaseSparseTests, base.BaseGetitemTests): @@ -183,9 +183,9 @@ def test_get(self, data): assert ser.get(4) == ser.iloc[2] assert ser.get(2) == ser.iloc[1] - def test_reindex(self, data): + def test_reindex(self, data, na_value): self._check_unsupported(data) - super().test_reindex(data) + super().test_reindex(data, na_value) class TestSetitem(BaseSparseTests, base.BaseSetitemTests): @@ -285,7 +285,7 @@ def test_fillna_copy_series(self, data_missing, using_copy_on_write): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) - def test_where_series(self, data): + def test_where_series(self, data, na_value): assert data[0] != data[1] cls = type(data) a, b = data[:2] @@ -296,7 +296,6 @@ def test_where_series(self, data): result = ser.where(cond) new_dtype = SparseDtype("float", 0.0) - na_value = data.dtype.na_value expected = pd.Series( cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype) ) @@ -320,15 +319,15 @@ def test_shift_0_periods(self, data): assert result._sparse_values[0] != result._sparse_values[1] @pytest.mark.parametrize("method", ["argmax", 
"argmin"]) - def test_argmin_argmax_all_na(self, method, data): + def test_argmin_argmax_all_na(self, method, data, na_value): # overriding because Sparse[int64, 0] cannot handle na_value self._check_unsupported(data) - super().test_argmin_argmax_all_na(method, data) + super().test_argmin_argmax_all_na(method, data, na_value) @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) - def test_equals(self, data, as_series, box): + def test_equals(self, data, na_value, as_series, box): self._check_unsupported(data) - super().test_equals(data, as_series, box) + super().test_equals(data, na_value, as_series, box) @pytest.mark.parametrize( "func, na_action, expected", diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index fb2df0b82e5f4..f7ed5180b46d9 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -10,75 +10,32 @@ @pytest.fixture -def float_frame_with_na(): +def datetime_frame() -> DataFrame: """ - Fixture for DataFrame of floats with index of unique strings + Fixture for DataFrame of floats with DatetimeIndex - Columns are ['A', 'B', 'C', 'D']; some entries are missing + Columns are ['A', 'B', 'C', 'D'] A B C D - ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997 - DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872 - neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522 - 0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018 - 3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826 - soujjZ0A08 NaN NaN NaN NaN - 7W6NLGsjB9 NaN NaN NaN NaN + 2000-01-03 -1.122153 0.468535 0.122226 1.693711 + 2000-01-04 0.189378 0.486100 0.007864 -1.216052 + 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 + 2000-01-06 0.430050 0.894352 0.090719 0.036939 + 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 + 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 + 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 ... ... ... ... ... 
- uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590 - n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717 - ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189 - uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503 - 3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947 - 2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083 - sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517 + 2000-02-03 1.642618 -0.579288 0.046005 1.385249 + 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 + 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 + 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 + 2000-02-09 1.377373 0.398619 1.008453 -0.928207 + 2000-02-10 0.473194 -0.636677 0.984058 0.511519 + 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 [30 rows x 4 columns] """ - df = DataFrame(tm.getSeriesData()) - # set some NAs - df.iloc[5:10] = np.nan - df.iloc[15:20, -2:] = np.nan - return df - - -@pytest.fixture -def bool_frame_with_na(): - """ - Fixture for DataFrame of booleans with index of unique strings - - Columns are ['A', 'B', 'C', 'D']; some entries are missing - - A B C D - zBZxY2IDGd False False False False - IhBWBMWllt False True True True - ctjdvZSR6R True False True True - AVTujptmxb False True False True - G9lrImrSWq False False False True - sFFwdIUfz2 NaN NaN NaN NaN - s15ptEJnRb NaN NaN NaN NaN - ... ... ... ... ... 
- UW41KkDyZ4 True True False False - l9l6XkOdqV True False False False - X2MeZfzDYA False True False False - xWkIKU7vfX False True False True - QOhL6VmpGU False False False True - 22PwkRJdat False True False False - kfboQ3VeIK True False True False - - [30 rows x 4 columns] - """ - df = DataFrame(tm.getSeriesData()) > 0 - df = df.astype(object) - # set some NAs - df.iloc[5:10] = np.nan - df.iloc[15:20, -2:] = np.nan - - # For `any` tests we need to have at least one True before the first NaN - # in each column - for i in range(4): - df.iloc[i, i] = True - return df + return DataFrame(tm.getTimeSeriesData()) @pytest.fixture @@ -202,60 +159,3 @@ def timezone_frame(): df.iloc[1, 1] = NaT df.iloc[1, 2] = NaT return df - - -@pytest.fixture -def uint64_frame(): - """ - Fixture for DataFrame with uint64 values - - Columns are ['A', 'B'] - """ - return DataFrame( - {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, dtype=np.uint64 - ) - - -@pytest.fixture -def simple_frame(): - """ - Fixture for simple 3x3 DataFrame - - Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c']. - - one two three - a 1.0 2.0 3.0 - b 4.0 5.0 6.0 - c 7.0 8.0 9.0 - """ - arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) - - return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"]) - - -@pytest.fixture -def frame_of_index_cols(): - """ - Fixture for DataFrame of columns that can be used for indexing - - Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; - 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. 
- - A B C D E (tuple, as, label) - 0 foo one a 0.608477 -0.012500 -1.664297 - 1 foo two b -0.633460 0.249614 -0.364411 - 2 foo three c 0.615256 2.154968 -0.834666 - 3 bar one d 0.234246 1.085675 0.718445 - 4 bar two e 0.533841 -0.005702 -3.533912 - """ - df = DataFrame( - { - "A": ["foo", "foo", "foo", "bar", "bar"], - "B": ["one", "two", "three", "one", "two"], - "C": ["a", "b", "c", "d", "e"], - "D": np.random.default_rng(2).standard_normal(5), - "E": np.random.default_rng(2).standard_normal(5), - ("tuple", "as", "label"): np.random.default_rng(2).standard_normal(5), - } - ) - return df diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index d76bfd2d7ae7c..3cad2e73d3d9d 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1466,8 +1466,6 @@ def test_loc_named_tuple_for_midx(self): ) tm.assert_frame_equal(result, expected) - # TODO(CoW-warn) shouldn't warn, but does because of item cache - @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") @pytest.mark.parametrize("indexer", [["a"], "a"]) @pytest.mark.parametrize("col", [{}, {"b": 1}]) def test_set_2d_casting_date_to_int(self, col, indexer): @@ -1571,8 +1569,11 @@ def test_setitem_value_coercing_dtypes(self, indexer, idx): class TestDataFrameIndexingUInt64: - def test_setitem(self, uint64_frame): - df = uint64_frame + def test_setitem(self): + df = DataFrame( + {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, + dtype=np.uint64, + ) idx = df["A"].rename("foo") # setitem diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index eac10d307c61c..2578dfb622fbf 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -67,9 +67,19 @@ def test_astype_mixed_float(self, mixed_float_frame): casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16") _check_cast(casted, 
"float16") - def test_astype_mixed_type(self, mixed_type_frame): + def test_astype_mixed_type(self): # mixed casting - mn = mixed_type_frame._get_numeric_data().copy() + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) + mn = df._get_numeric_data().copy() mn["little_float"] = np.array(12345.0, dtype="float16") mn["big_float"] = np.array(123456789101112.0, dtype="float64") diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index ed8ccaea92c58..f783a388d7517 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -94,9 +94,13 @@ def test_clip_against_series(self, inplace): (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), ], ) - def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): + def test_clip_against_list_like(self, inplace, lower, axis, res): # GH#15390 - original = simple_frame.copy(deep=True) + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) + + original = DataFrame( + arr, columns=["one", "two", "three"], index=["a", "b", "c"] + ) result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 9b87ffb0241ef..98113b6c41821 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -24,6 +24,34 @@ import pandas._testing as tm +@pytest.fixture +def frame_of_index_cols(): + """ + Fixture for DataFrame of columns that can be used for indexing + + Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; + 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. 
+ + A B C D E (tuple, as, label) + 0 foo one a 0.608477 -0.012500 -1.664297 + 1 foo two b -0.633460 0.249614 -0.364411 + 2 foo three c 0.615256 2.154968 -0.834666 + 3 bar one d 0.234246 1.085675 0.718445 + 4 bar two e 0.533841 -0.005702 -3.533912 + """ + df = DataFrame( + { + "A": ["foo", "foo", "foo", "bar", "bar"], + "B": ["one", "two", "three", "one", "two"], + "C": ["a", "b", "c", "d", "e"], + "D": np.random.default_rng(2).standard_normal(5), + "E": np.random.default_rng(2).standard_normal(5), + ("tuple", "as", "label"): np.random.default_rng(2).standard_normal(5), + } + ) + return df + + class TestSetIndex: def test_set_index_multiindex(self): # segfault in GH#3308 diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 50fc6fe6984e7..f96f4e0558fa6 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -87,9 +87,13 @@ def test_transpose_object_to_tzaware_mixed_tz(self): res2 = df2.T assert (res2.dtypes == object).all() - def test_transpose_uint64(self, uint64_frame): - result = uint64_frame.T - expected = DataFrame(uint64_frame.values.T) + def test_transpose_uint64(self): + df = DataFrame( + {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, + dtype=np.uint64, + ) + result = df.T + expected = DataFrame(df.values.T) expected.index = ["A", "B"] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 9e3ee7c69b637..8083795a69413 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -28,6 +28,23 @@ ) +@pytest.fixture +def simple_frame(): + """ + Fixture for simple 3x3 DataFrame + + Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c']. 
+ + one two three + a 1.0 2.0 3.0 + b 4.0 5.0 6.0 + c 7.0 8.0 9.0 + """ + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) + + return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"]) + + @pytest.fixture(autouse=True, params=[0, 100], ids=["numexpr", "python"]) def switch_numexpr_min_elements(request, monkeypatch): with monkeypatch.context() as m: diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index fc7c1b0f01fed..20ad93e6dce4d 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -147,6 +147,78 @@ def wrapper(x): tm.assert_series_equal(r1, expected) +@pytest.fixture +def bool_frame_with_na(): + """ + Fixture for DataFrame of booleans with index of unique strings + + Columns are ['A', 'B', 'C', 'D']; some entries are missing + + A B C D + zBZxY2IDGd False False False False + IhBWBMWllt False True True True + ctjdvZSR6R True False True True + AVTujptmxb False True False True + G9lrImrSWq False False False True + sFFwdIUfz2 NaN NaN NaN NaN + s15ptEJnRb NaN NaN NaN NaN + ... ... ... ... ... 
+ UW41KkDyZ4 True True False False + l9l6XkOdqV True False False False + X2MeZfzDYA False True False False + xWkIKU7vfX False True False True + QOhL6VmpGU False False False True + 22PwkRJdat False True False False + kfboQ3VeIK True False True False + + [30 rows x 4 columns] + """ + df = DataFrame(tm.getSeriesData()) > 0 + df = df.astype(object) + # set some NAs + df.iloc[5:10] = np.nan + df.iloc[15:20, -2:] = np.nan + + # For `any` tests we need to have at least one True before the first NaN + # in each column + for i in range(4): + df.iloc[i, i] = True + return df + + +@pytest.fixture +def float_frame_with_na(): + """ + Fixture for DataFrame of floats with index of unique strings + + Columns are ['A', 'B', 'C', 'D']; some entries are missing + + A B C D + ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997 + DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872 + neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522 + 0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018 + 3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826 + soujjZ0A08 NaN NaN NaN NaN + 7W6NLGsjB9 NaN NaN NaN NaN + ... ... ... ... ... 
+ uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590 + n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717 + ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189 + uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503 + 3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947 + 2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083 + sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517 + + [30 rows x 4 columns] + """ + df = DataFrame(tm.getSeriesData()) + # set some NAs + df.iloc[5:10] = np.nan + df.iloc[15:20, -2:] = np.nan + return df + + class TestDataFrameAnalytics: # --------------------------------------------------------------------- # Reductions diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 78b99a00d43ce..45884a4b3c20f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -178,8 +178,8 @@ def test_agg_grouping_is_list_tuple(ts): tm.assert_frame_equal(result, expected) -def test_agg_python_multiindex(mframe): - grouped = mframe.groupby(["A", "B"]) +def test_agg_python_multiindex(multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby(["A", "B"]) result = grouped.agg("mean") expected = grouped.mean() diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 49fa9dc51f0d3..b8fb3b7fff676 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -24,21 +24,11 @@ def dropna(request): return request.param -@pytest.fixture(params=[True, False]) -def skipna(request): - return request.param - - @pytest.fixture(params=[True, False]) def observed(request): return request.param -@pytest.fixture -def mframe(multiindex_dataframe_random_data): - return multiindex_dataframe_random_data - - @pytest.fixture def df(): return DataFrame( @@ -57,25 +47,8 @@ def ts(): @pytest.fixture -def tsd(): - return tm.getTimeSeriesData() - - -@pytest.fixture -def tsframe(tsd): - return 
DataFrame(tsd) - - -@pytest.fixture -def df_mixed_floats(): - return DataFrame( - { - "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], - "B": ["one", "one", "two", "three", "two", "two", "one", "three"], - "C": np.random.default_rng(2).standard_normal(8), - "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), - } - ) +def tsframe(): + return DataFrame(tm.getTimeSeriesData()) @pytest.fixture diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index c2ffcb04caa60..ee8f93bf3b549 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -11,8 +11,8 @@ import pandas._testing as tm -def test_apply_describe_bug(mframe): - grouped = mframe.groupby(level="first") +def test_apply_describe_bug(multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby(level="first") grouped.describe() # it works! diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index 4a5571d0daa42..e39cfd520ba1a 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -122,8 +122,15 @@ def test_first_last_with_None_expanded(method, df, expected): tm.assert_frame_equal(result, expected) -def test_first_last_nth_dtypes(df_mixed_floats): - df = df_mixed_floats.copy() +def test_first_last_nth_dtypes(): + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), + } + ) df["E"] = True df["F"] = 1 diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index 1a030841ba3ab..3066825352fa7 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -24,8 +24,8 @@ ) -def 
test_tab_completion(mframe): - grp = mframe.groupby(level="second") +def test_tab_completion(multiindex_dataframe_random_data): + grp = multiindex_dataframe_random_data.groupby(level="second") results = {v for v in dir(grp) if not v.startswith("_")} expected = { "A", @@ -98,9 +98,13 @@ def test_tab_completion(mframe): assert results == expected -def test_all_methods_categorized(mframe): - grp = mframe.groupby(mframe.iloc[:, 0]) - names = {_ for _ in dir(grp) if not _.startswith("_")} - set(mframe.columns) +def test_all_methods_categorized(multiindex_dataframe_random_data): + grp = multiindex_dataframe_random_data.groupby( + multiindex_dataframe_random_data.iloc[:, 0] + ) + names = {_ for _ in dir(grp) if not _.startswith("_")} - set( + multiindex_dataframe_random_data.columns + ) new_names = set(names) new_names -= reduction_kernels new_names -= transformation_kernels diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 448fb53045e3d..c61d9fab0435e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -41,7 +41,7 @@ def test_repr(): assert result == expected -# TODO(CoW-warn) this should NOT warn +# TODO(CoW-warn) this should NOT warn -> inplace operator triggers it @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_groupby_std_datetimelike(warn_copy_on_write): # GH#48481 @@ -136,18 +136,27 @@ def test_basic_aggregations(dtype): grouped.aggregate(lambda x: x * 2) -def test_groupby_nonobject_dtype(mframe, df_mixed_floats): - key = mframe.index.codes[0] - grouped = mframe.groupby(key) +def test_groupby_nonobject_dtype(multiindex_dataframe_random_data): + key = multiindex_dataframe_random_data.index.codes[0] + grouped = multiindex_dataframe_random_data.groupby(key) result = grouped.sum() - expected = mframe.groupby(key.astype("O")).sum() + expected = multiindex_dataframe_random_data.groupby(key.astype("O")).sum() assert result.index.dtype == np.int8 
assert expected.index.dtype == np.int64 tm.assert_frame_equal(result, expected, check_index_type=False) + +def test_groupby_nonobject_dtype_mixed(): # GH 3911, mixed frame non-conversion - df = df_mixed_floats.copy() + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), + } + ) df["value"] = range(len(df)) def max_value(group): @@ -1052,7 +1061,7 @@ def test_raise_on_nuisance_python_multiple(three_group): grouped.mean() -def test_empty_groups_corner(mframe): +def test_empty_groups_corner(multiindex_dataframe_random_data): # handle empty groups df = DataFrame( { @@ -1069,7 +1078,7 @@ def test_empty_groups_corner(mframe): expected = grouped.mean(numeric_only=True) tm.assert_frame_equal(result, expected) - grouped = mframe[3:5].groupby(level=0) + grouped = multiindex_dataframe_random_data[3:5].groupby(level=0) agged = grouped.apply(lambda x: x.mean()) agged_A = grouped["A"].apply("mean") tm.assert_series_equal(agged["A"], agged_A) @@ -1083,8 +1092,8 @@ def test_nonsense_func(): df.groupby(lambda x: x + "foo") -def test_wrap_aggregated_output_multindex(mframe): - df = mframe.T +def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data.T df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] @@ -1103,24 +1112,24 @@ def aggfun(ser): df.groupby(keys).aggregate(aggfun) -def test_groupby_level_apply(mframe): - result = mframe.groupby(level=0).count() +def test_groupby_level_apply(multiindex_dataframe_random_data): + result = multiindex_dataframe_random_data.groupby(level=0).count() assert result.index.name == "first" - result = mframe.groupby(level=1).count() + result = multiindex_dataframe_random_data.groupby(level=1).count() assert result.index.name == "second" 
- result = mframe["A"].groupby(level=0).count() + result = multiindex_dataframe_random_data["A"].groupby(level=0).count() assert result.index.name == "first" -def test_groupby_level_mapper(mframe): - deleveled = mframe.reset_index() +def test_groupby_level_mapper(multiindex_dataframe_random_data): + deleveled = multiindex_dataframe_random_data.reset_index() mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1} mapper1 = {"one": 0, "two": 0, "three": 1} - result0 = mframe.groupby(mapper0, level=0).sum() - result1 = mframe.groupby(mapper1, level=1).sum() + result0 = multiindex_dataframe_random_data.groupby(mapper0, level=0).sum() + result1 = multiindex_dataframe_random_data.groupby(mapper1, level=1).sum() mapped_level0 = np.array( [mapper0.get(x) for x in deleveled["first"]], dtype=np.int64 @@ -1128,8 +1137,8 @@ def test_groupby_level_mapper(mframe): mapped_level1 = np.array( [mapper1.get(x) for x in deleveled["second"]], dtype=np.int64 ) - expected0 = mframe.groupby(mapped_level0).sum() - expected1 = mframe.groupby(mapped_level1).sum() + expected0 = multiindex_dataframe_random_data.groupby(mapped_level0).sum() + expected1 = multiindex_dataframe_random_data.groupby(mapped_level1).sum() expected0.index.name, expected1.index.name = "first", "second" tm.assert_frame_equal(result0, expected0) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 3e52476be9dbd..e3cc41afa4679 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -534,22 +534,24 @@ def test_multiindex_passthru(self): result = gb.first() tm.assert_frame_equal(result, df) - def test_multiindex_negative_level(self, mframe): + def test_multiindex_negative_level(self, multiindex_dataframe_random_data): # GH 13901 - result = mframe.groupby(level=-1).sum() - expected = mframe.groupby(level="second").sum() + result = multiindex_dataframe_random_data.groupby(level=-1).sum() + expected = 
multiindex_dataframe_random_data.groupby(level="second").sum() tm.assert_frame_equal(result, expected) - result = mframe.groupby(level=-2).sum() - expected = mframe.groupby(level="first").sum() + result = multiindex_dataframe_random_data.groupby(level=-2).sum() + expected = multiindex_dataframe_random_data.groupby(level="first").sum() tm.assert_frame_equal(result, expected) - result = mframe.groupby(level=[-2, -1]).sum() - expected = mframe.sort_index() + result = multiindex_dataframe_random_data.groupby(level=[-2, -1]).sum() + expected = multiindex_dataframe_random_data.sort_index() tm.assert_frame_equal(result, expected) - result = mframe.groupby(level=[-1, "first"]).sum() - expected = mframe.groupby(level=["second", "first"]).sum() + result = multiindex_dataframe_random_data.groupby(level=[-1, "first"]).sum() + expected = multiindex_dataframe_random_data.groupby( + level=["second", "first"] + ).sum() tm.assert_frame_equal(result, expected) def test_multifunc_select_col_integer_cols(self, df): @@ -641,9 +643,9 @@ def test_groupby_multiindex_partial_indexing_equivalence(self): tm.assert_dict_equal(expected_groups, result_groups) @pytest.mark.parametrize("sort", [True, False]) - def test_groupby_level(self, sort, mframe, df): + def test_groupby_level(self, sort, multiindex_dataframe_random_data, df): # GH 17537 - frame = mframe + frame = multiindex_dataframe_random_data deleveled = frame.reset_index() result0 = frame.groupby(level=0, sort=sort).sum() @@ -724,9 +726,9 @@ def test_groupby_level_with_nas(self, sort): expected = Series([6.0, 18.0], index=[0.0, 1.0]) tm.assert_series_equal(result, expected) - def test_groupby_args(self, mframe): + def test_groupby_args(self, multiindex_dataframe_random_data): # PR8618 and issue 8015 - frame = mframe + frame = multiindex_dataframe_random_data msg = "You have to supply one of 'by' and 'level'" with pytest.raises(TypeError, match=msg): @@ -743,14 +745,16 @@ def test_groupby_args(self, mframe): [False, [0, 0, 0, 1, 1, 2, 2, 
3, 3, 3]], ], ) - def test_level_preserve_order(self, sort, labels, mframe): + def test_level_preserve_order(self, sort, labels, multiindex_dataframe_random_data): # GH 17537 - grouped = mframe.groupby(level=0, sort=sort) + grouped = multiindex_dataframe_random_data.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) - def test_grouping_labels(self, mframe): - grouped = mframe.groupby(mframe.index.get_level_values(0)) + def test_grouping_labels(self, multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby( + multiindex_dataframe_random_data.index.get_level_values(0) + ) exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 808a1687390ff..bfb7acdcf4812 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -5,7 +5,6 @@ Series, array, ) -import pandas._testing as tm @pytest.fixture(params=[None, False]) @@ -40,22 +39,3 @@ def listlike_box(request): Types that may be passed as the indexer to searchsorted. """ return request.param - - -@pytest.fixture( - params=tm.ALL_REAL_NUMPY_DTYPES - + [ - "object", - "category", - "datetime64[ns]", - "timedelta64[ns]", - ] -) -def any_dtype_for_small_pos_integer_indexes(request): - """ - Dtypes that can be given to an Index with small positive integers. - - This means that for any dtype `x` in the params list, `Index([1, 2, 3], dtype=x)` is - valid and gives the correct Index (sub-)class. 
- """ - return request.param diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 80b5880257483..4ccc9a323a6c1 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -139,9 +139,10 @@ def test_date_range_invalid_periods(self): with pytest.raises(TypeError, match=msg): date_range(start="1/1/2000", periods="foo", freq="D") - def test_date_range_float_periods(self): - # TODO: reconsider allowing this? - rng = date_range("1/1/2000", periods=10.5) + def test_date_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + rng = date_range("1/1/2000", periods=10.5) exp = date_range("1/1/2000", periods=10) tm.assert_index_equal(rng, exp) diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py deleted file mode 100644 index e0155a13481ac..0000000000000 --- a/pandas/tests/indexes/interval/test_base.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np -import pytest - -from pandas import IntervalIndex -import pandas._testing as tm - - -class TestInterval: - """ - Tests specific to the shared common index tests; unrelated tests should be placed - in test_interval.py or the specific test file (e.g. 
test_astype.py) - """ - - @pytest.fixture - def simple_index(self) -> IntervalIndex: - return IntervalIndex.from_breaks(range(11), closed="right") - - @pytest.fixture - def index(self): - return tm.makeIntervalIndex(10) - - def test_take(self, closed): - index = IntervalIndex.from_breaks(range(11), closed=closed) - - result = index.take(range(10)) - tm.assert_index_equal(result, index) - - result = index.take([0, 0, 1]) - expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2], closed=closed) - tm.assert_index_equal(result, expected) - - def test_where(self, simple_index, listlike_box): - klass = listlike_box - - idx = simple_index - cond = [True] * len(idx) - expected = idx - result = expected.where(klass(cond)) - tm.assert_index_equal(result, expected) - - cond = [False] + [True] * len(idx[1:]) - expected = IntervalIndex([np.nan] + idx[1:].tolist()) - result = idx.where(klass(cond)) - tm.assert_index_equal(result, expected) - - def test_getitem_2d_deprecated(self, simple_index): - # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable - idx = simple_index - with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): - idx[:, None] - with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): - # GH#44051 - idx[True] - with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): - # GH#44051 - idx[False] diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 1efe5ff980f6c..078a0e06e0ed7 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -488,6 +488,23 @@ def test_index_mixed_closed(self): tm.assert_index_equal(result, expected) +@pytest.mark.parametrize("timezone", ["UTC", "US/Pacific", "GMT"]) +def test_interval_index_subtype(timezone, inclusive_endpoints_fixture): + # GH#46999 + dates = date_range("2022", periods=3, tz=timezone) + dtype = 
f"interval[datetime64[ns, {timezone}], {inclusive_endpoints_fixture}]" + result = IntervalIndex.from_arrays( + ["2022-01-01", "2022-01-02"], + ["2022-01-02", "2022-01-03"], + closed=inclusive_endpoints_fixture, + dtype=dtype, + ) + expected = IntervalIndex.from_arrays( + dates[:-1], dates[1:], closed=inclusive_endpoints_fixture + ) + tm.assert_index_equal(result, expected) + + def test_dtype_closed_mismatch(): # GH#38394 closed specified in both dtype and IntervalIndex constructor diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index db8f697b95cd8..2007a793843c9 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -19,12 +19,75 @@ array, date_range, interval_range, + isna, period_range, timedelta_range, ) import pandas._testing as tm +class TestGetItem: + def test_getitem(self, closed): + idx = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) + assert idx[0] == Interval(0.0, 1.0, closed=closed) + assert idx[1] == Interval(1.0, 2.0, closed=closed) + assert isna(idx[2]) + + result = idx[0:1] + expected = IntervalIndex.from_arrays((0.0,), (1.0,), closed=closed) + tm.assert_index_equal(result, expected) + + result = idx[0:2] + expected = IntervalIndex.from_arrays((0.0, 1), (1.0, 2.0), closed=closed) + tm.assert_index_equal(result, expected) + + result = idx[1:3] + expected = IntervalIndex.from_arrays( + (1.0, np.nan), (2.0, np.nan), closed=closed + ) + tm.assert_index_equal(result, expected) + + def test_getitem_2d_deprecated(self): + # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable + idx = IntervalIndex.from_breaks(range(11), closed="right") + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + idx[:, None] + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + # GH#44051 + idx[True] + with pytest.raises(ValueError, 
match="multi-dimensional indexing not allowed"): + # GH#44051 + idx[False] + + +class TestWhere: + def test_where(self, listlike_box): + klass = listlike_box + + idx = IntervalIndex.from_breaks(range(11), closed="right") + cond = [True] * len(idx) + expected = idx + result = expected.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * len(idx[1:]) + expected = IntervalIndex([np.nan] + idx[1:].tolist()) + result = idx.where(klass(cond)) + tm.assert_index_equal(result, expected) + + +class TestTake: + def test_take(self, closed): + index = IntervalIndex.from_breaks(range(11), closed=closed) + + result = index.take(range(10)) + tm.assert_index_equal(result, index) + + result = index.take([0, 0, 1]) + expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2], closed=closed) + tm.assert_index_equal(result, expected) + + class TestGetLoc: @pytest.mark.parametrize("side", ["right", "left", "both", "neither"]) def test_get_loc_interval(self, closed, side): diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index dea40eff8d2ac..e19b1700236f5 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -341,26 +341,6 @@ def test_is_monotonic_with_nans(self): assert not index._is_strictly_monotonic_decreasing assert not index.is_monotonic_decreasing - def test_get_item(self, closed): - i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) - assert i[0] == Interval(0.0, 1.0, closed=closed) - assert i[1] == Interval(1.0, 2.0, closed=closed) - assert isna(i[2]) - - result = i[0:1] - expected = IntervalIndex.from_arrays((0.0,), (1.0,), closed=closed) - tm.assert_index_equal(result, expected) - - result = i[0:2] - expected = IntervalIndex.from_arrays((0.0, 1), (1.0, 2.0), closed=closed) - tm.assert_index_equal(result, expected) - - result = i[1:3] - expected = IntervalIndex.from_arrays( - (1.0, np.nan), (2.0, 
np.nan), closed=closed - ) - tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( "breaks", [ diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 6c531fb0428a3..37606bda9efca 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -219,12 +219,15 @@ def test_float_subtype(self, start, end, freq): expected = "int64" if is_integer(start + end) else "float64" assert result == expected - def test_constructor_coverage(self): + def test_interval_range_fractional_period(self): # float value for periods expected = interval_range(start=0, periods=10) - result = interval_range(start=0, periods=10.5) + msg = "Non-integer 'periods' in pd.date_range, .* pd.interval_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = interval_range(start=0, periods=10.5) tm.assert_index_equal(result, expected) + def test_constructor_coverage(self): # equivalent timestamp-like start/end start, end = Timestamp("2017-01-01"), Timestamp("2017-01-15") expected = interval_range(start=start, end=end) diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py index 3cc4fa4713831..15062aee56e3a 100644 --- a/pandas/tests/indexes/multi/conftest.py +++ b/pandas/tests/indexes/multi/conftest.py @@ -1,7 +1,6 @@ import numpy as np import pytest -import pandas as pd from pandas import ( Index, MultiIndex, @@ -26,52 +25,3 @@ def idx(): verify_integrity=False, ) return mi - - -@pytest.fixture -def idx_dup(): - # compare tests/indexes/multi/conftest.py - major_axis = Index(["foo", "bar", "baz", "qux"]) - minor_axis = Index(["one", "two"]) - - major_codes = np.array([0, 0, 1, 0, 1, 1]) - minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index_names = ["first", "second"] - mi = MultiIndex( - levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=index_names, - 
verify_integrity=False, - ) - return mi - - -@pytest.fixture -def index_names(): - # names that match those in the idx fixture for testing equality of - # names assigned to the idx - return ["first", "second"] - - -@pytest.fixture -def narrow_multi_index(): - """ - Return a MultiIndex that is narrower than the display (<80 characters). - """ - n = 1000 - ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) - dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) - return MultiIndex.from_arrays([ci, ci.codes + 9, dti], names=["a", "b", "dti"]) - - -@pytest.fixture -def wide_multi_index(): - """ - Return a MultiIndex that is wider than the display (>80 characters). - """ - n = 1000 - ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) - dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) - levels = [ci, ci.codes + 9, dti, dti, dti] - names = ["a", "b", "dti_1", "dti_2", "dti_3"] - return MultiIndex.from_arrays(levels, names=names) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index ee1edaa27f804..a69248cf038f8 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -11,12 +11,31 @@ from pandas import ( NA, DatetimeIndex, + Index, MultiIndex, Series, ) import pandas._testing as tm +@pytest.fixture +def idx_dup(): + # compare tests/indexes/multi/conftest.py + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) + + major_codes = np.array([0, 0, 1, 0, 1, 1]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) + return mi + + @pytest.mark.parametrize("names", [None, ["first", "second"]]) def test_unique(names): mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names) diff --git a/pandas/tests/indexes/multi/test_formats.py 
b/pandas/tests/indexes/multi/test_formats.py index 1736f65e355fb..52ff3109128f2 100644 --- a/pandas/tests/indexes/multi/test_formats.py +++ b/pandas/tests/indexes/multi/test_formats.py @@ -139,8 +139,11 @@ def test_repr(self, idx): names=['first', ...], length=6)""" assert result == expected - def test_rjust(self, narrow_multi_index): - mi = narrow_multi_index + def test_rjust(self): + n = 1000 + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + mi = MultiIndex.from_arrays([ci, ci.codes + 9, dti], names=["a", "b", "dti"]) result = mi[:1].__repr__() expected = """\ MultiIndex([('a', 9, '2000-01-01 00:00:00')], @@ -182,8 +185,13 @@ def test_rjust(self, narrow_multi_index): names=['a', 'b', 'dti'], length=2000)""" assert result == expected - def test_tuple_width(self, wide_multi_index): - mi = wide_multi_index + def test_tuple_width(self): + n = 1000 + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + levels = [ci, ci.codes + 9, dti, dti, dti] + names = ["a", "b", "dti_1", "dti_2", "dti_3"] + mi = MultiIndex.from_arrays(levels, names=names) result = mi[:1].__repr__() expected = """MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...)], names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" # noqa: E501 diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 0720a1e1c648c..e362fc8a05a46 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -95,8 +95,9 @@ def test_get_level_number_out_of_bounds(multiindex_dataframe_random_data): frame.index._get_level_number(-3) -def test_set_name_methods(idx, index_names): +def test_set_name_methods(idx): # so long as these are synonyms, we don't need to test set_names + index_names = ["first", "second"] assert idx.rename == idx.set_names new_names = [name + "SUFFIX" for name in 
index_names] ind = idx.set_names(new_names) diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 8ae643eb3626d..45f19b4d70fb9 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -83,11 +83,11 @@ def test_copy_names(): multi_idx.copy(names=[["mario"], ["luigi"]]) -def test_names(idx, index_names): +def test_names(idx): # names are assigned in setup - assert index_names == ["first", "second"] + assert idx.names == ["first", "second"] level_names = [level.name for level in idx.levels] - assert level_names == index_names + assert level_names == idx.names # setting bad names on existing index = idx diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index a1923d29d3d0e..aecd3b3bace9a 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -176,8 +176,10 @@ def test_constructor_invalid_quarters(self): year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC" ) - def test_constructor_corner(self): - result = period_range("2007-01", periods=10.5, freq="M") + def test_period_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = period_range("2007-01", periods=10.5, freq="M") exp = period_range("2007-01", periods=10, freq="M") tm.assert_index_equal(result, exp) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7620bf272db11..dc624f0271a73 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -462,7 +462,11 @@ def test_empty_fancy(self, index, dtype): empty_index = type(index)([], dtype=index.dtype) assert index[[]].identical(empty_index) - assert index[empty_arr].identical(empty_index) + if dtype == np.bool_: + with tm.assert_produces_warning(FutureWarning, 
match="is deprecated"): + assert index[empty_arr].identical(empty_index) + else: + assert index[empty_arr].identical(empty_index) @pytest.mark.parametrize( "index", diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index d126d32e627cd..1f328c06b483b 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -30,6 +30,25 @@ ) +@pytest.fixture( + params=tm.ALL_REAL_NUMPY_DTYPES + + [ + "object", + "category", + "datetime64[ns]", + "timedelta64[ns]", + ] +) +def any_dtype_for_small_pos_integer_indexes(request): + """ + Dtypes that can be given to an Index with small positive integers. + + This means that for any dtype `x` in the params list, `Index([1, 2, 3], dtype=x)` is + valid and gives the correct Index (sub-)class. + """ + return request.param + + def test_union_same_types(index): # Union with a non-unique, non-monotonic index raises error # Only needed for bool index factory diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index ccbd61c4d6693..8c51f6c0a1d25 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -174,11 +174,14 @@ def test_constructor_iso(self): result = to_timedelta(durations) tm.assert_index_equal(result, expected) - def test_constructor_coverage(self): - rng = timedelta_range("1 days", periods=10.5) + def test_timedelta_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + rng = timedelta_range("1 days", periods=10.5) exp = timedelta_range("1 days", periods=10) tm.assert_index_equal(rng, exp) + def test_constructor_coverage(self): msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): timedelta_range(start="1 days", periods="foo", freq="D") diff --git a/pandas/tests/indexing/test_iloc.py 
b/pandas/tests/indexing/test_iloc.py index b3f93cbfd4113..cbcbf3396363a 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -424,7 +424,7 @@ def test_iloc_getitem_slice_dups(self): tm.assert_frame_equal(df.iloc[10:, :2], df2) tm.assert_frame_equal(df.iloc[10:, 2:], df1) - # TODO(CoW-warn) this should NOT warn + # TODO(CoW-warn) this should NOT warn -> Series inplace operator @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_iloc_setitem(self): df = DataFrame( diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index ffc672cc748be..1251a6ae97a1c 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -3,6 +3,8 @@ in core.internals """ +import pytest + import pandas as pd import pandas._testing as tm from pandas.core import internals @@ -27,9 +29,6 @@ def test_namespace(): "ops", ] expected = [ - "Block", - "DatetimeTZBlock", - "ExtensionBlock", "make_block", "DataManager", "ArrayManager", @@ -44,6 +43,28 @@ def test_namespace(): assert set(result) == set(expected + modules) +@pytest.mark.parametrize( + "name", + [ + "NumericBlock", + "ObjectBlock", + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + ], +) +def test_deprecations(name): + # GH#55139 + msg = f"{name} is deprecated.* Use public APIs instead" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + getattr(internals, name) + + if name not in ["NumericBlock", "ObjectBlock"]: + # NumericBlock and ObjectBlock are not in the internals.api namespace + with tm.assert_produces_warning(DeprecationWarning, match=msg): + getattr(api, name) + + def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ae79da4bbe0d3..792d1323ec730 100644 --- a/pandas/tests/internals/test_internals.py +++ 
b/pandas/tests/internals/test_internals.py @@ -991,7 +991,6 @@ def assert_slice_ok(mgr, axis, slobj): # 2D only support slice objects # boolean mask - assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_)) assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_)) assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_)) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index d3552ab5d39f5..ab6cacc4cc860 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -51,23 +51,7 @@ def xml_file(datapath): @pytest.fixture -def s3so(worker_id): - if is_ci_environment(): - url = "http://localhost:5000/" - else: - worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw") - url = f"http://127.0.0.1:555{worker_id}/" - return {"client_kwargs": {"endpoint_url": url}} - - -@pytest.fixture(scope="function" if is_ci_environment() else "session") -def monkeysession(): - with pytest.MonkeyPatch.context() as mp: - yield mp - - -@pytest.fixture(scope="function" if is_ci_environment() else "session") -def s3_base(worker_id, monkeysession): +def s3_base(worker_id, monkeypatch): """ Fixture for mocking S3 interaction. 
@@ -79,8 +63,8 @@ def s3_base(worker_id, monkeysession): # temporary workaround as moto fails for botocore >= 1.11 otherwise, # see https://github.com/spulec/moto/issues/1924 & 1952 - monkeysession.setenv("AWS_ACCESS_KEY_ID", "foobar_key") - monkeysession.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") if is_ci_environment(): if is_platform_arm() or is_platform_mac() or is_platform_windows(): # NOT RUN on Windows/macOS/ARM, only Ubuntu @@ -93,6 +77,7 @@ def s3_base(worker_id, monkeysession): "Windows, macOS or ARM platforms" ) else: + # set in .github/workflows/unit-tests.yml yield "http://localhost:5000" else: requests = pytest.importorskip("requests") @@ -128,6 +113,11 @@ def s3_base(worker_id, monkeysession): proc.terminate() +@pytest.fixture +def s3so(s3_base): + return {"client_kwargs": {"endpoint_url": s3_base}} + + @pytest.fixture def s3_resource(s3_base): import boto3 diff --git a/pandas/tests/io/data/gbq_fake_job.txt b/pandas/tests/io/data/gbq_fake_job.txt deleted file mode 100644 index b0995222292e4..0000000000000 --- a/pandas/tests/io/data/gbq_fake_job.txt +++ /dev/null @@ -1 +0,0 @@ -{'status': {'state': 'DONE'}, 'kind': 'bigquery#job', 'statistics': {'query': {'cacheHit': True, 'totalBytesProcessed': '0'}, 'endTime': '1377668744674', 'totalBytesProcessed': '0', 'startTime': '1377668744466'}, 'jobReference': {'projectId': '57288129629', 'jobId': 'bqjob_r5f956972f0190bdf_00000140c374bf42_2'}, 'etag': '"4PTsVxg68bQkQs1RJ1Ndewqkgg4/oO4VmgFrAku4N6FWci9s7iFIftc"', 'configuration': {'query': {'createDisposition': 'CREATE_IF_NEEDED', 'query': 'SELECT * FROM [publicdata:samples.shakespeare]', 'writeDisposition': 'WRITE_TRUNCATE', 'destinationTable': {'projectId': '57288129629', 'tableId': 'anonb5ec450da88eeeb78a27784ea482ee75a146d442', 'datasetId': '_d0b4f5f0d50dc68a3eb0fa6cba66a9a8687d9253'}}}, 'id': 
'57288129629:bqjob_r5f956972f0190bdf_00000140c374bf42_2', 'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/57288129629/jobs/bqjob_r5f956972f0190bdf_00000140c374bf42_2'} \ No newline at end of file diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py deleted file mode 100644 index 15ff52d5bea48..0000000000000 --- a/pandas/tests/io/excel/conftest.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest - -import pandas._testing as tm - -from pandas.io.parsers import read_csv - - -@pytest.fixture -def frame(float_frame): - """ - Returns the first ten items in fixture "float_frame". - """ - return float_frame[:10] - - -@pytest.fixture -def tsframe(): - return tm.makeTimeDataFrame()[:5] - - -@pytest.fixture(params=[True, False]) -def merge_cells(request): - return request.param - - -@pytest.fixture -def df_ref(datapath): - """ - Obtain the reference data from read_csv with the Python engine. - """ - filepath = datapath("io", "data", "csv", "test1.csv") - df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python") - return df_ref - - -@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods", ".xlsb"]) -def read_ext(request): - """ - Valid extensions for reading Excel files. 
- """ - return request.param diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index ecee58362f8a9..271353a173d2a 100644 --- a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -13,7 +13,10 @@ odf = pytest.importorskip("odf") -pytestmark = pytest.mark.parametrize("ext", [".ods"]) + +@pytest.fixture +def ext(): + return ".ods" def test_write_append_mode_raises(ext): diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 53cbd1ce3cceb..2df9ec9e53516 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -17,10 +17,13 @@ openpyxl = pytest.importorskip("openpyxl") -pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) +@pytest.fixture +def ext(): + return ".xlsx" -def test_to_excel_styleconverter(ext): + +def test_to_excel_styleconverter(): from openpyxl import styles hstyle = { diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 74fe5166df65f..abbdb77efad0e 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -22,6 +22,7 @@ Index, MultiIndex, Series, + read_csv, ) import pandas._testing as tm from pandas.core.arrays import ( @@ -117,6 +118,16 @@ def read_ext(engine_and_read_ext): return read_ext +@pytest.fixture +def df_ref(datapath): + """ + Obtain the reference data from read_csv with the Python engine. 
+ """ + filepath = datapath("io", "data", "csv", "test1.csv") + df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python") + return df_ref + + def adjust_expected(expected: DataFrame, read_ext: str) -> None: expected.index.name = None diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index b9ea440b3e859..22cd0621fd4c4 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -34,6 +34,19 @@ from pandas.io.excel._util import _writers +@pytest.fixture +def frame(float_frame): + """ + Returns the first ten items in fixture "float_frame". + """ + return float_frame[:10] + + +@pytest.fixture(params=[True, False]) +def merge_cells(request): + return request.param + + @pytest.fixture def path(ext): """ @@ -444,8 +457,8 @@ def test_mixed(self, frame, path): recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(mixed_frame, recons) - def test_ts_frame(self, tsframe, path): - df = tsframe + def test_ts_frame(self, path): + df = tm.makeTimeDataFrame()[:5] # freq doesn't round-trip index = pd.DatetimeIndex(np.asarray(df.index), freq=None) @@ -516,8 +529,9 @@ def test_inf_roundtrip(self, path): tm.assert_frame_equal(df, recons) - def test_sheets(self, frame, tsframe, path): + def test_sheets(self, frame, path): # freq doesn't round-trip + tsframe = tm.makeTimeDataFrame()[:5] index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index @@ -633,10 +647,11 @@ def test_excel_roundtrip_indexname(self, merge_cells, path): tm.assert_frame_equal(result, df) assert result.index.name == "foo" - def test_excel_roundtrip_datetime(self, merge_cells, tsframe, path): + def test_excel_roundtrip_datetime(self, merge_cells, path): # datetime.date, not sure what to test here exactly # freq does not round-trip + tsframe = tm.makeTimeDataFrame()[:5] index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index @@ -751,8 
+766,8 @@ def test_to_excel_timedelta(self, path): recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_periodindex(self, tsframe, path): - xp = tsframe.resample("ME", kind="period").mean() + def test_to_excel_periodindex(self, path): + xp = tm.makeTimeDataFrame()[:5].resample("ME", kind="period").mean() xp.to_excel(path, sheet_name="sht1") @@ -814,8 +829,9 @@ def test_to_excel_multiindex_cols(self, merge_cells, frame, path): frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) - def test_to_excel_multiindex_dates(self, merge_cells, tsframe, path): + def test_to_excel_multiindex_dates(self, merge_cells, path): # try multiindex with dates + tsframe = tm.makeTimeDataFrame()[:5] new_index = [tsframe.index, np.arange(len(tsframe.index), dtype=np.int64)] tsframe.index = MultiIndex.from_arrays(new_index) diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index c4d02d71390cc..94f6bdfaf069c 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -9,7 +9,10 @@ xlsxwriter = pytest.importorskip("xlsxwriter") -pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) + +@pytest.fixture +def ext(): + return ".xlsx" def test_column_format(ext): diff --git a/pandas/tests/io/json/conftest.py b/pandas/tests/io/json/conftest.py index f3736252e850a..4e848cd48b42d 100644 --- a/pandas/tests/io/json/conftest.py +++ b/pandas/tests/io/json/conftest.py @@ -7,10 +7,3 @@ def orient(request): Fixture for orients excluding the table format. 
""" return request.param - - -@pytest.fixture(params=["ujson", "pyarrow"]) -def engine(request): - if request.param == "pyarrow": - pytest.importorskip("pyarrow.json") - return request.param diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index f5342e0ab1a38..d96ccb4b94cc2 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -25,6 +25,13 @@ def lines_json_df(): return df.to_json(lines=True, orient="records") +@pytest.fixture(params=["ujson", "pyarrow"]) +def engine(request): + if request.param == "pyarrow": + pytest.importorskip("pyarrow.json") + return request.param + + def test_read_jsonl(): # GH9180 result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 0c28db245de31..558fdb7632102 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -399,11 +399,16 @@ def test_escapechar(all_parsers): tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) -@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" + + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+") + return result = parser.read_csv(StringIO(data), sep=r"\s+") expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) @@ -582,12 +587,14 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request): if sep == r"\s+": data = data.replace(",", " ") + if parser.engine == "pyarrow": - mark = pytest.mark.xfail( - raises=ValueError, - reason="the 'pyarrow' engine does not 
support regex separators", - ) - request.applymarker(mark) + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines + ) + return result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) expected = DataFrame(exp_data, columns=["A", "B", "C"]) @@ -610,7 +617,6 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators @pytest.mark.parametrize( "data,expected", [ @@ -635,6 +641,12 @@ def test_whitespace_lines(all_parsers): def test_whitespace_regex_separator(all_parsers, data, expected): # see gh-6607 parser = all_parsers + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+") + return + result = parser.read_csv(StringIO(data), sep=r"\s+") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index 69c39fdf4cdbe..c374795019ff4 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -235,7 +235,6 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg, request): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -246,6 +245,12 @@ def test_temporary_file(all_parsers): new_file.flush() new_file.seek(0) + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(new_file, sep=r"\s+", header=None) + return + result = parser.read_csv(new_file, 
sep=r"\s+", header=None) expected = DataFrame([[0, 0]]) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index a9540c94ce10e..500863dce84ee 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,7 +17,6 @@ import numpy as np import pytest -from pandas.compat import is_ci_environment from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -531,24 +530,6 @@ def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix): tm.assert_frame_equal(out, expected) -@pytest.mark.single_cpu -@pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.") -def test_bytes_exceed_2gb(c_parser_only): - # see gh-16798 - # - # Read from a "CSV" that has a column larger than 2GB. - parser = c_parser_only - - if parser.low_memory: - pytest.skip("not a low_memory test") - - # csv takes 10 seconds to construct, spikes memory to 8GB+, the whole test - # spikes up to 10.4GB on the c_high case - csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) - df = parser.read_csv(csv) - assert not df.empty - - def test_chunk_whitespace_on_boundary(c_parser_only): # see gh-9735: this issue is C parser-specific (bug when # parsing whitespace and characters at chunk boundary) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 437a5fb5e9f09..ca106fa772e82 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -59,7 +59,6 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize( "na_values", [ @@ -87,12 +86,26 @@ def test_detect_string_na(all_parsers): """, ], ) -def test_non_string_na_values(all_parsers, data, na_values): +def test_non_string_na_values(all_parsers, data, na_values, request): # see 
gh-3611: with an odd float format, we can't match # the string "999.0" exactly but still need float matching parser = all_parsers expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"]) + if parser.engine == "pyarrow" and not all(isinstance(x, str) for x in na_values): + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values) + return + elif parser.engine == "pyarrow" and "-999.000" in data: + # bc the pyarrow engine does not include the float-ified version + # of "-999" -> -999, it does not match the entry with the trailing + # zeros, so "-999.000" is not treated as null. + mark = pytest.mark.xfail( + reason="pyarrow engined does not recognize equivalent floats" + ) + request.applymarker(mark) + result = parser.read_csv(StringIO(data), na_values=na_values) tm.assert_frame_equal(result, expected) @@ -145,8 +158,6 @@ def f(i, v): tm.assert_frame_equal(result, expected) -# ValueError: skiprows argument must be an integer when using engine='pyarrow' -@xfail_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -159,6 +170,12 @@ def test_custom_na_values(all_parsers, na_values): expected = DataFrame( [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "skiprows argument must be an integer when using engine='pyarrow'" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) + return + result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) tm.assert_frame_equal(result, expected) @@ -183,7 +200,6 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_na_value_dict(all_parsers): data = """A,B,C 
foo,bar,NA @@ -191,6 +207,13 @@ def test_na_value_dict(all_parsers): foo,bar,NA bar,foo,foo""" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) + return + df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) expected = DataFrame( { @@ -235,7 +258,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict @pytest.mark.parametrize( "kwargs,expected", [ @@ -281,7 +303,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected): +def test_na_values_keep_default(all_parsers, kwargs, expected, request): data = """\ A,B,C a,1,one @@ -293,6 +315,15 @@ def test_na_values_keep_default(all_parsers, kwargs, expected): g,7,seven """ parser = all_parsers + if parser.engine == "pyarrow": + if "na_values" in kwargs and isinstance(kwargs["na_values"], dict): + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + mark = pytest.mark.xfail() + request.applymarker(mark) + result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) @@ -323,11 +354,19 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + 
StringIO(data), na_values={"b": ["2"]}, keep_default_na=False + ) + return + result = parser.read_csv( StringIO(data), na_values={"b": ["2"]}, keep_default_na=False ) @@ -335,19 +374,24 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # # Scalar values shouldn't cause the parsing to crash or fail. data = "a,b\n1,2" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) + return + df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) expected = DataFrame({"a": [1], "b": [np.nan]}) tm.assert_frame_equal(df, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -368,6 +412,17 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v } ) + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=None, + keep_default_na=False, + na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values}, + ) + return + result = parser.read_csv( StringIO(data), header=None, @@ -427,7 +482,6 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize( "na_values,row_data", [ @@ -441,12 +495,27 @@ def test_na_values_scalar(all_parsers, na_values, row_data): names = ["a", "b"] data = 
"1,2\n2,1" + if parser.engine == "pyarrow" and isinstance(na_values, dict): + if isinstance(na_values, dict): + err = ValueError + msg = "The pyarrow engine doesn't support passing a dict for na_values" + else: + err = TypeError + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(err, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + elif parser.engine == "pyarrow": + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + result = parser.read_csv(StringIO(data), names=names, na_values=na_values) expected = DataFrame(row_data, columns=names) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -456,25 +525,36 @@ def test_na_values_dict_aliasing(all_parsers): data = "1,2\n2,1" expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names) + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + result = parser.read_csv(StringIO(data), names=names, na_values=na_values) tm.assert_frame_equal(result, expected) tm.assert_dict_equal(na_values, na_values_copy) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" parser = all_parsers na_values = {0: "foo"} + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values) + return + result = parser.read_csv(StringIO(data), 
na_values=na_values) expected = DataFrame({"a": [np.nan, 1]}) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -487,9 +567,19 @@ def test_na_values_dict_col_index(all_parsers): (str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])), ], ) -def test_na_values_uint64(all_parsers, data, kwargs, expected): +def test_na_values_uint64(all_parsers, data, kwargs, expected, request): # see gh-14983 parser = all_parsers + + if parser.engine == "pyarrow" and "na_values" in kwargs: + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), header=None, **kwargs) + return + elif parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="Returns float64 instead of object") + request.applymarker(mark) + result = parser.read_csv(StringIO(data), header=None, **kwargs) tm.assert_frame_equal(result, expected) @@ -566,11 +656,14 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): ) def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): parser = all_parsers - msg = ( - "(Bool column has NA values in column [0a])|" - "(cannot safely convert passed user dtype of " - "bool for object dtyped data in column 0)" + msg = "|".join( + [ + "Bool column has NA values in column [0a]", + "cannot safely convert passed user dtype of " + "bool for object dtyped data in column 0", + ] ) + with pytest.raises(ValueError, match=msg): parser.read_csv( StringIO(data), @@ -581,6 +674,8 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) +# TODO: this test isn't about the na_values keyword, it is about the empty entries +# being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes def test_str_nan_dropped(all_parsers): # see gh-21131 @@ -610,12 +705,19 @@ def test_str_nan_dropped(all_parsers): 
tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_nan_multi_index(all_parsers): # GH 42446 parser = all_parsers data = "A,B,B\nX,Y,Z\n1,2,inf" + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} + ) + return + result = parser.read_csv( StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} ) @@ -631,7 +733,7 @@ def test_nan_multi_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # Failed: DID NOT RAISE +@xfail_pyarrow # Failed: DID NOT RAISE ; it casts the NaN to False def test_bool_and_nan_to_bool(all_parsers): # GH#42808 parser = all_parsers diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 28e5f5ad9bb70..9351387dfc337 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -2,16 +2,13 @@ Tests parsers ability to read and parse non-local files and hence require a network connection to be read. 
""" -from io import ( - BytesIO, - StringIO, -) +from io import BytesIO import logging +import re import numpy as np import pytest -from pandas.compat import is_ci_environment import pandas.util._test_decorators as td from pandas import DataFrame @@ -292,39 +289,23 @@ def test_read_csv_handles_boto_s3_object( tm.assert_frame_equal(result, expected) @pytest.mark.single_cpu - @pytest.mark.skipif( - is_ci_environment(), - reason="GH: 45651: This test can hang in our CI min_versions build", - ) def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so): # 8 MB, S3FS uses 5MB chunks - import s3fs - - df = DataFrame( - np.random.default_rng(2).standard_normal((100000, 4)), columns=list("abcd") - ) - str_buf = StringIO() - - df.to_csv(str_buf) - - buf = BytesIO(str_buf.getvalue().encode("utf-8")) - - s3_public_bucket.put_object(Key="large-file.csv", Body=buf) - - # Possibly some state leaking in between tests. - # If we don't clear this cache, we saw `GetObject operation: Forbidden`. - # Presumably the s3fs instance is being cached, with the directory listing - # from *before* we add the large-file.csv in the s3_public_bucket_with_data. 
- s3fs.S3FileSystem.clear_instance_cache() - - with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv( - f"s3://{s3_public_bucket.name}/large-file.csv", - nrows=5, - storage_options=s3so, - ) - # log of fetch_range (start, stop) - assert (0, 5505024) in (x.args[-2:] for x in caplog.records) + df = DataFrame(np.zeros((100000, 4)), columns=list("abcd")) + with BytesIO(df.to_csv().encode("utf-8")) as buf: + s3_public_bucket.put_object(Key="large-file.csv", Body=buf) + uri = f"{s3_public_bucket.name}/large-file.csv" + match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$") + with caplog.at_level(logging.DEBUG, logger="s3fs"): + read_csv( + f"s3://{uri}", + nrows=5, + storage_options=s3so, + ) + for log in caplog.messages: + if match := re.match(match_re, log): + # Less than 8 MB + assert int(match.group("stop")) < 8000000 def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so): # GH 25945 diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 15b321c4616ca..23138f2710caf 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -98,7 +98,7 @@ def test_usecols_with_names(all_parsers): @pytest.mark.parametrize( "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) -def test_usecols_relative_to_names(all_parsers, names, usecols, request): +def test_usecols_relative_to_names(all_parsers, names, usecols): data = """\ 1,2,3 4,5,6 diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 8726d44c9c3ed..a1dec8a2d05b4 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -23,6 +23,28 @@ ) +@pytest.fixture +def fsspectest(): + pytest.importorskip("fsspec") + from fsspec import register_implementation + from fsspec.implementations.memory import MemoryFileSystem + from fsspec.registry import _registry as registry + + class
TestMemoryFS(MemoryFileSystem): + protocol = "testmem" + test = [None] + + def __init__(self, **kwargs) -> None: + self.test[0] = kwargs.pop("test", None) + super().__init__(**kwargs) + + register_implementation("testmem", TestMemoryFS, clobber=True) + yield TestMemoryFS() + registry.pop("testmem", None) + TestMemoryFS.test[0] = None + TestMemoryFS.store.clear() + + @pytest.fixture def df1(): return DataFrame( diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py new file mode 100644 index 0000000000000..2ca11ad1f74e6 --- /dev/null +++ b/pandas/tests/io/test_http_headers.py @@ -0,0 +1,172 @@ +""" +Tests for the pandas custom headers in http(s) requests +""" +from functools import partial +import gzip +from io import BytesIO + +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.network, + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), +] + + +def gzip_bytes(response_bytes): + with BytesIO() as bio: + with gzip.GzipFile(fileobj=bio, mode="w") as zipper: + zipper.write(response_bytes) + return bio.getvalue() + + +def csv_responder(df): + return df.to_csv(index=False).encode("utf-8") + + +def gz_csv_responder(df): + return gzip_bytes(csv_responder(df)) + + +def json_responder(df): + return df.to_json().encode("utf-8") + + +def gz_json_responder(df): + return gzip_bytes(json_responder(df)) + + +def html_responder(df): + return df.to_html(index=False).encode("utf-8") + + +def parquetpyarrow_reponder(df): + return df.to_parquet(index=False, engine="pyarrow") + + +def parquetfastparquet_responder(df): + # the fastparquet engine doesn't like to write to a buffer + # it can do it via the open_with function being set appropriately + # however it automatically calls the close method and wipes the buffer + # so just overwrite that attribute on this instance to not do that + + 
# protected by an importorskip in the respective test + import fsspec + + df.to_parquet( + "memory://fastparquet_user_agent.parquet", + index=False, + engine="fastparquet", + compression=None, + ) + with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f: + return f.read() + + +def pickle_respnder(df): + with BytesIO() as bio: + df.to_pickle(bio) + return bio.getvalue() + + +def stata_responder(df): + with BytesIO() as bio: + df.to_stata(bio, write_index=False) + return bio.getvalue() + + +@pytest.mark.parametrize( + "responder, read_method", + [ + (csv_responder, pd.read_csv), + (json_responder, pd.read_json), + ( + html_responder, + lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0], + ), + pytest.param( + parquetpyarrow_reponder, + partial(pd.read_parquet, engine="pyarrow"), + marks=td.skip_if_no("pyarrow"), + ), + pytest.param( + parquetfastparquet_responder, + partial(pd.read_parquet, engine="fastparquet"), + # TODO(ArrayManager) fastparquet + marks=[ + td.skip_if_no("fastparquet"), + td.skip_if_no("fsspec"), + td.skip_array_manager_not_yet_implemented, + ], + ), + (pickle_respnder, pd.read_pickle), + (stata_responder, pd.read_stata), + (gz_csv_responder, pd.read_csv), + (gz_json_responder, pd.read_json), + ], +) +@pytest.mark.parametrize( + "storage_options", + [ + None, + {"User-Agent": "foo"}, + {"User-Agent": "foo", "Auth": "bar"}, + ], +) +def test_request_headers(responder, read_method, httpserver, storage_options): + expected = pd.DataFrame({"a": ["b"]}) + default_headers = ["Accept-Encoding", "Host", "Connection", "User-Agent"] + if "gz" in responder.__name__: + extra = {"Content-Encoding": "gzip"} + if storage_options is None: + storage_options = extra + else: + storage_options |= extra + else: + extra = None + expected_headers = set(default_headers).union( + storage_options.keys() if storage_options else [] + ) + httpserver.serve_content(content=responder(expected), headers=extra) + result = read_method(httpserver.url, 
storage_options=storage_options) + tm.assert_frame_equal(result, expected) + + request_headers = dict(httpserver.requests[0].headers) + for header in expected_headers: + exp = request_headers.pop(header) + if storage_options and header in storage_options: + assert exp == storage_options[header] + # No extra headers added + assert not request_headers + + +@pytest.mark.parametrize( + "engine", + [ + "pyarrow", + "fastparquet", + ], +) +def test_to_parquet_to_disk_with_storage_options(engine): + headers = { + "User-Agent": "custom", + "Auth": "other_custom", + } + + pytest.importorskip(engine) + + true_df = pd.DataFrame({"column_name": ["column_value"]}) + msg = ( + "storage_options passed with file object or non-fsspec file path|" + "storage_options passed with buffer, or non-supported URL" + ) + with pytest.raises(ValueError, match=msg): + true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 9ee3c09631d0e..79473895b662d 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -30,15 +30,10 @@ def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): @pytest.mark.single_cpu -def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch, s3so): +def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): # Ensure we can read from a public bucket with credentials # GH 34626 - - # temporary workaround as moto fails for botocore >= 1.11 otherwise, - # see https://github.com/spulec/moto/issues/1924 & 1952 pytest.importorskip("s3fs") - monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") - monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") df = read_csv( f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=5, diff --git a/pandas/tests/io/test_user_agent.py b/pandas/tests/io/test_user_agent.py deleted file mode 100644 index a892e51f2f28d..0000000000000 --- 
a/pandas/tests/io/test_user_agent.py +++ /dev/null @@ -1,403 +0,0 @@ -""" -Tests for the pandas custom headers in http(s) requests -""" -import gzip -import http.server -from io import BytesIO -import multiprocessing -import socket -import time -import urllib.error - -import pytest - -from pandas.compat import is_ci_environment -import pandas.util._test_decorators as td - -import pandas as pd -import pandas._testing as tm - -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment(), - reason="GH 45651: This test can hang in our CI min_versions build", - ), - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), -] - - -class BaseUserAgentResponder(http.server.BaseHTTPRequestHandler): - """ - Base class for setting up a server that can be set up to respond - with a particular file format with accompanying content-type headers. - The interfaces on the different io methods are different enough - that this seemed logical to do. 
- """ - - def start_processing_headers(self): - """ - shared logic at the start of a GET request - """ - self.send_response(200) - self.requested_from_user_agent = self.headers["User-Agent"] - response_df = pd.DataFrame( - { - "header": [self.requested_from_user_agent], - } - ) - return response_df - - def gzip_bytes(self, response_bytes): - """ - some web servers will send back gzipped files to save bandwidth - """ - with BytesIO() as bio: - with gzip.GzipFile(fileobj=bio, mode="w") as zipper: - zipper.write(response_bytes) - response_bytes = bio.getvalue() - return response_bytes - - def write_back_bytes(self, response_bytes): - """ - shared logic at the end of a GET request - """ - self.wfile.write(response_bytes) - - -class CSVUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - - self.send_header("Content-Type", "text/csv") - self.end_headers() - - response_bytes = response_df.to_csv(index=False).encode("utf-8") - self.write_back_bytes(response_bytes) - - -class GzippedCSVUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "text/csv") - self.send_header("Content-Encoding", "gzip") - self.end_headers() - - response_bytes = response_df.to_csv(index=False).encode("utf-8") - response_bytes = self.gzip_bytes(response_bytes) - - self.write_back_bytes(response_bytes) - - -class JSONUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/json") - self.end_headers() - - response_bytes = response_df.to_json().encode("utf-8") - - self.write_back_bytes(response_bytes) - - -class GzippedJSONUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/json") - self.send_header("Content-Encoding", "gzip") - 
self.end_headers() - - response_bytes = response_df.to_json().encode("utf-8") - response_bytes = self.gzip_bytes(response_bytes) - - self.write_back_bytes(response_bytes) - - -class HTMLUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "text/html") - self.end_headers() - - response_bytes = response_df.to_html(index=False).encode("utf-8") - - self.write_back_bytes(response_bytes) - - -class ParquetPyArrowUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - response_bytes = response_df.to_parquet(index=False, engine="pyarrow") - - self.write_back_bytes(response_bytes) - - -class ParquetFastParquetUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - # the fastparquet engine doesn't like to write to a buffer - # it can do it via the open_with function being set appropriately - # however it automatically calls the close method and wipes the buffer - # so just overwrite that attribute on this instance to not do that - - # protected by an importorskip in the respective test - import fsspec - - response_df.to_parquet( - "memory://fastparquet_user_agent.parquet", - index=False, - engine="fastparquet", - compression=None, - ) - with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f: - response_bytes = f.read() - - self.write_back_bytes(response_bytes) - - -class PickleUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - bio = BytesIO() - response_df.to_pickle(bio) - response_bytes = bio.getvalue() - - 
self.write_back_bytes(response_bytes) - - -class StataUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - bio = BytesIO() - response_df.to_stata(bio, write_index=False) - response_bytes = bio.getvalue() - - self.write_back_bytes(response_bytes) - - -class AllHeaderCSVResponder(http.server.BaseHTTPRequestHandler): - """ - Send all request headers back for checking round trip - """ - - def do_GET(self): - response_df = pd.DataFrame(self.headers.items()) - self.send_response(200) - self.send_header("Content-Type", "text/csv") - self.end_headers() - response_bytes = response_df.to_csv(index=False).encode("utf-8") - self.wfile.write(response_bytes) - - -def wait_until_ready(func, *args, **kwargs): - def inner(*args, **kwargs): - while True: - try: - return func(*args, **kwargs) - except urllib.error.URLError: - # Connection refused as http server is starting - time.sleep(0.1) - - return inner - - -def process_server(responder, port): - with http.server.HTTPServer(("localhost", port), responder) as server: - server.handle_request() - server.server_close() - - -@pytest.fixture -def responder(request): - """ - Fixture that starts a local http server in a separate process on localhost - and returns the port. - - Running in a separate process instead of a thread to allow termination/killing - of http server upon cleanup. 
- """ - # Find an available port - with socket.socket() as sock: - sock.bind(("localhost", 0)) - port = sock.getsockname()[1] - - server_process = multiprocessing.Process( - target=process_server, args=(request.param, port) - ) - server_process.start() - yield port - server_process.join(10) - server_process.terminate() - kill_time = 5 - wait_time = 0 - while server_process.is_alive(): - if wait_time > kill_time: - server_process.kill() - break - wait_time += 0.1 - time.sleep(0.1) - server_process.close() - - -@pytest.mark.parametrize( - "responder, read_method, parquet_engine", - [ - (CSVUserAgentResponder, pd.read_csv, None), - (JSONUserAgentResponder, pd.read_json, None), - ( - HTMLUserAgentResponder, - lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0], - None, - ), - (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"), - pytest.param( - ParquetFastParquetUserAgentResponder, - pd.read_parquet, - "fastparquet", - # TODO(ArrayManager) fastparquet - marks=[ - td.skip_array_manager_not_yet_implemented, - ], - ), - (PickleUserAgentResponder, pd.read_pickle, None), - (StataUserAgentResponder, pd.read_stata, None), - (GzippedCSVUserAgentResponder, pd.read_csv, None), - (GzippedJSONUserAgentResponder, pd.read_json, None), - ], - indirect=["responder"], -) -def test_server_and_default_headers(responder, read_method, parquet_engine): - if parquet_engine is not None: - pytest.importorskip(parquet_engine) - if parquet_engine == "fastparquet": - pytest.importorskip("fsspec") - - read_method = wait_until_ready(read_method) - if parquet_engine is None: - df_http = read_method(f"http://localhost:{responder}") - else: - df_http = read_method(f"http://localhost:{responder}", engine=parquet_engine) - - assert not df_http.empty - - -@pytest.mark.parametrize( - "responder, read_method, parquet_engine", - [ - (CSVUserAgentResponder, pd.read_csv, None), - (JSONUserAgentResponder, pd.read_json, None), - ( - HTMLUserAgentResponder, - lambda *args, **kwargs: 
pd.read_html(*args, **kwargs)[0], - None, - ), - (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"), - pytest.param( - ParquetFastParquetUserAgentResponder, - pd.read_parquet, - "fastparquet", - # TODO(ArrayManager) fastparquet - marks=[ - td.skip_array_manager_not_yet_implemented, - ], - ), - (PickleUserAgentResponder, pd.read_pickle, None), - (StataUserAgentResponder, pd.read_stata, None), - (GzippedCSVUserAgentResponder, pd.read_csv, None), - (GzippedJSONUserAgentResponder, pd.read_json, None), - ], - indirect=["responder"], -) -def test_server_and_custom_headers(responder, read_method, parquet_engine): - if parquet_engine is not None: - pytest.importorskip(parquet_engine) - if parquet_engine == "fastparquet": - pytest.importorskip("fsspec") - - custom_user_agent = "Super Cool One" - df_true = pd.DataFrame({"header": [custom_user_agent]}) - - read_method = wait_until_ready(read_method) - if parquet_engine is None: - df_http = read_method( - f"http://localhost:{responder}", - storage_options={"User-Agent": custom_user_agent}, - ) - else: - df_http = read_method( - f"http://localhost:{responder}", - storage_options={"User-Agent": custom_user_agent}, - engine=parquet_engine, - ) - - tm.assert_frame_equal(df_true, df_http) - - -@pytest.mark.parametrize( - "responder, read_method", - [ - (AllHeaderCSVResponder, pd.read_csv), - ], - indirect=["responder"], -) -def test_server_and_all_custom_headers(responder, read_method): - custom_user_agent = "Super Cool One" - custom_auth_token = "Super Secret One" - storage_options = { - "User-Agent": custom_user_agent, - "Auth": custom_auth_token, - } - read_method = wait_until_ready(read_method) - df_http = read_method( - f"http://localhost:{responder}", - storage_options=storage_options, - ) - - df_http = df_http[df_http["0"].isin(storage_options.keys())] - df_http = df_http.sort_values(["0"]).reset_index() - df_http = df_http[["0", "1"]] - - keys = list(storage_options.keys()) - df_true = pd.DataFrame({"0": keys, 
"1": [storage_options[k] for k in keys]}) - df_true = df_true.sort_values(["0"]) - df_true = df_true.reset_index().drop(["index"], axis=1) - - tm.assert_frame_equal(df_true, df_http) - - -@pytest.mark.parametrize( - "engine", - [ - "pyarrow", - "fastparquet", - ], -) -def test_to_parquet_to_disk_with_storage_options(engine): - headers = { - "User-Agent": "custom", - "Auth": "other_custom", - } - - pytest.importorskip(engine) - - true_df = pd.DataFrame({"column_name": ["column_value"]}) - msg = ( - "storage_options passed with file object or non-fsspec file path|" - "storage_options passed with buffer, or non-supported URL" - ) - with pytest.raises(ValueError, match=msg): - true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine) diff --git a/pandas/tests/io/xml/conftest.py b/pandas/tests/io/xml/conftest.py index c88616eb78029..aafda0ff62bbd 100644 --- a/pandas/tests/io/xml/conftest.py +++ b/pandas/tests/io/xml/conftest.py @@ -1,9 +1,11 @@ +from pathlib import Path + import pytest @pytest.fixture -def xml_data_path(tests_io_data_path, datapath): - return tests_io_data_path / "xml" +def xml_data_path(): + return Path(__file__).parent.parent / "data" / "xml" @pytest.fixture diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 84f9cd87db97c..56028249fe517 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -15,6 +15,7 @@ interval_range, period_range, plotting, + read_csv, ) import pandas._testing as tm from pandas.tests.plotting.common import ( @@ -30,6 +31,14 @@ cm = pytest.importorskip("matplotlib.cm") +@pytest.fixture +def iris(datapath) -> DataFrame: + """ + The iris dataset as a DataFrame. 
+ """ + return read_csv(datapath("io", "data", "csv", "iris.csv")) + + @td.skip_if_installed("matplotlib") def test_import_error_message(): # GH-19810 diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py index 90c2a91a22158..1033d908eb22d 100644 --- a/pandas/tests/resample/conftest.py +++ b/pandas/tests/resample/conftest.py @@ -1,5 +1,4 @@ from datetime import datetime -import warnings import numpy as np import pytest @@ -8,8 +7,6 @@ DataFrame, Series, ) -from pandas.core.indexes.datetimes import date_range -from pandas.core.indexes.period import period_range # The various methods we support downsample_methods = [ @@ -44,40 +41,6 @@ def resample_method(request): return request.param -@pytest.fixture -def simple_date_range_series(): - """ - Series with date range index and random data for test purposes. - """ - - def _simple_date_range_series(start, end, freq="D"): - rng = date_range(start, end, freq=freq) - return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - - return _simple_date_range_series - - -@pytest.fixture -def simple_period_range_series(): - """ - Series with period range index and random data for test purposes. 
- """ - - def _simple_period_range_series(start, end, freq="D"): - with warnings.catch_warnings(): - # suppress Period[B] deprecation warning - msg = "|".join(["Period with BDay freq", r"PeriodDtype\[B\] is deprecated"]) - warnings.filterwarnings( - "ignore", - msg, - category=FutureWarning, - ) - rng = period_range(start, end, freq=freq) - return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - - return _simple_period_range_series - - @pytest.fixture def _index_start(): """Fixture for parametrization of index, series and frame.""" diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 2bb114593fcd5..554dc92d8508e 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -53,6 +53,19 @@ def unit(request): return request.param +@pytest.fixture +def simple_date_range_series(): + """ + Series with date range index and random data for test purposes. + """ + + def _simple_date_range_series(start, end, freq="D"): + rng = date_range(start, end, freq=freq) + return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + + return _simple_date_range_series + + def test_custom_grouper(index, unit): dti = index.as_unit(unit) s = Series(np.array([1] * len(dti)), index=dti, dtype="int64") @@ -1208,14 +1221,6 @@ def test_corner_cases(unit): tm.assert_index_equal(result.index, ex_index) -def test_corner_cases_period(simple_period_range_series): - # miscellaneous test coverage - len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] - # it works - result = len0pts.resample("Y-DEC").mean() - assert len(result) == 0 - - def test_corner_cases_date(simple_date_range_series, unit): # resample to periods ts = simple_date_range_series("2000-04-28", "2000-04-30 11:00", freq="h") diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index f3d095bf4b5ed..2e1b0033fd447 100644 --- 
a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1,4 +1,5 @@ from datetime import datetime +import warnings import dateutil import numpy as np @@ -40,6 +41,27 @@ def _series_name(): return "pi" +@pytest.fixture +def simple_period_range_series(): + """ + Series with period range index and random data for test purposes. + """ + + def _simple_period_range_series(start, end, freq="D"): + with warnings.catch_warnings(): + # suppress Period[B] deprecation warning + msg = "|".join(["Period with BDay freq", r"PeriodDtype\[B\] is deprecated"]) + warnings.filterwarnings( + "ignore", + msg, + category=FutureWarning, + ) + rng = period_range(start, end, freq=freq) + return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + + return _simple_period_range_series + + class TestPeriodIndex: @pytest.mark.parametrize("freq", ["2D", "1h", "2h"]) @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) @@ -942,3 +964,11 @@ def test_resample_frequency_ME_QE_error_message(series_and_frame, freq_depr): obj = series_and_frame with pytest.raises(ValueError, match=msg): obj.resample(freq_depr) + + +def test_corner_cases_period(simple_period_range_series): + # miscellaneous test coverage + len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] + # it works + result = len0pts.resample("Y-DEC").mean() + assert len(result) == 0 diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 5df23ec26da35..6ca6fde7c120d 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2666,3 +2666,18 @@ def test_pivot_table_handles_explicit_datetime_types(self): names=["a", "date"], ) tm.assert_index_equal(pivot.index, expected) + + def test_pivot_table_with_margins_and_numeric_column_names(self): + # GH#26568 + df = DataFrame([["a", "x", 1], ["a", "y", 2], ["b", "y", 3], ["b", "z", 4]]) + + result = df.pivot_table( + index=0, columns=1, values=2, 
aggfunc="sum", fill_value=0, margins=True + ) + + expected = DataFrame( + [[1, 2, 0, 3], [0, 3, 4, 7], [1, 5, 4, 10]], + columns=Index(["x", "y", "z", "All"], name=1), + index=Index(["a", "b", "All"], name=0), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/interval/test_arithmetic.py b/pandas/tests/scalar/interval/test_arithmetic.py index 863446c64de42..603763227cb88 100644 --- a/pandas/tests/scalar/interval/test_arithmetic.py +++ b/pandas/tests/scalar/interval/test_arithmetic.py @@ -8,56 +8,185 @@ Timedelta, Timestamp, ) +import pandas._testing as tm -@pytest.mark.parametrize("method", ["__add__", "__sub__"]) -@pytest.mark.parametrize( - "interval", - [ - Interval(Timestamp("2017-01-01 00:00:00"), Timestamp("2018-01-01 00:00:00")), - Interval(Timedelta(days=7), Timedelta(days=14)), - ], -) -@pytest.mark.parametrize( - "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] -) -def test_time_interval_add_subtract_timedelta(interval, delta, method): - # https://github.com/pandas-dev/pandas/issues/32023 - result = getattr(interval, method)(delta) - left = getattr(interval.left, method)(delta) - right = getattr(interval.right, method)(delta) - expected = Interval(left, right) +class TestIntervalArithmetic: + def test_interval_add(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(1, 2, closed=closed) - assert result == expected + result = interval + 1 + assert result == expected + result = 1 + interval + assert result == expected -@pytest.mark.parametrize("interval", [Interval(1, 2), Interval(1.0, 2.0)]) -@pytest.mark.parametrize( - "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] -) -def test_numeric_interval_add_timedelta_raises(interval, delta): - # https://github.com/pandas-dev/pandas/issues/32023 - msg = "|".join( + result = interval + result += 1 + assert result == expected + + msg = r"unsupported operand type\(s\) for \+" + with pytest.raises(TypeError, match=msg): + 
interval + interval + + with pytest.raises(TypeError, match=msg): + interval + "foo" + + def test_interval_sub(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(-1, 0, closed=closed) + + result = interval - 1 + assert result == expected + + result = interval + result -= 1 + assert result == expected + + msg = r"unsupported operand type\(s\) for -" + with pytest.raises(TypeError, match=msg): + interval - interval + + with pytest.raises(TypeError, match=msg): + interval - "foo" + + def test_interval_mult(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(0, 2, closed=closed) + + result = interval * 2 + assert result == expected + + result = 2 * interval + assert result == expected + + result = interval + result *= 2 + assert result == expected + + msg = r"unsupported operand type\(s\) for \*" + with pytest.raises(TypeError, match=msg): + interval * interval + + msg = r"can\'t multiply sequence by non-int" + with pytest.raises(TypeError, match=msg): + interval * "foo" + + def test_interval_div(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(0, 0.5, closed=closed) + + result = interval / 2.0 + assert result == expected + + result = interval + result /= 2.0 + assert result == expected + + msg = r"unsupported operand type\(s\) for /" + with pytest.raises(TypeError, match=msg): + interval / interval + + with pytest.raises(TypeError, match=msg): + interval / "foo" + + def test_interval_floordiv(self, closed): + interval = Interval(1, 2, closed=closed) + expected = Interval(0, 1, closed=closed) + + result = interval // 2 + assert result == expected + + result = interval + result //= 2 + assert result == expected + + msg = r"unsupported operand type\(s\) for //" + with pytest.raises(TypeError, match=msg): + interval // interval + + with pytest.raises(TypeError, match=msg): + interval // "foo" + + @pytest.mark.parametrize("method", ["__add__", "__sub__"]) + @pytest.mark.parametrize( + 
"interval", [ - "unsupported operand", - "cannot use operands", - "Only numeric, Timestamp and Timedelta endpoints are allowed", - ] + Interval( + Timestamp("2017-01-01 00:00:00"), Timestamp("2018-01-01 00:00:00") + ), + Interval(Timedelta(days=7), Timedelta(days=14)), + ], ) - with pytest.raises((TypeError, ValueError), match=msg): - interval + delta + @pytest.mark.parametrize( + "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] + ) + def test_time_interval_add_subtract_timedelta(self, interval, delta, method): + # https://github.com/pandas-dev/pandas/issues/32023 + result = getattr(interval, method)(delta) + left = getattr(interval.left, method)(delta) + right = getattr(interval.right, method)(delta) + expected = Interval(left, right) + + assert result == expected + + @pytest.mark.parametrize("interval", [Interval(1, 2), Interval(1.0, 2.0)]) + @pytest.mark.parametrize( + "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] + ) + def test_numeric_interval_add_timedelta_raises(self, interval, delta): + # https://github.com/pandas-dev/pandas/issues/32023 + msg = "|".join( + [ + "unsupported operand", + "cannot use operands", + "Only numeric, Timestamp and Timedelta endpoints are allowed", + ] + ) + with pytest.raises((TypeError, ValueError), match=msg): + interval + delta + + with pytest.raises((TypeError, ValueError), match=msg): + delta + interval + + @pytest.mark.parametrize("klass", [timedelta, np.timedelta64, Timedelta]) + def test_timedelta_add_timestamp_interval(self, klass): + delta = klass(0) + expected = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01")) + + result = delta + expected + assert result == expected + + result = expected + delta + assert result == expected - with pytest.raises((TypeError, ValueError), match=msg): - delta + interval +class TestIntervalComparisons: + def test_interval_equal(self): + assert Interval(0, 1) == Interval(0, 1, closed="right") + assert Interval(0, 1) != Interval(0, 1, 
closed="left") + assert Interval(0, 1) != 0 -@pytest.mark.parametrize("klass", [timedelta, np.timedelta64, Timedelta]) -def test_timedelta_add_timestamp_interval(klass): - delta = klass(0) - expected = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01")) + def test_interval_comparison(self): + msg = ( + "'<' not supported between instances of " + "'pandas._libs.interval.Interval' and 'int'" + ) + with pytest.raises(TypeError, match=msg): + Interval(0, 1) < 2 - result = delta + expected - assert result == expected + assert Interval(0, 1) < Interval(1, 2) + assert Interval(0, 1) < Interval(0, 2) + assert Interval(0, 1) < Interval(0.5, 1.5) + assert Interval(0, 1) <= Interval(0, 1) + assert Interval(0, 1) > Interval(-1, 2) + assert Interval(0, 1) >= Interval(0, 1) - result = expected + delta - assert result == expected + def test_equality_comparison_broadcasts_over_array(self): + # https://github.com/pandas-dev/pandas/issues/35931 + interval = Interval(0, 1) + arr = np.array([interval, interval]) + result = interval == arr + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/scalar/interval/test_constructors.py b/pandas/tests/scalar/interval/test_constructors.py new file mode 100644 index 0000000000000..a4bc00b923434 --- /dev/null +++ b/pandas/tests/scalar/interval/test_constructors.py @@ -0,0 +1,51 @@ +import pytest + +from pandas import ( + Interval, + Period, + Timestamp, +) + + +class TestIntervalConstructors: + @pytest.mark.parametrize( + "left, right", + [ + ("a", "z"), + (("a", "b"), ("c", "d")), + (list("AB"), list("ab")), + (Interval(0, 1), Interval(1, 2)), + (Period("2018Q1", freq="Q"), Period("2018Q1", freq="Q")), + ], + ) + def test_construct_errors(self, left, right): + # GH#23013 + msg = "Only numeric, Timestamp and Timedelta endpoints are allowed" + with pytest.raises(ValueError, match=msg): + Interval(left, right) + + def test_constructor_errors(self): + msg = "invalid option for 'closed': 
foo" + with pytest.raises(ValueError, match=msg): + Interval(0, 1, closed="foo") + + msg = "left side of interval must be <= right side" + with pytest.raises(ValueError, match=msg): + Interval(1, 0) + + @pytest.mark.parametrize( + "tz_left, tz_right", [(None, "UTC"), ("UTC", None), ("UTC", "US/Eastern")] + ) + def test_constructor_errors_tz(self, tz_left, tz_right): + # GH#18538 + left = Timestamp("2017-01-01", tz=tz_left) + right = Timestamp("2017-01-02", tz=tz_right) + + if tz_left is None or tz_right is None: + error = TypeError + msg = "Cannot compare tz-naive and tz-aware timestamps" + else: + error = ValueError + msg = "left and right must have the same time zone" + with pytest.raises(error, match=msg): + Interval(left, right) diff --git a/pandas/tests/scalar/interval/test_contains.py b/pandas/tests/scalar/interval/test_contains.py new file mode 100644 index 0000000000000..8dfca117a658b --- /dev/null +++ b/pandas/tests/scalar/interval/test_contains.py @@ -0,0 +1,73 @@ +import pytest + +from pandas import ( + Interval, + Timedelta, + Timestamp, +) + + +class TestContains: + def test_contains(self): + interval = Interval(0, 1) + assert 0.5 in interval + assert 1 in interval + assert 0 not in interval + + interval_both = Interval(0, 1, "both") + assert 0 in interval_both + assert 1 in interval_both + + interval_neither = Interval(0, 1, closed="neither") + assert 0 not in interval_neither + assert 0.5 in interval_neither + assert 1 not in interval_neither + + def test_contains_interval(self, inclusive_endpoints_fixture): + interval1 = Interval(0, 1, "both") + interval2 = Interval(0, 1, inclusive_endpoints_fixture) + assert interval1 in interval1 + assert interval2 in interval2 + assert interval2 in interval1 + assert interval1 not in interval2 or inclusive_endpoints_fixture == "both" + + def test_contains_infinite_length(self): + interval1 = Interval(0, 1, "both") + interval2 = Interval(float("-inf"), float("inf"), "neither") + assert interval1 in interval2 + 
assert interval2 not in interval1 + + def test_contains_zero_length(self): + interval1 = Interval(0, 1, "both") + interval2 = Interval(-1, -1, "both") + interval3 = Interval(0.5, 0.5, "both") + assert interval2 not in interval1 + assert interval3 in interval1 + assert interval2 not in interval3 and interval3 not in interval2 + assert interval1 not in interval2 and interval1 not in interval3 + + @pytest.mark.parametrize( + "type1", + [ + (0, 1), + (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), + (Timedelta("0h"), Timedelta("1h")), + ], + ) + @pytest.mark.parametrize( + "type2", + [ + (0, 1), + (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), + (Timedelta("0h"), Timedelta("1h")), + ], + ) + def test_contains_mixed_types(self, type1, type2): + interval1 = Interval(*type1) + interval2 = Interval(*type2) + if type1 == type2: + assert interval1 in interval2 + else: + msg = "^'<=' not supported between instances of" + with pytest.raises(TypeError, match=msg): + interval1 in interval2 diff --git a/pandas/tests/scalar/interval/test_formats.py b/pandas/tests/scalar/interval/test_formats.py new file mode 100644 index 0000000000000..6bf7aa91df3ce --- /dev/null +++ b/pandas/tests/scalar/interval/test_formats.py @@ -0,0 +1,11 @@ +from pandas import Interval + + +def test_interval_repr(): + interval = Interval(0, 1) + assert repr(interval) == "Interval(0, 1, closed='right')" + assert str(interval) == "(0, 1]" + + interval_left = Interval(0, 1, closed="left") + assert repr(interval_left) == "Interval(0, 1, closed='left')" + assert str(interval_left) == "[0, 1)" diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index 4841c488a5768..91b31e82f9c52 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -3,12 +3,9 @@ from pandas import ( Interval, - Period, Timedelta, Timestamp, ) -import pandas._testing as tm -import pandas.core.common as com @pytest.fixture 
@@ -23,48 +20,6 @@ def test_properties(self, interval): assert interval.right == 1 assert interval.mid == 0.5 - def test_repr(self, interval): - assert repr(interval) == "Interval(0, 1, closed='right')" - assert str(interval) == "(0, 1]" - - interval_left = Interval(0, 1, closed="left") - assert repr(interval_left) == "Interval(0, 1, closed='left')" - assert str(interval_left) == "[0, 1)" - - def test_contains(self, interval): - assert 0.5 in interval - assert 1 in interval - assert 0 not in interval - - interval_both = Interval(0, 1, "both") - assert 0 in interval_both - assert 1 in interval_both - - interval_neither = Interval(0, 1, closed="neither") - assert 0 not in interval_neither - assert 0.5 in interval_neither - assert 1 not in interval_neither - - def test_equal(self): - assert Interval(0, 1) == Interval(0, 1, closed="right") - assert Interval(0, 1) != Interval(0, 1, closed="left") - assert Interval(0, 1) != 0 - - def test_comparison(self): - msg = ( - "'<' not supported between instances of " - "'pandas._libs.interval.Interval' and 'int'" - ) - with pytest.raises(TypeError, match=msg): - Interval(0, 1) < 2 - - assert Interval(0, 1) < Interval(1, 2) - assert Interval(0, 1) < Interval(0, 2) - assert Interval(0, 1) < Interval(0.5, 1.5) - assert Interval(0, 1) <= Interval(0, 1) - assert Interval(0, 1) > Interval(-1, 2) - assert Interval(0, 1) >= Interval(0, 1) - def test_hash(self, interval): # should not raise hash(interval) @@ -130,150 +85,3 @@ def test_is_empty(self, left, right, closed): result = iv.is_empty expected = closed != "both" assert result is expected - - @pytest.mark.parametrize( - "left, right", - [ - ("a", "z"), - (("a", "b"), ("c", "d")), - (list("AB"), list("ab")), - (Interval(0, 1), Interval(1, 2)), - (Period("2018Q1", freq="Q"), Period("2018Q1", freq="Q")), - ], - ) - def test_construct_errors(self, left, right): - # GH 23013 - msg = "Only numeric, Timestamp and Timedelta endpoints are allowed" - with pytest.raises(ValueError, 
match=msg): - Interval(left, right) - - def test_math_add(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(1, 2, closed=closed) - - result = interval + 1 - assert result == expected - - result = 1 + interval - assert result == expected - - result = interval - result += 1 - assert result == expected - - msg = r"unsupported operand type\(s\) for \+" - with pytest.raises(TypeError, match=msg): - interval + interval - - with pytest.raises(TypeError, match=msg): - interval + "foo" - - def test_math_sub(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(-1, 0, closed=closed) - - result = interval - 1 - assert result == expected - - result = interval - result -= 1 - assert result == expected - - msg = r"unsupported operand type\(s\) for -" - with pytest.raises(TypeError, match=msg): - interval - interval - - with pytest.raises(TypeError, match=msg): - interval - "foo" - - def test_math_mult(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(0, 2, closed=closed) - - result = interval * 2 - assert result == expected - - result = 2 * interval - assert result == expected - - result = interval - result *= 2 - assert result == expected - - msg = r"unsupported operand type\(s\) for \*" - with pytest.raises(TypeError, match=msg): - interval * interval - - msg = r"can\'t multiply sequence by non-int" - with pytest.raises(TypeError, match=msg): - interval * "foo" - - def test_math_div(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(0, 0.5, closed=closed) - - result = interval / 2.0 - assert result == expected - - result = interval - result /= 2.0 - assert result == expected - - msg = r"unsupported operand type\(s\) for /" - with pytest.raises(TypeError, match=msg): - interval / interval - - with pytest.raises(TypeError, match=msg): - interval / "foo" - - def test_math_floordiv(self, closed): - interval = Interval(1, 2, closed=closed) - expected = Interval(0, 1, 
closed=closed) - - result = interval // 2 - assert result == expected - - result = interval - result //= 2 - assert result == expected - - msg = r"unsupported operand type\(s\) for //" - with pytest.raises(TypeError, match=msg): - interval // interval - - with pytest.raises(TypeError, match=msg): - interval // "foo" - - def test_constructor_errors(self): - msg = "invalid option for 'closed': foo" - with pytest.raises(ValueError, match=msg): - Interval(0, 1, closed="foo") - - msg = "left side of interval must be <= right side" - with pytest.raises(ValueError, match=msg): - Interval(1, 0) - - @pytest.mark.parametrize( - "tz_left, tz_right", [(None, "UTC"), ("UTC", None), ("UTC", "US/Eastern")] - ) - def test_constructor_errors_tz(self, tz_left, tz_right): - # GH 18538 - left = Timestamp("2017-01-01", tz=tz_left) - right = Timestamp("2017-01-02", tz=tz_right) - - if com.any_none(tz_left, tz_right): - error = TypeError - msg = "Cannot compare tz-naive and tz-aware timestamps" - else: - error = ValueError - msg = "left and right must have the same time zone" - with pytest.raises(error, match=msg): - Interval(left, right) - - def test_equality_comparison_broadcasts_over_array(self): - # https://github.com/pandas-dev/pandas/issues/35931 - interval = Interval(0, 1) - arr = np.array([interval, interval]) - result = interval == arr - expected = np.array([True, True]) - tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/scalar/interval/test_ops.py b/pandas/tests/scalar/interval/test_overlaps.py similarity index 54% rename from pandas/tests/scalar/interval/test_ops.py rename to pandas/tests/scalar/interval/test_overlaps.py index 92db6ac772830..7fcf59d7bb4af 100644 --- a/pandas/tests/scalar/interval/test_ops.py +++ b/pandas/tests/scalar/interval/test_overlaps.py @@ -1,4 +1,3 @@ -"""Tests for Interval-Interval operations, such as overlaps, contains, etc.""" import pytest from pandas import ( @@ -66,54 +65,3 @@ def test_overlaps_invalid_type(self, other): msg 
= f"`other` must be an Interval, got {type(other).__name__}" with pytest.raises(TypeError, match=msg): interval.overlaps(other) - - -class TestContains: - def test_contains_interval(self, inclusive_endpoints_fixture): - interval1 = Interval(0, 1, "both") - interval2 = Interval(0, 1, inclusive_endpoints_fixture) - assert interval1 in interval1 - assert interval2 in interval2 - assert interval2 in interval1 - assert interval1 not in interval2 or inclusive_endpoints_fixture == "both" - - def test_contains_infinite_length(self): - interval1 = Interval(0, 1, "both") - interval2 = Interval(float("-inf"), float("inf"), "neither") - assert interval1 in interval2 - assert interval2 not in interval1 - - def test_contains_zero_length(self): - interval1 = Interval(0, 1, "both") - interval2 = Interval(-1, -1, "both") - interval3 = Interval(0.5, 0.5, "both") - assert interval2 not in interval1 - assert interval3 in interval1 - assert interval2 not in interval3 and interval3 not in interval2 - assert interval1 not in interval2 and interval1 not in interval3 - - @pytest.mark.parametrize( - "type1", - [ - (0, 1), - (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), - (Timedelta("0h"), Timedelta("1h")), - ], - ) - @pytest.mark.parametrize( - "type2", - [ - (0, 1), - (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), - (Timedelta("0h"), Timedelta("1h")), - ], - ) - def test_contains_mixed_types(self, type1, type2): - interval1 = Interval(*type1) - interval2 = Interval(*type2) - if type1 == type2: - assert interval1 in interval2 - else: - msg = "^'<=' not supported between instances of" - with pytest.raises(TypeError, match=msg): - interval1 in interval2 diff --git a/pandas/tests/scalar/period/test_arithmetic.py b/pandas/tests/scalar/period/test_arithmetic.py new file mode 100644 index 0000000000000..5dc0858de466c --- /dev/null +++ b/pandas/tests/scalar/period/test_arithmetic.py @@ -0,0 +1,486 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from 
pandas._libs.tslibs.period import IncompatibleFrequency + +from pandas import ( + NaT, + Period, + Timedelta, + Timestamp, + offsets, +) + + +class TestPeriodArithmetic: + def test_add_overflow_raises(self): + # GH#55503 + per = Timestamp.max.to_period("ns") + + msg = "|".join( + [ + "Python int too large to convert to C long", + # windows, 32bit linux builds + "int too big to convert", + ] + ) + with pytest.raises(OverflowError, match=msg): + per + 1 + + msg = "value too large" + with pytest.raises(OverflowError, match=msg): + per + Timedelta(1) + with pytest.raises(OverflowError, match=msg): + per + offsets.Nano(1) + + def test_period_add_integer(self): + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) + assert per1 + 1 == per2 + assert 1 + per1 == per2 + + def test_period_add_invalid(self): + # GH#4731 + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) + + msg = "|".join( + [ + r"unsupported operand type\(s\)", + "can only concatenate str", + "must be str, not Period", + ] + ) + with pytest.raises(TypeError, match=msg): + per1 + "str" + with pytest.raises(TypeError, match=msg): + "str" + per1 + with pytest.raises(TypeError, match=msg): + per1 + per2 + + def test_period_sub_period_annual(self): + left, right = Period("2011", freq="Y"), Period("2007", freq="Y") + result = left - right + assert result == 4 * right.freq + + msg = r"Input has different freq=M from Period\(freq=Y-DEC\)" + with pytest.raises(IncompatibleFrequency, match=msg): + left - Period("2007-01", freq="M") + + def test_period_sub_period(self): + per1 = Period("2011-01-01", freq="D") + per2 = Period("2011-01-15", freq="D") + + off = per1.freq + assert per1 - per2 == -14 * off + assert per2 - per1 == 14 * off + + msg = r"Input has different freq=M from Period\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): + per1 - Period("2011-02", freq="M") + + 
@pytest.mark.parametrize("n", [1, 2, 3, 4]) + def test_sub_n_gt_1_ticks(self, tick_classes, n): + # GH#23878 + p1 = Period("19910905", freq=tick_classes(n)) + p2 = Period("19920406", freq=tick_classes(n)) + + expected = Period(str(p2), freq=p2.freq.base) - Period( + str(p1), freq=p1.freq.base + ) + + assert (p2 - p1) == expected + + @pytest.mark.parametrize("normalize", [True, False]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + @pytest.mark.parametrize( + "offset, kwd_name", + [ + (offsets.YearEnd, "month"), + (offsets.QuarterEnd, "startingMonth"), + (offsets.MonthEnd, None), + (offsets.Week, "weekday"), + ], + ) + def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): + # GH#23878 + kwds = {kwd_name: 3} if kwd_name is not None else {} + p1_d = "19910905" + p2_d = "19920406" + p1 = Period(p1_d, freq=offset(n, normalize, **kwds)) + p2 = Period(p2_d, freq=offset(n, normalize, **kwds)) + + expected = Period(p2_d, freq=p2.freq.base) - Period(p1_d, freq=p1.freq.base) + + assert (p2 - p1) == expected + + def test_period_add_offset(self): + # freq is DateOffset + for freq in ["Y", "2Y", "3Y"]: + per = Period("2011", freq=freq) + exp = Period("2013", freq=freq) + assert per + offsets.YearEnd(2) == exp + assert offsets.YearEnd(2) + per == exp + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + for freq in ["M", "2M", "3M"]: + per = Period("2011-03", freq=freq) + exp = Period("2011-05", freq=freq) + assert per + offsets.MonthEnd(2) == exp + assert offsets.MonthEnd(2) + per == exp + + exp = Period("2012-03", freq=freq) + assert per + offsets.MonthEnd(12) == exp + assert offsets.MonthEnd(12) + per == exp + + msg = "|".join( + [ + "Input has different freq", + "Input 
cannot be converted to Period", + ] + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + # freq is Tick + for freq in ["D", "2D", "3D"]: + per = Period("2011-04-01", freq=freq) + + exp = Period("2011-04-06", freq=freq) + assert per + offsets.Day(5) == exp + assert offsets.Day(5) + per == exp + + exp = Period("2011-04-02", freq=freq) + assert per + offsets.Hour(24) == exp + assert offsets.Hour(24) + per == exp + + exp = Period("2011-04-03", freq=freq) + assert per + np.timedelta64(2, "D") == exp + assert np.timedelta64(2, "D") + per == exp + + exp = Period("2011-04-02", freq=freq) + assert per + np.timedelta64(3600 * 24, "s") == exp + assert np.timedelta64(3600 * 24, "s") + per == exp + + exp = Period("2011-03-30", freq=freq) + assert per + timedelta(-2) == exp + assert timedelta(-2) + per == exp + + exp = Period("2011-04-03", freq=freq) + assert per + timedelta(hours=48) == exp + assert timedelta(hours=48) + per == exp + + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + for freq in ["h", "2h", "3h"]: + per = Period("2011-04-01 09:00", freq=freq) + + exp = Period("2011-04-03 09:00", freq=freq) + assert per + offsets.Day(2) == exp + assert offsets.Day(2) + per == exp + + exp = Period("2011-04-01 12:00", freq=freq) + assert per + offsets.Hour(3) == exp + assert offsets.Hour(3) + per == exp + + msg = "cannot use operands with types" + exp = Period("2011-04-01 12:00", freq=freq) + assert per + np.timedelta64(3, "h") == 
exp + assert np.timedelta64(3, "h") + per == exp + + exp = Period("2011-04-01 10:00", freq=freq) + assert per + np.timedelta64(3600, "s") == exp + assert np.timedelta64(3600, "s") + per == exp + + exp = Period("2011-04-01 11:00", freq=freq) + assert per + timedelta(minutes=120) == exp + assert timedelta(minutes=120) + per == exp + + exp = Period("2011-04-05 12:00", freq=freq) + assert per + timedelta(days=4, minutes=180) == exp + assert timedelta(days=4, minutes=180) + per == exp + + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + def test_period_sub_offset(self): + # freq is DateOffset + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for freq in ["Y", "2Y", "3Y"]: + per = Period("2011", freq=freq) + assert per - offsets.YearEnd(2) == Period("2009", freq=freq) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per - off + + for freq in ["M", "2M", "3M"]: + per = Period("2011-03", freq=freq) + assert per - offsets.MonthEnd(2) == Period("2011-01", freq=freq) + assert per - offsets.MonthEnd(12) == Period("2010-03", freq=freq) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per - off + + # freq is Tick + for freq in ["D", "2D", "3D"]: + per = Period("2011-04-01", freq=freq) + assert per - offsets.Day(5) == Period("2011-03-27", freq=freq) + assert per - offsets.Hour(24) == 
Period("2011-03-31", freq=freq) + assert per - np.timedelta64(2, "D") == Period("2011-03-30", freq=freq) + assert per - np.timedelta64(3600 * 24, "s") == Period( + "2011-03-31", freq=freq + ) + assert per - timedelta(-2) == Period("2011-04-03", freq=freq) + assert per - timedelta(hours=48) == Period("2011-03-30", freq=freq) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per - off + + for freq in ["h", "2h", "3h"]: + per = Period("2011-04-01 09:00", freq=freq) + assert per - offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) + assert per - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) + assert per - np.timedelta64(3, "h") == Period("2011-04-01 06:00", freq=freq) + assert per - np.timedelta64(3600, "s") == Period( + "2011-04-01 08:00", freq=freq + ) + assert per - timedelta(minutes=120) == Period("2011-04-01 07:00", freq=freq) + assert per - timedelta(days=4, minutes=180) == Period( + "2011-03-28 06:00", freq=freq + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per - off + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_period_addsub_nat(self, freq): + # GH#13071 + per = Period("2011-01", freq=freq) + + # For subtraction, NaT is treated as another Period object + assert NaT - per is NaT + assert per - NaT is NaT + + # For addition, NaT is treated as offset-like + assert NaT + per is NaT + assert per + NaT is NaT + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "m"]) + def test_period_add_sub_td64_nat(self, unit): + # GH#47196 + per = Period("2022-06-01", "D") + nat = np.timedelta64("NaT", unit) + + assert per + nat is NaT + assert nat + per is NaT + assert per - nat is NaT + + with pytest.raises(TypeError, 
match="unsupported operand"): + nat - per + + def test_period_ops_offset(self): + per = Period("2011-04-01", freq="D") + result = per + offsets.Day() + exp = Period("2011-04-02", freq="D") + assert result == exp + + result = per - offsets.Day(2) + exp = Period("2011-03-30", freq="D") + assert result == exp + + msg = r"Input cannot be converted to Period\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): + per + offsets.Hour(2) + + with pytest.raises(IncompatibleFrequency, match=msg): + per - offsets.Hour(2) + + def test_period_add_timestamp_raises(self): + # GH#17983 + ts = Timestamp("2017") + per = Period("2017", freq="M") + + msg = r"unsupported operand type\(s\) for \+: 'Timestamp' and 'Period'" + with pytest.raises(TypeError, match=msg): + ts + per + + msg = r"unsupported operand type\(s\) for \+: 'Period' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): + per + ts + + +class TestPeriodComparisons: + def test_period_comparison_same_freq(self): + jan = Period("2000-01", "M") + feb = Period("2000-02", "M") + + assert not jan == feb + assert jan != feb + assert jan < feb + assert jan <= feb + assert not jan > feb + assert not jan >= feb + + def test_period_comparison_same_period_different_object(self): + # Separate Period objects for the same period + left = Period("2000-01", "M") + right = Period("2000-01", "M") + + assert left == right + assert left >= right + assert left <= right + assert not left < right + assert not left > right + + def test_period_comparison_mismatched_freq(self): + jan = Period("2000-01", "M") + day = Period("2012-01-01", "D") + + assert not jan == day + assert jan != day + msg = r"Input has different freq=D from Period\(freq=M\)" + with pytest.raises(IncompatibleFrequency, match=msg): + jan < day + with pytest.raises(IncompatibleFrequency, match=msg): + jan <= day + with pytest.raises(IncompatibleFrequency, match=msg): + jan > day + with pytest.raises(IncompatibleFrequency, match=msg): + jan >= day + + def 
test_period_comparison_invalid_type(self): + jan = Period("2000-01", "M") + + assert not jan == 1 + assert jan != 1 + + int_or_per = "'(Period|int)'" + msg = f"not supported between instances of {int_or_per} and {int_or_per}" + for left, right in [(jan, 1), (1, jan)]: + with pytest.raises(TypeError, match=msg): + left > right + with pytest.raises(TypeError, match=msg): + left >= right + with pytest.raises(TypeError, match=msg): + left < right + with pytest.raises(TypeError, match=msg): + left <= right + + def test_period_comparison_nat(self): + per = Period("2011-01-01", freq="D") + + ts = Timestamp("2011-01-01") + # confirm Period('NaT') work identical with Timestamp('NaT') + for left, right in [ + (NaT, per), + (per, NaT), + (NaT, ts), + (ts, NaT), + ]: + assert not left < right + assert not left > right + assert not left == right + assert left != right + assert not left <= right + assert not left >= right + + @pytest.mark.parametrize( + "zerodim_arr, expected", + ((np.array(0), False), (np.array(Period("2000-01", "M")), True)), + ) + def test_period_comparison_numpy_zerodim_arr(self, zerodim_arr, expected): + per = Period("2000-01", "M") + + assert (per == zerodim_arr) is expected + assert (zerodim_arr == per) is expected diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index f466804fe0814..3e91264fdb3b1 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -17,12 +17,8 @@ ) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import DateParseError -from pandas._libs.tslibs.period import ( - INVALID_FREQ_ERR_MSG, - IncompatibleFrequency, -) +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG -import pandas as pd from pandas import ( NaT, Period, @@ -1086,62 +1082,6 @@ def test_get_period_field_array_raises_on_out_of_range(self): class TestPeriodComparisons: - def 
test_comparison_same_period_different_object(self): - # Separate Period objects for the same period - left = Period("2000-01", "M") - right = Period("2000-01", "M") - - assert left == right - assert left >= right - assert left <= right - assert not left < right - assert not left > right - - def test_comparison_same_freq(self): - jan = Period("2000-01", "M") - feb = Period("2000-02", "M") - - assert not jan == feb - assert jan != feb - assert jan < feb - assert jan <= feb - assert not jan > feb - assert not jan >= feb - - def test_comparison_mismatched_freq(self): - jan = Period("2000-01", "M") - day = Period("2012-01-01", "D") - - assert not jan == day - assert jan != day - msg = r"Input has different freq=D from Period\(freq=M\)" - with pytest.raises(IncompatibleFrequency, match=msg): - jan < day - with pytest.raises(IncompatibleFrequency, match=msg): - jan <= day - with pytest.raises(IncompatibleFrequency, match=msg): - jan > day - with pytest.raises(IncompatibleFrequency, match=msg): - jan >= day - - def test_comparison_invalid_type(self): - jan = Period("2000-01", "M") - - assert not jan == 1 - assert jan != 1 - - int_or_per = "'(Period|int)'" - msg = f"not supported between instances of {int_or_per} and {int_or_per}" - for left, right in [(jan, 1), (1, jan)]: - with pytest.raises(TypeError, match=msg): - left > right - with pytest.raises(TypeError, match=msg): - left >= right - with pytest.raises(TypeError, match=msg): - left < right - with pytest.raises(TypeError, match=msg): - left <= right - def test_sort_periods(self): jan = Period("2000-01", "M") feb = Period("2000-02", "M") @@ -1150,442 +1090,6 @@ def test_sort_periods(self): correctPeriods = [jan, feb, mar] assert sorted(periods) == correctPeriods - def test_period_cmp_nat(self): - p = Period("2011-01-01", freq="D") - - t = Timestamp("2011-01-01") - # confirm Period('NaT') work identical with Timestamp('NaT') - for left, right in [ - (NaT, p), - (p, NaT), - (NaT, t), - (t, NaT), - ]: - assert not left < 
right - assert not left > right - assert not left == right - assert left != right - assert not left <= right - assert not left >= right - - @pytest.mark.parametrize( - "zerodim_arr, expected", - ((np.array(0), False), (np.array(Period("2000-01", "M")), True)), - ) - def test_comparison_numpy_zerodim_arr(self, zerodim_arr, expected): - p = Period("2000-01", "M") - - assert (p == zerodim_arr) is expected - assert (zerodim_arr == p) is expected - - -class TestArithmetic: - def test_add_overflow_raises(self): - # GH#55503 - per = Timestamp.max.to_period("ns") - - msg = "|".join( - [ - "Python int too large to convert to C long", - # windows, 32bit linux builds - "int too big to convert", - ] - ) - with pytest.raises(OverflowError, match=msg): - per + 1 - - msg = "value too large" - with pytest.raises(OverflowError, match=msg): - per + Timedelta(1) - with pytest.raises(OverflowError, match=msg): - per + offsets.Nano(1) - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "m"]) - def test_add_sub_td64_nat(self, unit): - # GH#47196 - per = Period("2022-06-01", "D") - nat = np.timedelta64("NaT", unit) - - assert per + nat is NaT - assert nat + per is NaT - assert per - nat is NaT - - with pytest.raises(TypeError, match="unsupported operand"): - nat - per - - def test_sub_delta(self): - left, right = Period("2011", freq="Y"), Period("2007", freq="Y") - result = left - right - assert result == 4 * right.freq - - msg = r"Input has different freq=M from Period\(freq=Y-DEC\)" - with pytest.raises(IncompatibleFrequency, match=msg): - left - Period("2007-01", freq="M") - - def test_add_integer(self): - per1 = Period(freq="D", year=2008, month=1, day=1) - per2 = Period(freq="D", year=2008, month=1, day=2) - assert per1 + 1 == per2 - assert 1 + per1 == per2 - - def test_add_sub_nat(self): - # GH#13071 - p = Period("2011-01", freq="M") - assert p + NaT is NaT - assert NaT + p is NaT - assert p - NaT is NaT - assert NaT - p is NaT - - def test_add_invalid(self): - # GH#4731 - 
per1 = Period(freq="D", year=2008, month=1, day=1) - per2 = Period(freq="D", year=2008, month=1, day=2) - - msg = "|".join( - [ - r"unsupported operand type\(s\)", - "can only concatenate str", - "must be str, not Period", - ] - ) - with pytest.raises(TypeError, match=msg): - per1 + "str" - with pytest.raises(TypeError, match=msg): - "str" + per1 - with pytest.raises(TypeError, match=msg): - per1 + per2 - - boxes = [lambda x: x, lambda x: pd.Series([x]), lambda x: pd.Index([x])] - ids = ["identity", "Series", "Index"] - - @pytest.mark.parametrize("lbox", boxes, ids=ids) - @pytest.mark.parametrize("rbox", boxes, ids=ids) - def test_add_timestamp_raises(self, rbox, lbox): - # GH#17983 - ts = Timestamp("2017") - per = Period("2017", freq="M") - - # We may get a different message depending on which class raises - # the error. - msg = "|".join( - [ - "cannot add", - "unsupported operand", - "can only operate on a", - "incompatible type", - "ufunc add cannot use operands", - ] - ) - with pytest.raises(TypeError, match=msg): - lbox(ts) + rbox(per) - - with pytest.raises(TypeError, match=msg): - lbox(per) + rbox(ts) - - with pytest.raises(TypeError, match=msg): - lbox(per) + rbox(per) - - def test_sub(self): - per1 = Period("2011-01-01", freq="D") - per2 = Period("2011-01-15", freq="D") - - off = per1.freq - assert per1 - per2 == -14 * off - assert per2 - per1 == 14 * off - - msg = r"Input has different freq=M from Period\(freq=D\)" - with pytest.raises(IncompatibleFrequency, match=msg): - per1 - Period("2011-02", freq="M") - - @pytest.mark.parametrize("n", [1, 2, 3, 4]) - def test_sub_n_gt_1_ticks(self, tick_classes, n): - # GH 23878 - p1 = Period("19910905", freq=tick_classes(n)) - p2 = Period("19920406", freq=tick_classes(n)) - - expected = Period(str(p2), freq=p2.freq.base) - Period( - str(p1), freq=p1.freq.base - ) - - assert (p2 - p1) == expected - - @pytest.mark.parametrize("normalize", [True, False]) - @pytest.mark.parametrize("n", [1, 2, 3, 4]) - 
@pytest.mark.parametrize( - "offset, kwd_name", - [ - (offsets.YearEnd, "month"), - (offsets.QuarterEnd, "startingMonth"), - (offsets.MonthEnd, None), - (offsets.Week, "weekday"), - ], - ) - def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): - # GH 23878 - kwds = {kwd_name: 3} if kwd_name is not None else {} - p1_d = "19910905" - p2_d = "19920406" - p1 = Period(p1_d, freq=offset(n, normalize, **kwds)) - p2 = Period(p2_d, freq=offset(n, normalize, **kwds)) - - expected = Period(p2_d, freq=p2.freq.base) - Period(p1_d, freq=p1.freq.base) - - assert (p2 - p1) == expected - - def test_add_offset(self): - # freq is DateOffset - for freq in ["Y", "2Y", "3Y"]: - p = Period("2011", freq=freq) - exp = Period("2013", freq=freq) - assert p + offsets.YearEnd(2) == exp - assert offsets.YearEnd(2) + p == exp - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - msg = "Input has different freq|Input cannot be converted to Period" - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - for freq in ["M", "2M", "3M"]: - p = Period("2011-03", freq=freq) - exp = Period("2011-05", freq=freq) - assert p + offsets.MonthEnd(2) == exp - assert offsets.MonthEnd(2) + p == exp - - exp = Period("2012-03", freq=freq) - assert p + offsets.MonthEnd(12) == exp - assert offsets.MonthEnd(12) + p == exp - - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - # freq is Tick - for freq in ["D", "2D", "3D"]: - p = Period("2011-04-01", freq=freq) - - exp = Period("2011-04-06", freq=freq) - assert p + offsets.Day(5) == exp - 
assert offsets.Day(5) + p == exp - - exp = Period("2011-04-02", freq=freq) - assert p + offsets.Hour(24) == exp - assert offsets.Hour(24) + p == exp - - exp = Period("2011-04-03", freq=freq) - assert p + np.timedelta64(2, "D") == exp - assert np.timedelta64(2, "D") + p == exp - - exp = Period("2011-04-02", freq=freq) - assert p + np.timedelta64(3600 * 24, "s") == exp - assert np.timedelta64(3600 * 24, "s") + p == exp - - exp = Period("2011-03-30", freq=freq) - assert p + timedelta(-2) == exp - assert timedelta(-2) + p == exp - - exp = Period("2011-04-03", freq=freq) - assert p + timedelta(hours=48) == exp - assert timedelta(hours=48) + p == exp - - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(4, "h"), - timedelta(hours=23), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - for freq in ["h", "2h", "3h"]: - p = Period("2011-04-01 09:00", freq=freq) - - exp = Period("2011-04-03 09:00", freq=freq) - assert p + offsets.Day(2) == exp - assert offsets.Day(2) + p == exp - - exp = Period("2011-04-01 12:00", freq=freq) - assert p + offsets.Hour(3) == exp - assert offsets.Hour(3) + p == exp - - msg = "cannot use operands with types" - exp = Period("2011-04-01 12:00", freq=freq) - assert p + np.timedelta64(3, "h") == exp - assert np.timedelta64(3, "h") + p == exp - - exp = Period("2011-04-01 10:00", freq=freq) - assert p + np.timedelta64(3600, "s") == exp - assert np.timedelta64(3600, "s") + p == exp - - exp = Period("2011-04-01 11:00", freq=freq) - assert p + timedelta(minutes=120) == exp - assert timedelta(minutes=120) + p == exp - - exp = Period("2011-04-05 12:00", freq=freq) - assert p + timedelta(days=4, minutes=180) == exp - assert timedelta(days=4, minutes=180) + p == exp - - msg = "|".join( - [ - "Input has different freq", - 
"Input cannot be converted to Period", - ] - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(3200, "s"), - timedelta(hours=23, minutes=30), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - def test_sub_offset(self): - # freq is DateOffset - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for freq in ["Y", "2Y", "3Y"]: - p = Period("2011", freq=freq) - assert p - offsets.YearEnd(2) == Period("2009", freq=freq) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - for freq in ["M", "2M", "3M"]: - p = Period("2011-03", freq=freq) - assert p - offsets.MonthEnd(2) == Period("2011-01", freq=freq) - assert p - offsets.MonthEnd(12) == Period("2010-03", freq=freq) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - # freq is Tick - for freq in ["D", "2D", "3D"]: - p = Period("2011-04-01", freq=freq) - assert p - offsets.Day(5) == Period("2011-03-27", freq=freq) - assert p - offsets.Hour(24) == Period("2011-03-31", freq=freq) - assert p - np.timedelta64(2, "D") == Period("2011-03-30", freq=freq) - assert p - np.timedelta64(3600 * 24, "s") == Period("2011-03-31", freq=freq) - assert p - timedelta(-2) == Period("2011-04-03", freq=freq) - assert p - timedelta(hours=48) == Period("2011-03-30", freq=freq) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(4, "h"), - timedelta(hours=23), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - for freq in ["h", "2h", "3h"]: - p = Period("2011-04-01 09:00", freq=freq) - assert p 
- offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) - assert p - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) - assert p - np.timedelta64(3, "h") == Period("2011-04-01 06:00", freq=freq) - assert p - np.timedelta64(3600, "s") == Period( - "2011-04-01 08:00", freq=freq - ) - assert p - timedelta(minutes=120) == Period("2011-04-01 07:00", freq=freq) - assert p - timedelta(days=4, minutes=180) == Period( - "2011-03-28 06:00", freq=freq - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(3200, "s"), - timedelta(hours=23, minutes=30), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) - def test_period_addsub_nat(self, freq): - per = Period("2011-01", freq=freq) - - # For subtraction, NaT is treated as another Period object - assert NaT - per is NaT - assert per - NaT is NaT - - # For addition, NaT is treated as offset-like - assert NaT + per is NaT - assert per + NaT is NaT - - def test_period_ops_offset(self): - p = Period("2011-04-01", freq="D") - result = p + offsets.Day() - exp = Period("2011-04-02", freq="D") - assert result == exp - - result = p - offsets.Day(2) - exp = Period("2011-03-30", freq="D") - assert result == exp - - msg = r"Input cannot be converted to Period\(freq=D\)" - with pytest.raises(IncompatibleFrequency, match=msg): - p + offsets.Hour(2) - - with pytest.raises(IncompatibleFrequency, match=msg): - p - offsets.Hour(2) - def test_period_immutable(): # see gh-17116 diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_convert.py b/pandas/tests/scalar/timestamp/methods/test_tz_convert.py index 02c85a8f325d5..a7bb3b90805ab 100644 --- a/pandas/tests/scalar/timestamp/methods/test_tz_convert.py +++ b/pandas/tests/scalar/timestamp/methods/test_tz_convert.py @@ -6,12 +6,6 @@ from pandas import Timestamp -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None 
# type: ignore[misc, assignment] - class TestTimestampTZConvert: @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py index 247a583bc38f3..9df0a023730de 100644 --- a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py +++ b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py @@ -123,6 +123,44 @@ def test_tz_localize_nonexistent(self, stamp, tz): ts.tz_localize(tz, nonexistent="raise") assert ts.tz_localize(tz, nonexistent="NaT") is NaT + @pytest.mark.parametrize( + "stamp, tz, forward_expected, backward_expected", + [ + ( + "2015-03-29 02:00:00", + "Europe/Warsaw", + "2015-03-29 03:00:00", + "2015-03-29 01:59:59", + ), # utc+1 -> utc+2 + ( + "2023-03-12 02:00:00", + "America/Los_Angeles", + "2023-03-12 03:00:00", + "2023-03-12 01:59:59", + ), # utc-8 -> utc-7 + ( + "2023-03-26 01:00:00", + "Europe/London", + "2023-03-26 02:00:00", + "2023-03-26 00:59:59", + ), # utc+0 -> utc+1 + ( + "2023-03-26 00:00:00", + "Atlantic/Azores", + "2023-03-26 01:00:00", + "2023-03-25 23:59:59", + ), # utc-1 -> utc+0 + ], + ) + def test_tz_localize_nonexistent_shift( + self, stamp, tz, forward_expected, backward_expected + ): + ts = Timestamp(stamp) + forward_ts = ts.tz_localize(tz, nonexistent="shift_forward") + assert forward_ts == Timestamp(forward_expected, tz=tz) + backward_ts = ts.tz_localize(tz, nonexistent="shift_backward") + assert backward_ts == Timestamp(backward_expected, tz=tz) + def test_tz_localize_ambiguous_raise(self): # GH#13057 ts = Timestamp("2015-11-1 01:00") diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 8798a8904e161..0201e5d9af2ee 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -445,6 +445,18 @@ def test_constructor_str_infer_reso(self): ts = 
Timestamp("300 June 1:30:01.300") assert ts.unit == "ms" + # dateutil path -> don't drop trailing zeros + ts = Timestamp("01-01-2013T00:00:00.000000000+0000") + assert ts.unit == "ns" + + ts = Timestamp("2016/01/02 03:04:05.001000 UTC") + assert ts.unit == "us" + + # higher-than-nanosecond -> we drop the trailing bits + ts = Timestamp("01-01-2013T00:00:00.000000002100+0000") + assert ts == Timestamp("01-01-2013T00:00:00.000000002+0000") + assert ts.unit == "ns" + class TestTimestampConstructors: def test_weekday_but_no_day_raises(self): diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 4ae196eaed2ea..cb2a35be907cd 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -7,12 +7,6 @@ from pandas import Timestamp -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None # type: ignore[misc, assignment] - class TestTimestampTZOperations: # ------------------------------------------------------------------ diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 53ac7fbf40af1..618f69eb744e3 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -586,13 +586,15 @@ def test_strftime_dt64_days(self): # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) - def test_strftime_period_days(self): + def test_strftime_period_days(self, using_infer_string): period_index = period_range("20150301", periods=5) result = period_index.strftime("%Y/%m/%d") expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], dtype="=U10", ) + if using_infer_string: + expected = expected.astype("string[pyarrow_numpy]") tm.assert_index_equal(result, expected) def test_strftime_dt64_microsecond_resolution(self): diff --git 
a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py index af6b3910baec0..3d1082c3d040b 100644 --- a/pandas/tests/series/indexing/test_delitem.py +++ b/pandas/tests/series/indexing/test_delitem.py @@ -31,19 +31,16 @@ def test_delitem(self): del s[0] tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - def test_delitem_object_index(self): + def test_delitem_object_index(self, using_infer_string): # Index(dtype=object) - s = Series(1, index=["a"]) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + s = Series(1, index=Index(["a"], dtype=dtype)) del s["a"] - tm.assert_series_equal( - s, Series(dtype="int64", index=Index([], dtype="object")) - ) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=["a"])) + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) del s["a"] - tm.assert_series_equal( - s, Series(dtype="int64", index=Index([], dtype="object")) - ) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) def test_delitem_missing_key(self): # empty diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 479e74703bc0e..596a225c288b8 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -71,7 +71,7 @@ def test_getitem_unrecognized_scalar(self): def test_getitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10" + msg = "index -11 is out of bounds for axis 0 with size 10|index out of bounds" warn_msg = "Series.__getitem__ treating keys as positions is deprecated" with pytest.raises(IndexError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): @@ -363,7 +363,9 @@ def test_getitem_no_matches(self, box): key = Series(["C"], dtype=object) 
key = box(key) - msg = r"None of \[Index\(\['C'\], dtype='object'\)\] are in the \[index\]" + msg = ( + r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" + ) with pytest.raises(KeyError, match=msg): ser[key] @@ -437,7 +439,7 @@ def test_getitem_boolean_empty(self): # GH#5877 # indexing with empty series - ser = Series(["A", "B"]) + ser = Series(["A", "B"], dtype=object) expected = Series(dtype=object, index=Index([], dtype="int64")) result = ser[Series([], dtype=object)] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 16c127e6ece7b..02d51d5119469 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -2,6 +2,7 @@ date, datetime, ) +from decimal import Decimal import numpy as np import pytest @@ -175,7 +176,8 @@ class TestSetitemScalarIndexer: def test_setitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10" + # string index falls back to positional + msg = "index -11|-1 is out of bounds for axis 0 with size 10" warn_msg = "Series.__setitem__ treating keys as positions is deprecated" with pytest.raises(IndexError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): @@ -527,8 +529,12 @@ def test_setitem_empty_series_timestamp_preserves_dtype(self): Timedelta("9 days").to_pytimedelta(), ], ) - def test_append_timedelta_does_not_cast(self, td): + def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): # GH#22717 inserting a Timedelta should _not_ cast to int64 + if using_infer_string and not isinstance(td, Timedelta): + # TODO: GH#56010 + request.applymarker(pytest.mark.xfail(reason="inferred as string")) + expected = Series(["x", td], index=[0, "td"], dtype=object) ser = Series(["x"]) @@ -595,13 +601,21 @@ def test_setitem_enlarge_with_na( expected = 
Series(expected_values, dtype=target_dtype) tm.assert_series_equal(ser, expected) - def test_setitem_enlargement_object_none(self, nulls_fixture): + def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string): # GH#48665 ser = Series(["a", "b"]) ser[3] = nulls_fixture - expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3]) + dtype = ( + "string[pyarrow_numpy]" + if using_infer_string and not isinstance(nulls_fixture, Decimal) + else object + ) + expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3], dtype=dtype) tm.assert_series_equal(ser, expected) - assert ser[3] is nulls_fixture + if using_infer_string: + ser[3] is np.nan + else: + assert ser[3] is nulls_fixture def test_setitem_scalar_into_readonly_backing_data(): @@ -845,20 +859,28 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - def test_index_where(self, obj, key, expected, warn, val): + def test_index_where(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - res = Index(obj).where(~mask, val) - expected_idx = Index(expected, dtype=expected.dtype) - tm.assert_index_equal(res, expected_idx) + if using_infer_string and obj.dtype == object: + with pytest.raises(TypeError, match="Scalar must"): + Index(obj).where(~mask, val) + else: + res = Index(obj).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) - def test_index_putmask(self, obj, key, expected, warn, val): + def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - res = Index(obj).putmask(mask, val) - tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + if using_infer_string and obj.dtype == object: + with pytest.raises(TypeError, match="Scalar must"): + Index(obj).putmask(mask, val) + else: + res = 
Index(obj).putmask(mask, val) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 7c1507ce423ad..c978481ca9988 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.core.dtypes.common import is_integer import pandas as pd @@ -230,6 +232,7 @@ def test_where_ndframe_align(): tm.assert_series_equal(out, expected) +@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index aca06a2f91c32..46f55fff91e41 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - expected = Series(["0", "2", "4", "6", "8"], name="abc") + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -170,10 +170,12 @@ def test_astype_empty_constructor_equality(self, dtype): Series([string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0]), ], ) - def test_astype_str_map(self, dtype, series): + def test_astype_str_map(self, dtype, series, using_infer_string): # see GH#4405 result = series.astype(dtype) expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -283,13 +285,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = 
ts.astype(str) - expected = Series(["2010-01-04"]) + expected = Series(["2010-01-04"], dtype=object) tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"]) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -298,7 +300,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="d")]) ser = td.astype(str) - expected = Series(["1 days"]) + expected = Series(["1 days"], dtype=object) tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -345,7 +347,7 @@ def test_astype_from_float_to_str(self, dtype): # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=dtype) result = ser.astype(str) - expected = Series(["0.1"]) + expected = Series(["0.1"], dtype=object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -416,7 +418,7 @@ def test_astype_cast_object_int(self): tm.assert_series_equal(result, Series(np.arange(1, 5))) - def test_astype_unicode(self): + def test_astype_unicode(self, using_infer_string): # see GH#7758: A bit of magic is required to set # default encoding to utf-8 digits = string.digits @@ -433,12 +435,14 @@ def test_astype_unicode(self): item = "野菜食べないとやばい" ser = Series([item.encode()]) result = ser.astype(np.str_) - expected = Series([item]) + expected = Series([item], dtype=object) tm.assert_series_equal(result, expected) for ser in test_series: res = ser.astype(np.str_) expec = ser.map(str) + if using_infer_string: + expec = expec.astype(object) tm.assert_series_equal(res, expec) # Restore the former encoding @@ -534,12 +538,12 @@ def test_astype_categorical_to_other(self): expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"Cannot cast object dtype to float64" + msg 
= r"Cannot cast object|string dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) - exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype("int") diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 47659308cfcad..795b2eab82aca 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -53,7 +53,7 @@ def test_combine_first(self): # mixed types index = tm.makeStringIndex(20) floats = Series(np.random.default_rng(2).standard_normal(20), index=index) - strings = Series(tm.makeStringIndex(10), index=index[::2]) + strings = Series(tm.makeStringIndex(10), index=index[::2], dtype=object) combined = strings.combine_first(floats) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index bd1e19ee858f0..b0a920ba02cad 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -186,6 +186,7 @@ def test_convert_dtypes( self, test_cases, params, + using_infer_string, ): data, maindtype, expected_default, expected_other = test_cases if ( @@ -219,6 +220,16 @@ def test_convert_dtypes( for spec, dtype in expected_other.items(): if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])): expected_dtype = dtype + if ( + using_infer_string + and expected_default == "string" + and expected_dtype == object + and params[0] + and not params[1] + ): + # If we would convert with convert strings then infer_objects converts + # with the option + expected_dtype = "string[pyarrow_numpy]" expected = pd.Series(data, 
dtype=expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 6d78ecd61cdcb..f86f6069a2ef3 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -83,7 +83,7 @@ def func(x): tm.assert_series_equal(result, expected) -def test_map_series_stringdtype(any_string_dtype): +def test_map_series_stringdtype(any_string_dtype, using_infer_string): # map test on StringDType, GH#40823 ser1 = Series( data=["cat", "dog", "rabbit"], @@ -98,6 +98,8 @@ def test_map_series_stringdtype(any_string_dtype): item = np.nan expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) + if using_infer_string and any_string_dtype == "object": + expected = expected.astype("string[pyarrow_numpy]") tm.assert_series_equal(result, expected) @@ -106,7 +108,7 @@ def test_map_series_stringdtype(any_string_dtype): "data, expected_dtype", [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], ) -def test_map_categorical_with_nan_values(data, expected_dtype): +def test_map_categorical_with_nan_values(data, expected_dtype, using_infer_string): # GH 20714 bug fixed in: GH 24275 def func(val): return val.split("-")[0] @@ -114,6 +116,8 @@ def func(val): s = Series(data, dtype="category") result = s.map(func, na_action="ignore") + if using_infer_string and expected_dtype == object: + expected_dtype = "string[pyarrow_numpy]" expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -133,11 +137,15 @@ def test_map_empty_integer_series_with_datetime_index(): @pytest.mark.parametrize("func", [str, lambda x: str(x)]) -def test_map_simple_str_callables_same_as_astype(string_series, func): +def test_map_simple_str_callables_same_as_astype( + string_series, func, using_infer_string +): # test that we are evaluating row-by-row first # before vectorized evaluation result = 
string_series.map(func) - expected = string_series.astype(str) + expected = string_series.astype( + str if not using_infer_string else "string[pyarrow_numpy]" + ) tm.assert_series_equal(result, expected) @@ -461,7 +469,7 @@ def test_map_box_period(): @pytest.mark.parametrize("na_action", [None, "ignore"]) -def test_map_categorical(na_action): +def test_map_categorical(na_action, using_infer_string): values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) s = Series(values, name="XX", index=list("abcdefg")) @@ -474,7 +482,7 @@ def test_map_categorical(na_action): result = s.map(lambda x: "A", na_action=na_action) exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object + assert result.dtype == object if not using_infer_string else "string" @pytest.mark.parametrize( @@ -536,12 +544,14 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp): +def test_map_missing_mixed(vals, mapping, exp, using_infer_string): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) - - tm.assert_series_equal(result, Series(exp)) + exp = Series(exp) + if using_infer_string and mapping == {np.nan: "not NaN"}: + exp.iloc[-1] = np.nan + tm.assert_series_equal(result, exp) def test_map_scalar_on_date_time_index_aware_series(): diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 0923a2d42ce10..6f0c8d751a92a 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -22,6 +24,9 @@ import pandas._testing as tm +@pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="share memory doesn't work for arrow" +) def test_reindex(datetime_series, string_series): identity 
= string_series.reindex(string_series.index) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 83adff08b758e..119654bd19b3f 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -8,6 +8,7 @@ Index, MultiIndex, Series, + array, ) import pandas._testing as tm @@ -45,22 +46,28 @@ def test_rename_by_series(self): expected = Series(range(5), index=[0, 10, 20, 3, 4], name="foo") tm.assert_series_equal(result, expected) - def test_rename_set_name(self): + def test_rename_set_name(self, using_infer_string): ser = Series(range(4), index=list("abcd")) for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: result = ser.rename(name) assert result.name == name - tm.assert_numpy_array_equal(result.index.values, ser.index.values) + if using_infer_string: + tm.assert_extension_array_equal(result.index.values, ser.index.values) + else: + tm.assert_numpy_array_equal(result.index.values, ser.index.values) assert ser.name is None - def test_rename_set_name_inplace(self): + def test_rename_set_name_inplace(self, using_infer_string): ser = Series(range(3), index=list("abc")) for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: ser.rename(name, inplace=True) assert ser.name == name - exp = np.array(["a", "b", "c"], dtype=np.object_) - tm.assert_numpy_array_equal(ser.index.values, exp) + if using_infer_string: + exp = array(exp, dtype="string[pyarrow_numpy]") + tm.assert_extension_array_equal(ser.index.values, exp) + else: + tm.assert_numpy_array_equal(ser.index.values, exp) def test_rename_axis_supported(self): # Supporting axis for compatibility, detailed in GH-18589 diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index f08966c3816c0..fe0f79b766f72 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,6 +3,8 @@ import numpy as np import 
pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -389,6 +391,7 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ @@ -719,6 +722,7 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 @@ -748,10 +752,12 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - def test_replace_change_dtype_series(self): + def test_replace_change_dtype_series(self, using_infer_string): # GH#25797 df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) - df["Test"] = df["Test"].replace([True], [np.nan]) + warn = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warn, match="Downcasting"): + df["Test"] = df["Test"].replace([True], [np.nan]) expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index db36221d8f510..9e6b4ce0df1d6 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -136,8 +136,16 @@ def test_reset_index_drop_errors(self): with pytest.raises(KeyError, match="not found"): s.reset_index("wrong", drop=True) - def test_reset_index_with_drop(self, series_with_multilevel_index): - ser = series_with_multilevel_index + def 
test_reset_index_with_drop(self): + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + data = np.random.default_rng(2).standard_normal(8) + ser = Series(data, index=index) + ser.iloc[3] = np.nan deleveled = ser.reset_index() assert isinstance(deleveled, DataFrame) @@ -166,12 +174,20 @@ def test_reset_index_inplace_and_drop_ignore_name(self): ), ], ) -def test_reset_index_dtypes_on_empty_series_with_multiindex(array, dtype): +def test_reset_index_dtypes_on_empty_series_with_multiindex( + array, dtype, using_infer_string +): # GH 19602 - Preserve dtype on empty Series with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes + exp = "string" if using_infer_string else object expected = Series( - {"level_0": np.int64, "level_1": np.float64, "level_2": dtype, 0: object} + { + "level_0": np.int64, + "level_1": np.float64, + "level_2": exp if dtype == object else dtype, + 0: object, + } ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 76ca05a60eb7a..1c17013d621c7 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -165,7 +165,7 @@ def test_to_csv_compression(self, s, encoding, compression): pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"), ) - def test_to_csv_interval_index(self): + def test_to_csv_interval_index(self, using_infer_string): # GH 28210 s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) @@ -175,6 +175,8 @@ def test_to_csv_interval_index(self): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = s.copy() - expected.index = expected.index.astype(str) - + if using_infer_string: + expected.index = 
expected.index.astype("string[pyarrow_numpy]") + else: + expected.index = expected.index.astype(str) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index c38b2400f0f4e..3745c045078b7 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -38,6 +38,7 @@ def test_update(self, using_copy_on_write): expected = DataFrame( [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] ) + expected["c"] = expected["c"].astype(object) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index b835be6d8e501..e7233f005e427 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -204,9 +204,9 @@ def test_series_integer_mod(self, index): s1 = Series(range(1, 10)) s2 = Series("foo", index=index) - msg = "not all arguments converted during string formatting" + msg = "not all arguments converted during string formatting|mod not" - with pytest.raises(TypeError, match=msg): + with pytest.raises((TypeError, NotImplementedError), match=msg): s2 % s1 def test_add_with_duplicate_index(self): @@ -491,14 +491,27 @@ def test_ser_cmp_result_names(self, names, comparison_op): result = op(ser, cidx) assert result.name == names[2] - def test_comparisons(self): + def test_comparisons(self, using_infer_string): s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) # it works! 
exp = Series([False, False, False]) - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) + if using_infer_string: + import pyarrow as pa + + msg = "has no kernel" + # TODO(3.0) GH56008 + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + s == s2 + with tm.assert_produces_warning( + DeprecationWarning, match="comparison", check_stacklevel=False + ): + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + s2 == s + else: + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) # ----------------------------------------------------------------- # Categorical Dtype Comparisons diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a898e558322ac..eee297ecea4a8 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -152,7 +152,7 @@ def test_scalar_extension_dtype(self, ea_scalar_and_dtype): assert ser.dtype == ea_dtype tm.assert_series_equal(ser, expected) - def test_constructor(self, datetime_series): + def test_constructor(self, datetime_series, using_infer_string): empty_series = Series() assert datetime_series.index._is_all_dates @@ -166,7 +166,7 @@ def test_constructor(self, datetime_series): # Mixed type Series mixed = Series(["hello", np.nan], index=[0, 1]) - assert mixed.dtype == np.object_ + assert mixed.dtype == np.object_ if not using_infer_string else "string" assert np.isnan(mixed[1]) assert not empty_series.index._is_all_dates @@ -197,7 +197,7 @@ def test_constructor_index_ndim_gt_1_raises(self): Series([1, 3, 2], index=df) @pytest.mark.parametrize("input_class", [list, dict, OrderedDict]) - def test_constructor_empty(self, input_class): + def test_constructor_empty(self, input_class, using_infer_string): empty = Series() empty2 = Series(input_class()) @@ -228,7 +228,10 @@ def test_constructor_empty(self, input_class): # GH 19853 : with empty string, index and dtype str empty = Series("", 
dtype=str, index=range(3)) - empty2 = Series("", index=range(3)) + if using_infer_string: + empty2 = Series("", index=range(3), dtype=object) + else: + empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) @pytest.mark.parametrize("input_arg", [np.nan, float("nan")]) @@ -1440,7 +1443,7 @@ def test_constructor_dict_of_tuples(self): # https://github.com/pandas-dev/pandas/issues/22698 @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning") - def test_fromDict(self): + def test_fromDict(self, using_infer_string): data = {"a": 0, "b": 1, "c": 2, "d": 3} series = Series(data) @@ -1452,19 +1455,19 @@ def test_fromDict(self): data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) - assert series.dtype == np.object_ + assert series.dtype == np.object_ if not using_infer_string else "string" data = {"a": "0", "b": "1"} series = Series(data, dtype=float) assert series.dtype == np.float64 - def test_fromValue(self, datetime_series): + def test_fromValue(self, datetime_series, using_infer_string): nans = Series(np.nan, index=datetime_series.index, dtype=np.float64) assert nans.dtype == np.float64 assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) - assert strings.dtype == np.object_ + assert strings.dtype == np.object_ if not using_infer_string else "string" assert len(strings) == len(datetime_series) d = datetime.now() diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index 25b34351627a1..040b1186980b2 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -142,6 +144,9 @@ def test_tidy_repr_name_0(self, arg): rep_str = repr(ser) assert "Name: 0" in rep_str + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="TODO: investigate why this is 
failing" + ) def test_newline(self): ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) assert "\t" not in repr(ser) @@ -301,7 +306,7 @@ def __repr__(self) -> str: repr(ser) str(ser) - def test_categorical_repr(self): + def test_categorical_repr(self, using_infer_string): a = Series(Categorical([1, 2, 3, 4])) exp = ( "0 1\n1 2\n2 3\n3 4\n" @@ -311,22 +316,38 @@ def test_categorical_repr(self): assert exp == a.__str__() a = Series(Categorical(["a", "b"] * 25)) - exp = ( - "0 a\n1 b\n" - " ..\n" - "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" - ) + if using_infer_string: + exp = ( + "0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + "Length: 50, dtype: category\nCategories (2, string): [a, b]" + ) + else: + exp = ( + "0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" + ) with option_context("display.max_rows", 5): assert exp == repr(a) levs = list("abcdefghijklmnopqrstuvwxyz") a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) - exp = ( - "0 a\n1 b\n" - "dtype: category\n" - "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... 'w' < 'x' < 'y' < 'z']" - ) + if using_infer_string: + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, string): [a < b < c < d ... w < x < y < z]" + ) + else: + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... 
" + "'w' < 'x' < 'y' < 'z']" + ) assert exp == a.__str__() def test_categorical_series_repr(self): diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 2146e154dc7fa..166f52181fed4 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -146,7 +146,7 @@ def test_logical_operators_int_dtype_with_bool(self): expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) - def test_logical_operators_int_dtype_with_object(self): + def test_logical_operators_int_dtype_with_object(self, using_infer_string): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") @@ -155,8 +155,14 @@ def test_logical_operators_int_dtype_with_object(self): tm.assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.nan, "d"]) - with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): - s_0123 & s_abNd + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + s_0123 & s_abNd + else: + with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): + s_0123 & s_abNd def test_logical_operators_bool_dtype_with_int(self): index = list("bca") @@ -354,7 +360,7 @@ def test_reverse_ops_with_index(self, op, expected): result = op(ser, idx) tm.assert_series_equal(result, expected) - def test_logical_ops_label_based(self): + def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based @@ -422,7 +428,17 @@ def test_logical_ops_label_based(self): tm.assert_series_equal(result, a[a]) for e in [Series(["z"])]: - result = a[a | e] + warn = FutureWarning if using_infer_string else None + if using_infer_string: + import pyarrow as pa + + with tm.assert_produces_warning(warn, match="Operation between non"): + with pytest.raises( + pa.lib.ArrowNotImplementedError, match="has no kernel" + ): + result = a[a | e] + else: 
+ result = a[a | e] tm.assert_series_equal(result, a[a]) # vs scalars diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index f79e58427688b..4bbbcf3bf54c2 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -153,17 +153,22 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) -def test_mean_with_convertible_string_raises(using_array_manager): +def test_mean_with_convertible_string_raises(using_array_manager, using_infer_string): # GH#44008 ser = Series(["1", "2"]) - assert ser.sum() == "12" - msg = "Could not convert string '12' to numeric" + if using_infer_string: + msg = "does not support" + with pytest.raises(TypeError, match=msg): + ser.sum() + else: + assert ser.sum() == "12" + msg = "Could not convert string '12' to numeric|does not support" with pytest.raises(TypeError, match=msg): ser.mean() df = ser.to_frame() if not using_array_manager: - msg = r"Could not convert \['12'\] to numeric" + msg = r"Could not convert \['12'\] to numeric|does not support" with pytest.raises(TypeError, match=msg): df.mean() @@ -174,29 +179,30 @@ def test_mean_dont_convert_j_to_complex(using_array_manager): if using_array_manager: msg = "Could not convert string 'J' to numeric" else: - msg = r"Could not convert \['J'\] to numeric" + msg = r"Could not convert \['J'\] to numeric|does not support" with pytest.raises(TypeError, match=msg): df.mean() with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric" + msg = "Could not convert string 'J' to numeric|does not support" with pytest.raises(TypeError, match=msg): df["db"].mean() + msg = "Could not convert string 'J' to numeric|ufunc 'divide'" with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array) def test_median_with_convertible_string_raises(using_array_manager): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 - msg = r"Cannot 
convert \['1' '2' '3'\] to numeric" + msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() if not using_array_manager: - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric" + msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median() diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 868b5f1283128..036e4de20ba53 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -1,4 +1,3 @@ -import numpy as np import pytest from pandas import Series @@ -131,53 +130,3 @@ def any_string_method(request): ... method(*args, **kwargs) """ return request.param - - -# subset of the full set from pandas/conftest.py -_any_allowed_skipna_inferred_dtype = [ - ("string", ["a", np.nan, "c"]), - ("bytes", [b"a", np.nan, b"c"]), - ("empty", [np.nan, np.nan, np.nan]), - ("empty", []), - ("mixed-integer", ["a", np.nan, 2]), -] -ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id - - -@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) -def any_allowed_skipna_inferred_dtype(request): - """ - Fixture for all (inferred) dtypes allowed in StringMethods.__init__ - - The covered (inferred) types are: - * 'string' - * 'empty' - * 'bytes' - * 'mixed' - * 'mixed-integer' - - Returns - ------- - inferred_dtype : str - The string for the inferred dtype from _libs.lib.infer_dtype - values : np.ndarray - An array of object dtype that will be inferred to have - `inferred_dtype` - - Examples - -------- - >>> from pandas._libs import lib - >>> - >>> def test_something(any_allowed_skipna_inferred_dtype): - ... inferred_dtype, values = any_allowed_skipna_inferred_dtype - ... # will pass - ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype - ... - ... # constructor for .str-accessor will also pass - ... 
Series(values).str - """ - inferred_dtype, values = request.param - values = np.array(values, dtype=object) # object dtype to avoid casting - - # correctness of inference tested in tests/dtypes/test_inference.py - return inferred_dtype, values diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 0d2f220e70c56..2914b22a52e94 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from pandas import ( @@ -9,6 +10,55 @@ ) from pandas.core.strings.accessor import StringMethods +# subset of the full set from pandas/conftest.py +_any_allowed_skipna_inferred_dtype = [ + ("string", ["a", np.nan, "c"]), + ("bytes", [b"a", np.nan, b"c"]), + ("empty", [np.nan, np.nan, np.nan]), + ("empty", []), + ("mixed-integer", ["a", np.nan, 2]), +] +ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id + + +@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) +def any_allowed_skipna_inferred_dtype(request): + """ + Fixture for all (inferred) dtypes allowed in StringMethods.__init__ + + The covered (inferred) types are: + * 'string' + * 'empty' + * 'bytes' + * 'mixed' + * 'mixed-integer' + + Returns + ------- + inferred_dtype : str + The string for the inferred dtype from _libs.lib.infer_dtype + values : np.ndarray + An array of object dtype that will be inferred to have + `inferred_dtype` + + Examples + -------- + >>> from pandas._libs import lib + >>> + >>> def test_something(any_allowed_skipna_inferred_dtype): + ... inferred_dtype, values = any_allowed_skipna_inferred_dtype + ... # will pass + ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... + ... # constructor for .str-accessor will also pass + ... 
Series(values).str + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting + + # correctness of inference tested in tests/dtypes/test_inference.py + return inferred_dtype, values + def test_api(any_string_dtype): # GH 6106, GH 9322 diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index 444f49d07481c..16b7190753ee2 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -3,12 +3,9 @@ from pandas._libs.tslibs import ( Period, - Resolution, to_offset, ) -import pandas._testing as tm - @pytest.mark.parametrize( "freqstr,exp_freqstr", @@ -23,27 +20,6 @@ def test_get_to_timestamp_base(freqstr, exp_freqstr): assert result_code == exp_code -@pytest.mark.parametrize( - "freqstr,expected", - [ - ("Y", "year"), - ("Q", "quarter"), - ("M", "month"), - ("D", "day"), - ("h", "hour"), - ("min", "minute"), - ("s", "second"), - ("ms", "millisecond"), - ("us", "microsecond"), - ("ns", "nanosecond"), - ], -) -def test_get_attrname_from_abbrev(freqstr, expected): - reso = Resolution.get_reso_from_freqstr(freqstr) - assert reso.attr_abbrev == freqstr - assert reso.attrname == expected - - @pytest.mark.parametrize( "args,expected", [ @@ -91,12 +67,3 @@ def test_compatibility(freqstr, expected): ts_np = np.datetime64("2021-01-01T08:00:00.00") do = to_offset(freqstr) assert ts_np + do == np.datetime64(expected) - - -@pytest.mark.parametrize("freq", ["A", "H", "T", "S", "L", "U", "N"]) -def test_units_A_H_T_S_L_U_N_deprecated_from_attrname_to_abbrevs(freq): - # GH#52536 - msg = f"'{freq}' is deprecated and will be removed in a future version." 
- - with tm.assert_produces_warning(FutureWarning, match=msg): - Resolution.get_reso_from_freqstr(freq) diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py index c9c4d6c456c53..2fc846353dcb5 100644 --- a/pandas/tests/tseries/offsets/conftest.py +++ b/pandas/tests/tseries/offsets/conftest.py @@ -3,35 +3,6 @@ import pytest from pandas._libs.tslibs import Timestamp -from pandas._libs.tslibs.offsets import MonthOffset - -from pandas.tseries import offsets - - -@pytest.fixture( - params=[ - getattr(offsets, o) for o in offsets.__all__ if o not in ("Tick", "BaseOffset") - ] -) -def offset_types(request): - """ - Fixture for all the datetime offsets available for a time series. - """ - return request.param - - -@pytest.fixture( - params=[ - getattr(offsets, o) - for o in offsets.__all__ - if issubclass(getattr(offsets, o), MonthOffset) and o != "MonthOffset" - ] -) -def month_classes(request): - """ - Fixture for month based datetime offsets available for a time series. - """ - return request.param @pytest.fixture diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index bc20e840b7c61..8a881e1b30b10 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -102,6 +102,33 @@ def _create_offset(klass, value=1, normalize=False): return klass +@pytest.fixture( + params=[ + getattr(offsets, o) + for o in offsets.__all__ + if issubclass(getattr(offsets, o), liboffsets.MonthOffset) + and o != "MonthOffset" + ] +) +def month_classes(request): + """ + Fixture for month based datetime offsets available for a time series. + """ + return request.param + + +@pytest.fixture( + params=[ + getattr(offsets, o) for o in offsets.__all__ if o not in ("Tick", "BaseOffset") + ] +) +def offset_types(request): + """ + Fixture for all the datetime offsets available for a time series. 
+ """ + return request.param + + @pytest.fixture def dt(): return Timestamp(datetime(2008, 1, 2)) diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py index 7b2268f16a85f..690962f1daa5e 100644 --- a/pandas/tests/tslibs/test_resolution.py +++ b/pandas/tests/tslibs/test_resolution.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pytz from pandas._libs.tslibs import ( @@ -7,6 +8,8 @@ ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +import pandas._testing as tm + def test_get_resolution_nano(): # don't return the fallback RESO_DAY @@ -22,3 +25,33 @@ def test_get_resolution_non_nano_data(): res = get_resolution(arr, pytz.UTC, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US + + +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("Y", "year"), + ("Q", "quarter"), + ("M", "month"), + ("D", "day"), + ("h", "hour"), + ("min", "minute"), + ("s", "second"), + ("ms", "millisecond"), + ("us", "microsecond"), + ("ns", "nanosecond"), + ], +) +def test_get_attrname_from_abbrev(freqstr, expected): + reso = Resolution.get_reso_from_freqstr(freqstr) + assert reso.attr_abbrev == freqstr + assert reso.attrname == expected + + +@pytest.mark.parametrize("freq", ["A", "H", "T", "S", "L", "U", "N"]) +def test_units_A_H_T_S_L_U_N_deprecated_from_attrname_to_abbrevs(freq): + # GH#52536 + msg = f"'{freq}' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + Resolution.get_reso_from_freqstr(freq) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index f5ef6a00e0b32..b1cc7ec186f19 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -1,11 +1,6 @@ import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td @@ -17,15 +12,7 @@ ) import pandas._testing as tm -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment() and (is_platform_windows() or is_platform_mac()), - reason="On GHA CI, Windows can fail with " - "'Windows fatal exception: stack overflow' " - "and macOS can timeout", - ), -] +pytestmark = pytest.mark.single_cpu @pytest.fixture(params=["single", "table"]) diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py index 8c4fb1fe6872b..14d3a39107bc4 100644 --- a/pandas/tests/window/test_online.py +++ b/pandas/tests/window/test_online.py @@ -1,27 +1,13 @@ import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) - from pandas import ( DataFrame, Series, ) import pandas._testing as tm -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment() and (is_platform_windows() or is_platform_mac()), - reason="On GHA CI, Windows can fail with " - "'Windows fatal exception: stack overflow' " - "and macOS can timeout", - ), -] +pytestmark = pytest.mark.single_cpu pytest.importorskip("numba") diff --git a/pyproject.toml b/pyproject.toml index 8ebd70762b2a5..38a22b2e7f90d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -486,6 +486,10 @@ filterwarnings = [ "error:::pandas", "error::ResourceWarning", "error::pytest.PytestUnraisableExceptionWarning", + # TODO(PY311-minimum): Specify EncodingWarning 
+ # Ignore 3rd party EncodingWarning but raise on pandas' + "ignore:.*encoding.* argument not specified", + "error:.*encoding.* argument not specified::pandas", "ignore:.*ssl.SSLSocket:pytest.PytestUnraisableExceptionWarning", "ignore:.*ssl.SSLSocket:ResourceWarning", # GH 44844: Can remove once minimum matplotlib version >= 3.7 diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 6e6251425928d..5bde4c21cfab5 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -50,6 +50,7 @@ "_global_config", "_chained_assignment_msg", "_chained_assignment_method_msg", + "_chained_assignment_warning_method_msg", "_version_meson", # The numba extensions need this to mock the iloc object "_iLocIndexer", From c64dca9dcb183e40ea1149a536b2dcda7280f960 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 21 Nov 2023 22:31:19 +0100 Subject: [PATCH 11/12] Update pandas/core/series.py Co-authored-by: Joris Van den Bossche --- pandas/core/series.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index aeb36676e16f6..f415d5534c811 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -882,6 +882,8 @@ def view(self, dtype: Dtype | None = None) -> Series: cause index misalignment. .. deprecated:: 2.2.0 + ``Series.view`` is deprecated and will be removed in a future version. + Use :meth:`Series.astype` as an alternative to change the dtype. 
Parameters ---------- From ef1290edde03d6e0fa431bab4b0a2a5d9ace3747 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 25 Nov 2023 00:58:06 +0100 Subject: [PATCH 12/12] Move dep warning --- pandas/core/series.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index f415d5534c811..f8909e47b4510 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -876,15 +876,15 @@ def view(self, dtype: Dtype | None = None) -> Series: """ Create a new view of the Series. + .. deprecated:: 2.2.0 + ``Series.view`` is deprecated and will be removed in a future version. + Use :meth:`Series.astype` as an alternative to change the dtype. + This function will return a new Series with a view of the same underlying values in memory, optionally reinterpreted with a new data type. The new data type must preserve the same size in bytes as to not cause index misalignment. - .. deprecated:: 2.2.0 - ``Series.view`` is deprecated and will be removed in a future version. - Use :meth:`Series.astype` as an alternative to change the dtype. - Parameters ---------- dtype : data type