diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a622de742a840..d6c03f08f2ba0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -6,6 +6,7 @@ from typing import ( Literal, _GenericAlias, ) +import warnings cimport cython from cpython.datetime cimport ( @@ -99,6 +100,8 @@ cdef extern from "pandas/parser/pd_parser.h": PandasParser_IMPORT +from pandas._config import get_option + from pandas._libs cimport util from pandas._libs.util cimport ( INT64_MAX, @@ -1267,6 +1270,9 @@ cdef class Seen: bint datetimetz_ # seen_datetimetz bint period_ # seen_period bint interval_ # seen_interval + bint time_ + bint date_ + bint bytes_ def __cinit__(self, bint coerce_numeric=False): """ @@ -1293,6 +1299,9 @@ cdef class Seen: self.datetimetz_ = False self.period_ = False self.interval_ = False + self.time_ = False + self.date_ = False + self.bytes_ = False self.coerce_numeric = coerce_numeric cdef bint check_uint64_conflict(self) except -1: @@ -2560,6 +2569,11 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif PyDate_Check(val): + if convert_non_numeric: + seen.date_ = True + else: + seen.object_ = True elif is_period_object(val): if convert_non_numeric: seen.period_ = True @@ -2583,12 +2597,53 @@ def maybe_convert_objects(ndarray[object] objects, else: seen.object_ = True break + elif isinstance(val, bytes): + if convert_non_numeric: + seen.bytes_ = True + else: + seen.object_ = True + break + elif PyTime_Check(val): + if convert_non_numeric and val.tzinfo is None: + seen.time_ = True + else: + seen.object_ = True + break else: seen.object_ = True break - # we try to coerce datetime w/tz but must all have the same tz - if seen.datetimetz_: + if seen.bytes_: + if is_bytes_array(objects): + opt = get_option("future.infer_bytes") + if opt is True: + import pyarrow as pa + + from pandas.core.dtypes.dtypes import ArrowDtype + + obj = pa.array(objects) + dtype = ArrowDtype(obj.type) + return 
dtype.construct_array_type()(obj) + elif opt is False: + # explicitly set to keep the old behavior and avoid the warning + pass + else: + from pandas.util._exceptions import find_stack_level + warnings.warn( + "Pandas type inference with a sequence of `bytes` " + "objects is deprecated. In a future version, this will give " + "bytes[pyarrow] dtype, which will require pyarrow to be " + "installed. To opt in to the new behavior immediately set " + "`pd.set_option('future.infer_bytes', True)`. To keep the " + "old behavior pass `dtype=object`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + seen.object_ = True + + elif seen.datetimetz_: + # we try to coerce datetime w/tz but must all have the same tz if is_datetime_with_singletz_array(objects): from pandas import DatetimeIndex @@ -2647,6 +2702,65 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True + elif seen.time_: + if is_time_array(objects): + # FIXME: need to ensure this is not timetz + opt = get_option("future.infer_time") + if opt is True: + import pyarrow as pa + + from pandas.core.dtypes.dtypes import ArrowDtype + + obj = pa.array(objects) + dtype = ArrowDtype(obj.type) + return dtype.construct_array_type()(obj) + elif opt is False: + # explicitly set to keep the old behavior and avoid the warning + pass + else: + from pandas.util._exceptions import find_stack_level + warnings.warn( + "Pandas type inference with a sequence of `datetime.time` " + "objects is deprecated. In a future version, this will give " + "time32[pyarrow] dtype, which will require pyarrow to be " + "installed. To opt in to the new behavior immediately set " + "`pd.set_option('future.infer_time', True)`. 
To keep the " + "old behavior pass `dtype=object`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + seen.object_ = True + + elif seen.date_: + if is_date_array(objects, skipna=True): + opt = get_option("future.infer_date") + if opt is True: + import pyarrow as pa + + from pandas.core.dtypes.dtypes import ArrowDtype + + obj = pa.array(objects) + dtype = ArrowDtype(obj.type) + return dtype.construct_array_type()(obj) + elif opt is False: + # explicitly set to keep the old behavior and avoid the warning + pass + else: + from pandas.util._exceptions import find_stack_level + warnings.warn( + "Pandas type inference with a sequence of `datetime.date` " + "objects is deprecated. In a future version, this will give " + "date32[pyarrow] dtype, which will require pyarrow to be " + "installed. To opt in to the new behavior immediately set " + "`pd.set_option('future.infer_date', True)`. To keep the " + "old behavior pass `dtype=object`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + seen.object_ = True + elif seen.nat_: if not seen.object_ and not seen.numeric_ and not seen.bool_: # all NaT, None, or nan (at least one NaT) diff --git a/pandas/conftest.py b/pandas/conftest.py index b2f1377a9fb32..106ea3aeed8a1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -129,6 +129,9 @@ def pytest_collection_modifyitems(items, config) -> None: # Warnings from doctests that can be ignored; place reason in comment above. 
# Each entry specifies (path, message) - see the ignore_doctest_warning function ignored_doctest_warnings = [ + ("DatetimeProperties.time", "with pyarrow time dtype"), + ("DatetimeArray.time", "with pyarrow time dtype"), + ("DatetimeIndex.time", "with pyarrow time dtype"), ("is_int64_dtype", "is_int64_dtype is deprecated"), ("is_interval_dtype", "is_interval_dtype is deprecated"), ("is_period_dtype", "is_period_dtype is deprecated"), diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 85a75fff25ebd..565800823a9f2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -651,7 +651,15 @@ def _evaluate_op_method(self, other, op, arrow_funcs): if pc_func is NotImplemented: raise NotImplementedError(f"{op.__name__} not implemented.") - result = pc_func(self._pa_array, other) + try: + result = pc_func(self._pa_array, other) + except pa.lib.ArrowNotImplementedError: + if op in [operator.add, roperator.radd, operator.sub, roperator.rsub]: + # By returning NotImplemented we get standard message with a + # TypeError + return NotImplemented + raise + return type(self)(result) def _logical_method(self, other, op): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index d6afba8c34904..08c514bcf043b 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -15,6 +15,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import ( lib, tslib, @@ -55,9 +57,10 @@ pandas_dtype, ) from 
pandas.core.dtypes.dtypes import ( + ArrowDtype, DatetimeTZDtype, ExtensionDtype, PeriodDtype, ) from pandas.core.dtypes.missing import isna @@ -82,7 +85,10 @@ ) from pandas import DataFrame - from pandas.core.arrays import PeriodArray + from pandas.core.arrays import ( + ArrowExtensionArray, + PeriodArray, + ) _midnight = time(0, 0) @@ -1335,7 +1341,7 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]: return result @property - def time(self) -> npt.NDArray[np.object_]: + def time(self) -> npt.NDArray[np.object_] | ArrowExtensionArray: """ Returns numpy array of :class:`datetime.time` objects. @@ -1368,7 +1374,30 @@ def time(self) -> npt.NDArray[np.object_]: # keeping their timezone and not using UTC timestamps = self._local_timestamps() - return ints_to_pydatetime(timestamps, box="time", reso=self._creso) + result = ints_to_pydatetime(timestamps, box="time", reso=self._creso) + + opt = get_option("future.infer_time") + if opt is None: + warnings.warn( + f"The behavior of {type(self).__name__}.time is deprecated. " + "In a future version, this will return an array with pyarrow time " + "dtype instead of object dtype. 
To opt in to the future behavior, " + "set `pd.set_option('future.infer_time', True)`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif opt is True: + # TODO: optimize this to avoid going through ints_to_pydatetime + import pyarrow as pa + + pa_type = (pa.time32 if self.unit in ("s", "ms") else pa.time64)(self.unit) + result[self.isna()] = None + obj = pa.array(result, type=pa_type) + dtype = ArrowDtype(obj.type) + out = dtype.construct_array_type()(obj) + return out + + return result @property def timetz(self) -> npt.NDArray[np.object_]: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 3f662073f0357..e32091c351431 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -889,3 +889,31 @@ def register_converter_cb(key) -> None: styler_environment, validator=is_instance_factory([type(None), str]), ) + + +with cf.config_prefix("future"): + cf.register_option( + "infer_bytes", + None, + "Whether to infer sequence of bytes objects as pyarrow bytes " + "dtype, which will be the default in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False, None]), + ) + cf.register_option( + "infer_time", + None, + "Whether to infer sequence of datetime.time objects as pyarrow time " + "dtype, which will be the default in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False, None]), + ) + + cf.register_option( + "infer_date", + None, + "Whether to infer sequence of datetime.date objects as pyarrow date " + "dtype, which will be the default in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False, None]), + ) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 9b4d67a20a7cd..9140e59ca5a0b 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -14,10 +14,13 @@ cast, overload, ) +import warnings import numpy as np from numpy import ma 
+from pandas._config import get_option + from pandas._libs import lib from pandas._libs.tslibs import ( Period, @@ -31,6 +34,7 @@ DtypeObj, T, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( @@ -47,7 +51,10 @@ is_object_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import PandasDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + PandasDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, @@ -360,6 +367,78 @@ def array( elif inferred_dtype == "boolean": return BooleanArray._from_sequence(data, copy=copy) + elif inferred_dtype == "time": + opt = get_option("future.infer_time") + + if opt is True: + import pyarrow as pa + + obj = pa.array(data) + dtype = ArrowDtype(obj.type) + return dtype.construct_array_type()(obj) + elif opt is False: + # explicitly set to keep the old behavior and avoid the warning + pass + else: + warnings.warn( + "Pandas type inference with a sequence of `datetime.time` " + "objects is deprecated. In a future version, this will give " + "time32[pyarrow] dtype, which will require pyarrow to be " + "installed. To opt in to the new behavior immediately set " + "`pd.set_option('future.infer_time', True)`. To keep the " + "old behavior pass `dtype=object`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + elif inferred_dtype == "date": + opt = get_option("future.infer_date") + + if opt is True: + import pyarrow as pa + + obj = pa.array(data) + dtype = ArrowDtype(obj.type) + return dtype.construct_array_type()(obj) + elif opt is False: + # explicitly set to keep the old behavior and avoid the warning + pass + else: + warnings.warn( + "Pandas type inference with a sequence of `datetime.date` " + "objects is deprecated. In a future version, this will give " + "date32[pyarrow] dtype, which will require pyarrow to be " + "installed. 
To opt in to the new behavior immediately set " + "`pd.set_option('future.infer_date', True)`. To keep the " + "old behavior pass `dtype=object`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + elif inferred_dtype == "bytes": + opt = get_option("future.infer_bytes") + + if opt is True: + import pyarrow as pa + + obj = pa.array(data) + dtype = ArrowDtype(obj.type) + return dtype.construct_array_type()(obj) + elif opt is False: + # explicitly set to keep the old behavior and avoid the warning + pass + else: + warnings.warn( + "Pandas type inference with a sequence of `bytes` " + "objects is deprecated. In a future version, this will give " + "bytes[pyarrow] dtype, which will require pyarrow to be " + "installed. To opt in to the new behavior immediately set " + "`pd.set_option('future.infer_bytes', True)`. To keep the " + "old behavior pass `dtype=object`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + # Pandas overrides NumPy for # 1. datetime64[ns,us,ms,s] # 2. timedelta64[ns,us,ms,s] diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f65cb94df293e..6de55844f37ac 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -20,6 +20,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._libs.missing import ( NA, @@ -40,6 +42,7 @@ IntCastingNaNError, LossySetitemError, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_int8, @@ -819,6 +822,65 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: val = val.asm8 dtype = val.dtype + elif isinstance(val, dt.time): + if val.tzinfo is None: + # pyarrow doesn't have a dtype for timetz. + opt = get_option("future.infer_time") + if opt is None: + warnings.warn( + "Pandas type inference with a `datetime.time` " + "object is deprecated. In a future version, this will give " + "time32[pyarrow] dtype, which will require pyarrow to be " + "installed. 
To opt in to the new behavior immediately set " + "`pd.set_option('future.infer_time', True)`. To keep the " + "old behavior pass `dtype=object`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif opt is True: + import pyarrow as pa + + pa_dtype = pa.time64("us") + dtype = ArrowDtype(pa_dtype) + + elif isinstance(val, dt.date): + opt = get_option("future.infer_date") + if opt is None: + warnings.warn( + "Pandas type inference with a `datetime.date` " + "object is deprecated. In a future version, this will give " + "date32[pyarrow] dtype, which will require pyarrow to be " + "installed. To opt in to the new behavior immediately set " + "`pd.set_option('future.infer_date', True)`. To keep the " + "old behavior pass `dtype=object`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif opt is True: + import pyarrow as pa + + pa_dtype = pa.date32() + dtype = ArrowDtype(pa_dtype) + + elif isinstance(val, bytes): + opt = get_option("future.infer_bytes") + if opt is None: + warnings.warn( + "Pandas type inference with a `bytes` " + "object is deprecated. In a future version, this will give " + "bytes[pyarrow] dtype, which will require pyarrow to be " + "installed. To opt in to the new behavior immediately set " + "`pd.set_option('future.infer_bytes', True)`. 
To keep the " + "old behavior pass `dtype=object`.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif opt is True: + import pyarrow as pa + + pa_dtype = pa.binary() + dtype = ArrowDtype(pa_dtype) + elif is_bool(val): dtype = np.dtype(np.bool_) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index c6da7d847c363..a8e9649906cc0 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -101,14 +101,14 @@ def _delegate_property_get(self, name: str): # type: ignore[override] elif not is_list_like(result): return result - result = np.asarray(result) - if self.orig is not None: index = self.orig.index else: index = self._parent.index # return the result as a Series - result = Series(result, index=index, name=self.name).__finalize__(self._parent) + result = Series( + result, index=index, name=self.name, dtype=result.dtype + ).__finalize__(self._parent) # setting this object will show a SettingWithCopyWarning/Error result._is_copy = ( diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 24fe7e6bfc0c1..e19f37e7771a7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1967,7 +1967,13 @@ def decode(self, encoding, errors: str = "strict"): f = lambda x: decoder(x, errors)[0] arr = self._data.array # assert isinstance(arr, (StringArray,)) - result = arr._str_map(f) + + if isinstance(arr.dtype, ArrowDtype): + # TODO: is there a performant way to do this? 
+ res_values = arr.map(f) + result = type(arr)._from_sequence(res_values) + else: + result = arr._str_map(f) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c9dab71805b62..ec0c3c9cda0ed 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5066,7 +5066,16 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - data = Series(data, copy=False).str.decode(encoding, errors=errors)._values + with warnings.catch_warnings(): + # Deprecation about inferring bytes to bytes[pyarrow] dtype + # TODO: try to avoid this altogether + warnings.filterwarnings("ignore", category=FutureWarning) + + data = ( + Series(data, copy=False).str.decode(encoding, errors=errors)._values + ).astype(object, copy=False) + # TODO: if we have pyarrow str instead of object here to begin + # with, can we avoid object dtype cast here? else: data = data.astype(dtype, copy=False).astype(object, copy=False) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d62830ffe3ea1..cb1bb4fbbf7cc 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2910,7 +2910,13 @@ def _prepare_data(self) -> np.recarray: for i, col in enumerate(data): typ = typlist[i] if typ <= self._max_string_length: - data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) + with warnings.catch_warnings(): + # deprecated behavior with sequence of bytes, will infer + # to bytes[pyarrow] + # TODO: can we avoid this altogether + warnings.filterwarnings("ignore", category=FutureWarning) + + data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) stype = f"S{typ}" dtypes[col] = stype data[col] = data[col].astype(stype) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index f3173e8f0eb57..1534346eb86cc 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -33,7 +33,9 @@ def assert_cannot_add(left, right, msg="cannot 
add"): right + left -def assert_invalid_addsub_type(left, right, msg=None): +def assert_invalid_addsub_type( + left, right, msg=None, can_be_not_implemented: bool = False +): """ Helper to assert that left and right can be neither added nor subtracted. @@ -42,14 +44,23 @@ def assert_invalid_addsub_type(left, right, msg=None): left : object right : object msg : str or None, default None + can_be_not_implemented : bool, default False + Whether to accept NotImplementedError in addition to TypeError """ - with pytest.raises(TypeError, match=msg): + + errs = TypeError + if can_be_not_implemented: + # really we are interested in pa.lib.ArrowNotImplementedError, which + # is a subclass of NotImplementedError + errs = (TypeError, NotImplementedError) + + with pytest.raises(errs, match=msg): left + right - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): right + left - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): left - right - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): right - left diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 6a0584485be42..959ab146e5e7c 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -20,6 +20,7 @@ from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.offsets import shift_months from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -1166,31 +1167,52 @@ def test_dt64arr_add_sub_parr( ) assert_invalid_addsub_type(dtarr, parr, msg) - def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixture): + @pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] + ) + def test_dt64arr_addsub_time_objects_raises( + self, box_with_array, tz_naive_fixture, future, 
request + ): # https://github.com/pandas-dev/pandas/issues/10329 tz = tz_naive_fixture + if str(tz) == "tzlocal()" and future is True: + # TODO(GH#53278) + mark = pytest.mark.xfail( + reason="Incorrectly raises AttributeError instead of TypeError", + # some but not all CI builds + strict=False, + ) + request.node.add_marker(mark) obj1 = date_range("2012-01-01", periods=3, tz=tz) obj2 = [time(i, i, i) for i in range(3)] obj1 = tm.box_expected(obj1, box_with_array) - obj2 = tm.box_expected(obj2, box_with_array) - - msg = "|".join( - [ - "unsupported operand", - "cannot subtract DatetimeArray from ndarray", - ] - ) + msgs = [ + "unsupported operand", + "cannot subtract DatetimeArray from ndarray", + ] + warn_msg = "Pandas type inference with a sequence of `datetime.time` objects" + warn = None + if future is True: + msgs.append(r"Function '(add|subtract)_checked' has no kernel") + elif future is None: + warn = FutureWarning + + with pd.option_context("future.infer_time", future): + with tm.assert_produces_warning(warn, match=warn_msg): + obj2 = tm.box_expected(obj2, box_with_array) + + msg = "|".join(msgs) with warnings.catch_warnings(record=True): # pandas.errors.PerformanceWarning: Non-vectorized DateOffset being # applied to Series or DatetimeIndex # we aren't testing that here, so ignore. 
warnings.simplefilter("ignore", PerformanceWarning) - assert_invalid_addsub_type(obj1, obj2, msg=msg) + assert_invalid_addsub_type(obj1, obj2, msg=msg, can_be_not_implemented=True) # ------------------------------------------------------------- # Other invalid operations diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 5eb7f37a4ae34..7fad0a31c0de7 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -369,7 +369,12 @@ def test_constructor_date_objects(self): # we dont cast date objects to timestamps, matching Index constructor v = date.today() - cat = Categorical([v, v]) + msg = ( + "Pandas type inference with a sequence of `datetime.date` " + "objects is deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + cat = Categorical([v, v]) assert cat.categories.dtype == object assert type(cat.categories[0]) is date diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 2acc7bdc0d902..400b4d76af922 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -155,8 +155,14 @@ def test_to_pydatetime(self, dta_dti): def test_time_date(self, dta_dti, meth): dta, dti = dta_dti - result = getattr(dta, meth) - expected = getattr(dti, meth) + warn = None + msg = "In a future version, this will return an array with pyarrow time dtype" + if meth == "time": + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=msg): + result = getattr(dta, meth) + expected = getattr(dti, meth) tm.assert_numpy_array_equal(result, expected) def test_format_native_types(self, unit, dtype, dta_dti): diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index b5d761b3549fa..cd99ee8a5507b 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ 
b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -23,6 +23,7 @@ Timestamp, date_range, ) +import pandas._testing as tm def test_infer_dtype_from_int_scalar(any_int_numpy_dtype): @@ -102,7 +103,9 @@ def test_infer_dtype_from_period(freq): def test_infer_dtype_misc(): dt = date(2000, 1, 1) - dtype, val = infer_dtype_from_scalar(dt) + msg = "type inference with a `datetime.date` object" + with tm.assert_produces_warning(FutureWarning, match=msg): + dtype, val = infer_dtype_from_scalar(dt) assert dtype == np.object_ ts = Timestamp(1, tz="US/Eastern") @@ -160,7 +163,14 @@ def test_infer_dtype_from_scalar_errors(): ], ) def test_infer_dtype_from_scalar(value, expected): - dtype, _ = infer_dtype_from_scalar(value) + msg = "type inference with a `bytes` object is deprecated" + warn = None + if isinstance(value, bytes): + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=msg): + dtype, _ = infer_dtype_from_scalar(value) + assert is_dtype_equal(dtype, expected) with pytest.raises(TypeError, match="must be list-like"): diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 1becf3b9843b7..4c5b13b0aa7e8 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -16,6 +16,7 @@ from pandas.core.dtypes.missing import isna import pandas as pd +import pandas._testing as tm def _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar=None): @@ -310,7 +311,13 @@ def test_maybe_promote_any_with_bytes(any_numpy_dtype): # output is not a generic bytes, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] - _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + msg = "type inference with a `bytes` object" + warn = None + if any_numpy_dtype in ["timedelta64[ns]", "datetime64[ns]"]: + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=msg): + _check_promote(dtype, fill_value, 
expected_dtype, exp_val_for_scalar) def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype): @@ -329,7 +336,13 @@ def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype): expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + msg = "type inference with a `bytes` object is deprecated" + warn = None + if any_numpy_dtype is bytes and datetime64_dtype == "datetime64[ns]": + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=msg): + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) @pytest.mark.parametrize( @@ -354,12 +367,21 @@ def test_maybe_promote_any_with_datetime64(any_numpy_dtype, fill_value): expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value + msg = "type inference with a `datetime.date` object" + warn = None + if type(fill_value) is datetime.date and any_numpy_dtype in [ + "datetime64[ns]", + "timedelta64[ns]", + ]: + warn = FutureWarning + if type(fill_value) is datetime.date and dtype.kind == "M": # Casting date to dt64 is deprecated, in 2.0 enforced to cast to object expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + with tm.assert_produces_warning(warn, match=msg): + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) @pytest.mark.parametrize( @@ -403,7 +425,13 @@ def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, any_numpy_dtype): expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + msg = "type inference with a `bytes` object is deprecated" + warn = None + if any_numpy_dtype is bytes and timedelta64_dtype == "timedelta64[ns]": + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=msg): + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) 
@pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index bbce40727c669..eeebf8cae0fa4 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1037,6 +1037,58 @@ def test_maybe_convert_objects_ea(self, idx): ) tm.assert_extension_array_equal(result, idx._data) + @pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] + ) + def test_maybe_convert_objects_time(self, future): + ts = Timestamp.now() + objs = np.array([ts.time()], dtype=object) + + msg = "Pandas type inference with a sequence of `datetime.time` objects" + warn = None + if future is True: + pa = pytest.importorskip("pyarrow") + dtype = pd.ArrowDtype(pa.time64("us")) + exp = dtype.construct_array_type()._from_sequence(objs, dtype=dtype) + else: + if future is None: + warn = FutureWarning + exp = objs + + with pd.option_context("future.infer_time", future): + with tm.assert_produces_warning(warn, match=msg): + out = lib.maybe_convert_objects(objs, convert_non_numeric=True) + with tm.assert_produces_warning(warn, match=msg): + ser = Series(objs) + with tm.assert_produces_warning(warn, match=msg): + ser2 = Series(list(objs)) + with tm.assert_produces_warning(warn, match=msg): + df = DataFrame(objs) + with tm.assert_produces_warning(warn, match=msg): + df2 = DataFrame(list(objs)) + with tm.assert_produces_warning(warn, match=msg): + idx = Index(objs) + with tm.assert_produces_warning(warn, match=msg): + idx2 = Index(list(objs)) + with tm.assert_produces_warning(warn, match=msg): + arr = pd.array(objs) + with tm.assert_produces_warning(warn, match=msg): + arr2 = pd.array(list(objs)) + + tm.assert_equal(out, exp) + if future: + tm.assert_equal(arr, exp) + tm.assert_equal(arr2, exp) + else: + tm.assert_equal(arr, pd.core.arrays.PandasArray(exp)) + tm.assert_equal(arr2, pd.core.arrays.PandasArray(exp)) + tm.assert_series_equal(ser, Series(exp, 
dtype=exp.dtype)) + tm.assert_series_equal(ser2, Series(exp, dtype=exp.dtype)) + tm.assert_frame_equal(df, DataFrame(exp, dtype=exp.dtype)) + tm.assert_frame_equal(df2, DataFrame(exp, dtype=exp.dtype)) + tm.assert_index_equal(idx, Index(exp, dtype=exp.dtype)) + tm.assert_index_equal(idx2, Index(exp, dtype=exp.dtype)) + class TestTypeInference: # Dummy class used for testing with Python objects @@ -1512,7 +1564,7 @@ def test_other_dtypes_for_array(self, func): def test_date(self): dates = [date(2012, 1, day) for day in range(1, 20)] - index = Index(dates) + index = Index(dates, dtype=object) assert index.inferred_type == "date" dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan] diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index abcca16340365..2f9b2e6a2a09a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -716,6 +716,38 @@ class TestBaseReshaping(base.BaseReshapingTests): def test_transpose(self, data): super().test_transpose(data) + @pytest.mark.parametrize( + "columns", + [ + ["A", "B"], + pd.MultiIndex.from_tuples( + [("A", "a"), ("A", "b")], names=["outer", "inner"] + ), + ], + ) + def test_stack(self, data, columns): + warn = None + warn_msg = "Pandas type inference with a sequence of `datetime.time` objects" + + pa_dtype = data.dtype.pyarrow_dtype + if pa.types.is_time(pa_dtype): + # FIXME: need to avoid doing inference when calling frame._constructor + # in _stack_multi_columns + warn = FutureWarning + if pa.types.is_date(pa_dtype): + # FIXME: need to avoid doing inference when calling frame._constructor + # in _stack_multi_columns + warn = FutureWarning + warn_msg = ( + "Pandas type inference with a sequence of `datetime.date` objects" + ) + if pa.types.is_binary(pa_dtype): + warn = FutureWarning + warn_msg = "Pandas type inference with a sequence of `bytes` objects" + + with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False): + 
super().test_stack(data, columns) + class TestBaseSetitem(base.BaseSetitemTests): @pytest.mark.xfail( @@ -778,6 +810,21 @@ def test_invert(self, data, request): class TestBaseMethods(base.BaseMethodsTests): + def test_hash_pandas_object_works(self, data, as_frame): + pa_dtype = data.dtype.pyarrow_dtype + warn_msg = "Pandas type inference with a sequence of `datetime.(time|date)`" + warn = None + if pa.types.is_time(pa_dtype) or pa.types.is_date(pa_dtype): + # TODO(#48964) This warning will be avoided by implementing + # ArrowExtensionArray.hash_pandas_object + warn = FutureWarning + elif pa.types.is_binary(pa_dtype): + warn_msg = "Pandas type inference with a sequence of `bytes`" + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False): + super().test_hash_pandas_object_works(data, as_frame) + @pytest.mark.parametrize("periods", [1, -2]) def test_diff(self, data, periods, request): pa_dtype = data.dtype.pyarrow_dtype @@ -1659,7 +1706,15 @@ def test_pickle_roundtrip(data): def test_astype_from_non_pyarrow(data): # GH49795 - pd_array = data._pa_array.to_pandas().array + msg = ( + "Pandas type inference with a sequence of `datetime.date` objects is deprecated" + ) + warn = None + if pa.types.is_date(data.dtype.pyarrow_dtype): + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=msg): + pd_array = data._pa_array.to_pandas().array result = pd_array.astype(data.dtype) assert not isinstance(pd_array.dtype, ArrowDtype) assert isinstance(result.dtype, ArrowDtype) diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index 47cebd31451e3..bbba8d77334e2 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -186,7 +186,12 @@ def test_asfreq_with_date_object_index(self, frame_or_series): ts = frame_or_series(np.random.randn(20), index=rng) ts2 = ts.copy() - ts2.index = [x.date() for x in ts2.index] + msg = ( + "Pandas 
type inference with a sequence of `datetime.date` " + "objects is deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + ts2.index = [x.date() for x in ts2.index] result = ts2.asfreq("4H", method="ffill") expected = ts.asfreq("4H", method="ffill") diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py index af77db4058b43..1dfb10dd9e6a5 100644 --- a/pandas/tests/frame/methods/test_filter.py +++ b/pandas/tests/frame/methods/test_filter.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -112,11 +114,18 @@ def test_filter_unicode(self, name, expected): tm.assert_frame_equal(df.filter(like=name), expected) tm.assert_frame_equal(df.filter(regex=name), expected) + @pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] + ) @pytest.mark.parametrize("name", ["a", "a"]) - def test_filter_bytestring(self, name): + def test_filter_bytestring(self, name, future): # GH13101 - df = DataFrame({b"a": [1, 2], b"b": [3, 4]}) - expected = DataFrame({b"a": [1, 2]}) + warn = FutureWarning if future is None else None + msg = "type inference with a sequence of `bytes` objects" + with tm.assert_produces_warning(warn, match=msg): + with pd.option_context("future.infer_bytes", future): + df = DataFrame({b"a": [1, 2], b"b": [3, 4]}) + expected = DataFrame({b"a": [1, 2]}) tm.assert_frame_equal(df.filter(like=name), expected) tm.assert_frame_equal(df.filter(regex=name), expected) diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 98f3926968ad0..de59f0e912cfd 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -510,16 +510,26 @@ def test_join_multiindex_dates(self): # GH 33692 date = pd.Timestamp(2000, 1, 1).date() - df1_index = MultiIndex.from_tuples([(0, 
date)], names=["index_0", "date"]) + msg = ( + "Pandas type inference with a sequence of `datetime.date` " + "objects is deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df1_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"]) df1 = DataFrame({"col1": [0]}, index=df1_index) - df2_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + df2_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"]) df2 = DataFrame({"col2": [0]}, index=df2_index) - df3_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + df3_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"]) df3 = DataFrame({"col3": [0]}, index=df3_index) result = df1.join([df2, df3]) - expected_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected_index = MultiIndex.from_tuples( + [(0, date)], names=["index_0", "date"] + ) expected = DataFrame( {"col1": [0], "col2": [0], "col3": [0]}, index=expected_index ) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 63e2eb790a4ea..ee8ac468dbfed 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -200,7 +200,9 @@ def test_reindex_date_fill_value(self): ts = df.iloc[0, 0] fv = ts.date() - res = df.reindex(index=range(4), columns=["A", "B", "C"], fill_value=fv) + msg = "type inference with a sequence of `datetime.date` objects is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.reindex(index=range(4), columns=["A", "B", "C"], fill_value=fv) expected = DataFrame( {"A": df["A"].tolist() + [fv], "B": df["B"].tolist() + [fv], "C": [fv] * 4}, diff --git a/pandas/tests/frame/test_constructors.py 
b/pandas/tests/frame/test_constructors.py index 06e244b93016c..9f714f3ff16a7 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1895,7 +1895,12 @@ def test_constructor_with_datetimes2(self): datetimes = [ts.to_pydatetime() for ts in ind] dates = [ts.date() for ts in ind] df = DataFrame(datetimes, columns=["datetimes"]) - df["dates"] = dates + msg = ( + "Pandas type inference with a sequence of `datetime.date` " + "objects is deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df["dates"] = dates result = df.dtypes expected = Series( [np.dtype("datetime64[ns]"), np.dtype("object")], @@ -2361,7 +2366,12 @@ def test_datetime_date_tuple_columns_from_dict(self): # GH 10863 v = date.today() tup = v, v - result = DataFrame({tup: Series(range(3), index=range(3))}, columns=[tup]) + msg = ( + "Pandas type inference with a sequence of `datetime.date` " + "objects is deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = DataFrame({tup: Series(range(3), index=range(3))}, columns=[tup]) expected = DataFrame([0, 1, 2], columns=Index(Series([tup]))) tm.assert_frame_equal(result, expected) @@ -3154,6 +3164,40 @@ def test_tzaware_data_tznaive_dtype(self, constructor, box, frame_or_series): with pytest.raises(err, match=msg): constructor(ts, dtype="M8[ns]") + @pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] + ) + def test_from_pytime(self, constructor, box, frame_or_series, future): + item = Timestamp("2023-05-04 08:53").time() + + warn = None + if box is list or (box is dict and frame_or_series is Series): + msg = ( + "Pandas type inference with a sequence of `datetime.time` " + "objects is deprecated" + ) + else: + msg = "Pandas type inference with a `datetime.time` object is deprecated" + exp_dtype = np.dtype(object) + if future is None: + warn = FutureWarning + elif future is True: + import pyarrow as pa + + 
pa_type = pa.time64("us") + exp_dtype = pd.ArrowDtype(pa_type) + + with pd.option_context("future.infer_time", future): + with tm.assert_produces_warning(warn, match=msg): + result = constructor(item) + dtype = tm.get_dtype(result) + assert dtype == exp_dtype + + aware = Timestamp("2023-05-04 08:53", tz="US/Pacific").timetz() + result2 = constructor(aware) + dtype = tm.get_dtype(result2) + assert dtype == np.dtype(object) + # TODO: better location for this test? class TestAllowNonNano: diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index f827eaf63a342..7efb5a20249a9 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -671,7 +671,14 @@ def test_datetime_method(method): def test_datetime_property(attr): s = pd.Series(pd.date_range("2000", periods=4)) s.attrs = {"a": 1} - result = getattr(s.dt, attr) + + warn = None + msg = "In a future version, this will return an array with pyarrow time dtype" + if attr == "time": + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = getattr(s.dt, attr) + assert result.attrs == {"a": 1} diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index aad1218190a84..64848c350f65f 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -68,19 +68,22 @@ def test_agg_datetimes_mixed(): for row in data ] - df2 = DataFrame( - { - "key": [x[0] for x in data], - "date": [x[1] for x in data], - "value": [x[2] for x in data], - } - ) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + df2 = DataFrame( + { + "key": [x[0] for x in data], + "date": [x[1] for x in data], + "value": [x[2] for x in data], + } + ) df1["weights"] = df1["value"] / df1["value"].sum() gb1 = df1.groupby("date").aggregate(np.sum) df2["weights"] = df1["value"] / 
df1["value"].sum() - gb2 = df2.groupby("date").aggregate(np.sum) + with tm.assert_produces_warning(FutureWarning, match=msg): + gb2 = df2.groupby("date").aggregate(np.sum) assert len(gb1) == len(gb2) @@ -367,22 +370,25 @@ def test_agg_consistency(): def P1(a): return np.percentile(a.dropna(), q=1) - df = DataFrame( - { - "col1": [1, 2, 3, 4], - "col2": [10, 25, 26, 31], - "date": [ - dt.date(2013, 2, 10), - dt.date(2013, 2, 10), - dt.date(2013, 2, 11), - dt.date(2013, 2, 11), - ], - } - ) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + df = DataFrame( + { + "col1": [1, 2, 3, 4], + "col2": [10, 25, 26, 31], + "date": [ + dt.date(2013, 2, 10), + dt.date(2013, 2, 10), + dt.date(2013, 2, 11), + dt.date(2013, 2, 11), + ], + } + ) g = df.groupby("date") - expected = g.agg([P1]) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.agg([P1]) expected.columns = expected.columns.levels[0] result = g.agg(P1) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 832192d8a33e6..ec137666c6e14 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1,6 +1,7 @@ from datetime import ( date, datetime, + time, ) from io import StringIO @@ -60,8 +61,11 @@ def test_apply_issues(): ) df = df.set_index("date_time") - expected = df.groupby(df.index.date).idxmax() - result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + gb = df.groupby(df.index.date) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = gb.idxmax() + result = gb.apply(lambda x: x.idxmax()) tm.assert_frame_equal(result, expected) # GH 5789 @@ -836,7 +840,19 @@ def test_apply_datetime_issue(group_column_dtlike): # is a datetime object and the column labels are different from # standard int values in range(len(num_columns)) - df = 
DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) + warn = None + warn_msg = ( + "Pandas type inference with a sequence of `datetime.time` " + "objects is deprecated" + ) + if isinstance(group_column_dtlike, time): + warn = FutureWarning + elif type(group_column_dtlike) is date: + warn = FutureWarning + warn_msg = warn_msg.replace("datetime.time", "datetime.date") + + with tm.assert_produces_warning(warn, match=warn_msg): + df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) expected = DataFrame( @@ -1088,27 +1104,31 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): # GH 29617 + msg = "Pandas type inference with a sequence of `datetime.date` objects" - df = DataFrame( - { - "A": ["a", "a", "a", "b"], - "B": [ - date(2020, 1, 10), - date(2020, 1, 10), - date(2020, 2, 10), - date(2020, 2, 10), - ], - "C": [1, 2, 3, 4], - }, - index=Index([100, 101, 102, 103], name="idx"), - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df = DataFrame( + { + "A": ["a", "a", "a", "b"], + "B": [ + date(2020, 1, 10), + date(2020, 1, 10), + date(2020, 2, 10), + date(2020, 2, 10), + ], + "C": [1, 2, 3, 4], + }, + index=Index([100, 101, 102, 103], name="idx"), + ) grp = df.groupby(["A", "B"]) - result = grp.apply(lambda x: x.head(1)) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() - expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]]) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]]) expected = expected.drop(columns="idx") tm.assert_frame_equal(result, expected) @@ -1207,9 +1227,11 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - result = 
expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 37eb52be0b37b..d447a22dce64d 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -67,13 +67,16 @@ def test_min_date_with_nans(): ).dt.date df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) - result = df.groupby("b", as_index=False)["c"].min()["c"] + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("b", as_index=False)["c"].min()["c"] expected = pd.to_datetime( Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d" ).dt.date tm.assert_series_equal(result, expected) - result = df.groupby("b")["c"].min() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("b")["c"].min() expected.index.name = "b" tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py index f4ebd54a7a1a9..e9d1d0cac3489 100644 --- a/pandas/tests/groupby/test_nunique.py +++ b/pandas/tests/groupby/test_nunique.py @@ -156,7 +156,12 @@ def test_nunique_with_timegrouper(): ) def test_nunique_with_NaT(key, data, dropna, expected): # GH 27951 - df = DataFrame({"key": key, "data": data}) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + warn = None + if type(data[0]) is dt.date: + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + df = DataFrame({"key": key, "data": data}) result = 
df.groupby(["key"])["data"].nunique(dropna=dropna) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_astype.py b/pandas/tests/indexes/categorical/test_astype.py index da1d692f9eb2d..cf307408102e7 100644 --- a/pandas/tests/indexes/categorical/test_astype.py +++ b/pandas/tests/indexes/categorical/test_astype.py @@ -78,12 +78,15 @@ def test_categorical_date_roundtrip(self, box): # astype to categorical and back should preserve date objects v = date.today() - obj = Index([v, v]) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + obj = Index([v, v]) assert obj.dtype == object if box: obj = obj.array - cat = obj.astype("category") + with tm.assert_produces_warning(FutureWarning, match=msg): + cat = obj.astype("category") rtrip = cat.astype(object) assert rtrip.dtype == object diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index ecdea9ea25c9d..84afa84299447 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -513,7 +513,9 @@ class TestGetIndexer: def test_get_indexer_date_objs(self): rng = date_range("1/1/2000", periods=20) - result = rng.get_indexer(rng.map(lambda x: x.date())) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = rng.get_indexer(rng.map(lambda x: x.date())) expected = rng.get_indexer(rng) tm.assert_numpy_array_equal(result, expected) @@ -568,7 +570,9 @@ def test_get_indexer(self): def test_get_indexer_mixed_dtypes(self, target): # https://github.com/pandas-dev/pandas/issues/33741 values = DatetimeIndex([Timestamp("2020-01-01"), Timestamp("2020-01-02")]) - result = values.get_indexer(target) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + result = values.get_indexer(target) expected = np.array([0, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) @@ -583,7 +587,9 @@ def test_get_indexer_mixed_dtypes(self, target): def test_get_indexer_out_of_bounds_date(self, target, positions): values = DatetimeIndex([Timestamp("2020-01-01"), Timestamp("2020-01-02")]) - result = values.get_indexer(target) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = values.get_indexer(target) expected = np.array(positions, dtype=np.intp) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index f07a9dce5f6ae..e2bd5450d1f57 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -24,7 +24,9 @@ class TestDatetimeIndexOps: def test_dti_time(self): rng = date_range("1/1/2000", freq="12min", periods=10) - result = pd.Index(rng).time + msg = "In a future version, this will return an array with pyarrow time dtype" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.Index(rng).time expected = [t.time() for t in rng] assert (result == expected).all() diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 6f3c83b999e94..eda4e98da8fd0 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -853,7 +853,10 @@ def test_time_accessor(self, dtype): expected = np.array([time(10, 20, 30), pd.NaT]) index = DatetimeIndex(["2018-06-04 10:20:30", pd.NaT], dtype=dtype) - result = index.time + + msg = "In a future version, this will return an array with pyarrow time dtype" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = index.time 
tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index cabc2bfd61db6..9843bf3f1cc52 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -801,7 +801,9 @@ def test_datetimeindex(): # but NOT date objects, matching Index behavior date4 = date.today() - index = MultiIndex.from_product([[date4], [date2]]) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + index = MultiIndex.from_product([[date4], [date2]]) assert not isinstance(index.levels[0], pd.DatetimeIndex) assert isinstance(index.levels[1], pd.DatetimeIndex) @@ -829,23 +831,27 @@ def test_constructor_with_tz(): def test_multiindex_inference_consistency(): # check that inference behavior matches the base class - + msg = "Pandas type inference with a sequence of `datetime.date` objects" v = date.today() arr = [v, v] - idx = Index(arr) + with tm.assert_produces_warning(FutureWarning, match=msg): + idx = Index(arr) assert idx.dtype == object - mi = MultiIndex.from_arrays([arr]) + with tm.assert_produces_warning(FutureWarning, match=msg): + mi = MultiIndex.from_arrays([arr]) lev = mi.levels[0] assert lev.dtype == object - mi = MultiIndex.from_product([arr]) + with tm.assert_produces_warning(FutureWarning, match=msg): + mi = MultiIndex.from_product([arr]) lev = mi.levels[0] assert lev.dtype == object - mi = MultiIndex.from_tuples([(x,) for x in arr]) + with tm.assert_produces_warning(FutureWarning, match=msg): + mi = MultiIndex.from_tuples([(x,) for x in arr]) lev = mi.levels[0] assert lev.dtype == object diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 4017a0e3a2f80..b33aec1b5913a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1443,7 +1443,10 @@ def 
test_loc_setitem_datetime_coercion(self): df.loc[0:1, "c"] = np.datetime64("2008-08-08") assert Timestamp("2008-08-08") == df.loc[0, "c"] assert Timestamp("2008-08-08") == df.loc[1, "c"] - df.loc[2, "c"] = date(2005, 5, 5) + + warn_msg = "type inference with a `datetime.date` object" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + df.loc[2, "c"] = date(2005, 5, 5) assert Timestamp("2005-05-05").date() == df.loc[2, "c"] @pytest.mark.parametrize("idxer", ["var", ["var"]]) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 66dd090ec0783..b0b05c14fb980 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -993,13 +993,17 @@ def test_reader_seconds(self, request, engine, read_ext): time(16, 37, 0, 900000), time(18, 20, 54), ] - } + }, + dtype=object, ) - actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1") + warn_msg = "Pandas type inference with a sequence of `datetime.time` objects" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex(self, request, read_ext): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 0560e12a00bf5..15c78aa985e7f 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -633,7 +633,7 @@ def test_excel_roundtrip_datetime(self, merge_cells, tsframe, path): tsf = tsframe.copy() - tsf.index = [x.date() for x in tsframe.index] + tsf.index = Index([x.date() for x in tsframe.index], dtype=object) tsf.to_excel(path, "test1", 
merge_cells=merge_cells) with ExcelFile(path) as reader: diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 31ba018a178ca..9099b757f848a 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -13,6 +13,7 @@ option_context, to_datetime, ) +import pandas._testing as tm def test_repr_embedded_ndarray(): @@ -172,10 +173,13 @@ def test_to_string_unicode_columns(float_frame): def test_to_string_utf8_columns(): + msg = "type inference with a sequence of `bytes` objects" + n = "\u05d0".encode() with option_context("display.max_rows", 1): - df = DataFrame([1, 2], columns=[n]) + with tm.assert_produces_warning(FutureWarning, match=msg): + df = DataFrame([1, 2], columns=[n]) repr(df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index ea996e82ae3a6..0558c301b998f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -836,7 +836,13 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): if as_object: data.append("a") - ser = Series(data, index=data) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + warn = None + if date_typ is datetime.date and not as_object: + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=msg): + ser = Series(data, index=data) result = ser.to_json(date_format=date_format) if date_format == "epoch": diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 33422d41c2f93..870332e316210 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -264,7 +264,14 @@ def test_categorical_coerces_timestamp(all_parsers): data = "b\n2014-01-01\n2014-01-01" expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) - result = parser.read_csv(StringIO(data), dtype=dtype) + msg = ( + "Pandas 
type inference with a sequence of `datetime.date` objects is deprecated" + ) + warn = None + if parser.engine == "pyarrow": + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 571e09bb5e9dd..d4ded75e201c8 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -479,7 +479,9 @@ def test_date_col_as_index_col(all_parsers): if parser.engine == "pyarrow": # https://github.com/pandas-dev/pandas/issues/44231 # pyarrow 6.0 starts to infer time type - expected["X2"] = pd.to_datetime("1970-01-01" + expected["X2"]).dt.time + msg = "In a future version, this will return an array with pyarrow time dtype" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected["X2"] = pd.to_datetime("1970-01-01" + expected["X2"]).dt.time tm.assert_frame_equal(result, expected) @@ -1619,10 +1621,12 @@ def parse_function(yy, mm): parse_dates={"ym": [0, 1]}, date_parser=parse_function, ) - expected = DataFrame( - [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], - columns=["ym", "day", "a"], - ) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = DataFrame( + [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], + columns=["ym", "day", "a"], + ) expected["ym"] = expected["ym"].astype("datetime64[ns]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index 295cce970889c..4f5b32cdd7e70 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -56,13 +56,15 @@ def test_table_index_incompatible_dtypes(setup_path): def 
test_unimplemented_dtypes_table_columns(setup_path): + warn_msg = "type inference with a `datetime.date` object" with ensure_clean_store(setup_path) as store: dtypes = [("date", datetime.date(2001, 1, 2))] # currently not supported dtypes #### for n, f in dtypes: df = tm.makeDataFrame() - df[n] = f + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + df[n] = f msg = re.escape(f"[{n}] is not implemented as a table column") with pytest.raises(TypeError, match=msg): store.append(f"df1_{n}", df) @@ -71,7 +73,8 @@ def test_unimplemented_dtypes_table_columns(setup_path): df = tm.makeDataFrame() df["obj1"] = "foo" df["obj2"] = "bar" - df["datetime1"] = datetime.date(2001, 1, 2) + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + df["datetime1"] = datetime.date(2001, 1, 2) df = df._consolidate() with ensure_clean_store(setup_path) as store: diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 42f020a8f3708..f1ddb34fcd582 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -407,6 +407,7 @@ def test_empty_series(dtype, setup_path): def test_can_serialize_dates(setup_path): rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")] + rng = Index(rng, dtype=object) frame = DataFrame(np.random.randn(len(rng), 4), index=rng) _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 82330e1d63c9a..61a2fb0fdda81 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -11,6 +11,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -322,16 +324,21 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) +@pytest.mark.parametrize( + 
"future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) @pytest.mark.parametrize("format", ["fixed", "table"]) -def test_to_hdf_errors(tmp_path, format, setup_path): +def test_to_hdf_errors(tmp_path, format, setup_path, future): data = ["\ud800foo"] - ser = Series(data, index=Index(data)) - path = tmp_path / setup_path - # GH 20835 - ser.to_hdf(path, "table", format=format, errors="surrogatepass") - result = read_hdf(path, "table", errors="surrogatepass") - tm.assert_series_equal(result, ser) + with pd.option_context("future.infer_bytes", future): + ser = Series(data, index=Index(data)) + path = tmp_path / setup_path + # GH 20835 + ser.to_hdf(path, "table", format=format, errors="surrogatepass") + + result = read_hdf(path, "table", errors="surrogatepass") + tm.assert_series_equal(result, ser) def test_create_table_index(setup_path): diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index e6c0c918a73cc..55f58c7028fbd 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -277,10 +277,12 @@ def test_store_timezone(setup_path): # issue storing datetime.date with a timezone as it resets when read # back in a new timezone + today = date(2013, 9, 10) + idx = pd.Index([today, today, today], dtype=object) + # original method with ensure_clean_store(setup_path) as store: - today = date(2013, 9, 10) - df = DataFrame([1, 2, 3], index=[today, today, today]) + df = DataFrame([1, 2, 3], index=idx) store["obj1"] = df result = store["obj1"] tm.assert_frame_equal(result, df) @@ -288,8 +290,7 @@ def test_store_timezone(setup_path): # with tz setting with ensure_clean_store(setup_path) as store: with tm.set_timezone("EST5EDT"): - today = date(2013, 9, 10) - df = DataFrame([1, 2, 3], index=[today, today, today]) + df = DataFrame([1, 2, 3], index=idx) store["obj1"] = df with tm.set_timezone("CST6CDT"): diff --git a/pandas/tests/io/sas/test_sas7bdat.py 
b/pandas/tests/io/sas/test_sas7bdat.py index d56139d32b1da..f74a1e82f327c 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -14,6 +14,8 @@ import pandas as pd import pandas._testing as tm +bytes_msg = "Pandas type inference with a sequence of `bytes` objects is deprecated" + @pytest.fixture def dirpath(datapath): @@ -39,6 +41,9 @@ def data_test_ix(request, dirpath): # https://github.com/cython/cython/issues/1720 class TestSAS7BDAT: + @pytest.mark.filterwarnings( + "ignore:Pandas type inference with a sequence of `bytes`:FutureWarning" + ) @pytest.mark.slow def test_from_file(self, dirpath, data_test_ix): df0, test_ix = data_test_ix @@ -47,6 +52,9 @@ def test_from_file(self, dirpath, data_test_ix): df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) + @pytest.mark.filterwarnings( + "ignore:Pandas type inference with a sequence of `bytes`:FutureWarning" + ) @pytest.mark.slow def test_from_buffer(self, dirpath, data_test_ix): df0, test_ix = data_test_ix @@ -61,6 +69,9 @@ def test_from_buffer(self, dirpath, data_test_ix): df = rdr.read() tm.assert_frame_equal(df, df0, check_exact=False) + @pytest.mark.filterwarnings( + "ignore:Pandas type inference with a sequence of `bytes`:FutureWarning" + ) @pytest.mark.slow def test_from_iterator(self, dirpath, data_test_ix): df0, test_ix = data_test_ix @@ -72,6 +83,9 @@ def test_from_iterator(self, dirpath, data_test_ix): df = rdr.read(3) tm.assert_frame_equal(df, df0.iloc[2:5, :]) + @pytest.mark.filterwarnings( + "ignore:Pandas type inference with a sequence of `bytes`:FutureWarning" + ) @pytest.mark.slow def test_path_pathlib(self, dirpath, data_test_ix): df0, test_ix = data_test_ix @@ -80,6 +94,9 @@ def test_path_pathlib(self, dirpath, data_test_ix): df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) + @pytest.mark.filterwarnings( + "ignore:Pandas type inference with a sequence of `bytes`:FutureWarning" + ) @td.skip_if_no("py.path") 
@pytest.mark.slow def test_path_localpath(self, dirpath, data_test_ix): @@ -91,6 +108,9 @@ def test_path_localpath(self, dirpath, data_test_ix): df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) + @pytest.mark.filterwarnings( + "ignore:Pandas type inference with a sequence of `bytes`:FutureWarning" + ) @pytest.mark.slow @pytest.mark.parametrize("chunksize", (3, 5, 10, 11)) @pytest.mark.parametrize("k", range(1, 17)) @@ -103,23 +123,42 @@ def test_iterator_loop(self, dirpath, k, chunksize): y += x.shape[0] assert y == rdr.row_count - def test_iterator_read_too_much(self, dirpath): + @pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] + ) + def test_iterator_read_too_much(self, dirpath, future): # github #14734 fname = os.path.join(dirpath, "test1.sas7bdat") + + warn = FutureWarning if future is None else None + with pd.read_sas( fname, format="sas7bdat", iterator=True, encoding="utf-8" ) as rdr: - d1 = rdr.read(rdr.row_count + 20) + with tm.assert_produces_warning(warn, match=bytes_msg): + with pd.option_context("future.infer_bytes", future): + d1 = rdr.read(rdr.row_count + 20) with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr: - d2 = rdr.read(rdr.row_count + 20) + with tm.assert_produces_warning(warn, match=bytes_msg): + with pd.option_context("future.infer_bytes", future): + d2 = rdr.read(rdr.row_count + 20) tm.assert_frame_equal(d1, d2) -def test_encoding_options(datapath): +@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_encoding_options(datapath, future): fname = datapath("io", "sas", "data", "test1.sas7bdat") - df1 = pd.read_sas(fname) - df2 = pd.read_sas(fname, encoding="utf-8") + + warn = FutureWarning if future is None else None + with tm.assert_produces_warning(warn, match=bytes_msg): + with pd.option_context("future.infer_bytes", future): + df1 = pd.read_sas(fname) + with 
tm.assert_produces_warning(warn, match=bytes_msg): + with pd.option_context("future.infer_bytes", future): + df2 = pd.read_sas(fname, encoding="utf-8") for col in df1.columns: try: df1[col] = df1[col].str.decode("utf-8") @@ -130,31 +169,54 @@ def test_encoding_options(datapath): from pandas.io.sas.sas7bdat import SAS7BDATReader with contextlib.closing(SAS7BDATReader(fname, convert_header_text=False)) as rdr: - df3 = rdr.read() + with tm.assert_produces_warning(warn, match=bytes_msg): + with pd.option_context("future.infer_bytes", future): + df3 = rdr.read() for x, y in zip(df1.columns, df3.columns): assert x == y.decode() -def test_encoding_infer(datapath): +@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_encoding_infer(datapath, future): fname = datapath("io", "sas", "data", "test1.sas7bdat") + warn = FutureWarning if future is None else None + with pd.read_sas(fname, encoding="infer", iterator=True) as df1_reader: # check: is encoding inferred correctly from file assert df1_reader.inferred_encoding == "cp1252" - df1 = df1_reader.read() + with tm.assert_produces_warning(warn, match=bytes_msg): + with pd.option_context("future.infer_bytes", future): + df1 = df1_reader.read() with pd.read_sas(fname, encoding="cp1252", iterator=True) as df2_reader: - df2 = df2_reader.read() + with tm.assert_produces_warning(warn, match=bytes_msg): + with pd.option_context("future.infer_bytes", future): + df2 = df2_reader.read() # check: reader reads correct information tm.assert_frame_equal(df1, df2) -def test_productsales(datapath): +@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_productsales(datapath, future): fname = datapath("io", "sas", "data", "productsales.sas7bdat") - df = pd.read_sas(fname, encoding="utf-8") + + warn = FutureWarning if future is None else None + with tm.assert_produces_warning(warn, match=bytes_msg): + with 
pd.option_context("future.infer_bytes", future): + df = pd.read_sas(fname, encoding="utf-8") fname = datapath("io", "sas", "data", "productsales.csv") df0 = pd.read_csv(fname, parse_dates=["MONTH"]) + if future: + # TODO: shouldn't read_csv infer to string[pyarrow] + cols = ["COUNTRY", "REGION", "DIVISION", "PRODTYPE", "PRODUCT"] + df0[cols] = df0[cols].astype("string[pyarrow]") + vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] df0[vn] = df0[vn].astype(np.float64) tm.assert_frame_equal(df, df0) @@ -205,14 +267,24 @@ def test_compact_numerical_values(datapath, column): tm.assert_series_equal(result, expected, check_exact=True) -def test_many_columns(datapath): +@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_many_columns(datapath, future): # Test for looking for column information in more places (PR #22628) fname = datapath("io", "sas", "data", "many_columns.sas7bdat") - df = pd.read_sas(fname, encoding="latin-1") + warn = FutureWarning if future is None else None + with tm.assert_produces_warning(warn, match=bytes_msg): + with pd.option_context("future.infer_bytes", future): + df = pd.read_sas(fname, encoding="latin-1") fname = datapath("io", "sas", "data", "many_columns.csv") df0 = pd.read_csv(fname, encoding="latin-1") + if future: + # TODO: shouldn't read_csv already do this? + cols = ["DATASRC", "PDDOCID", "nvitl", "treatment", "VISIT_NO"] + df0[cols] = df0[cols].astype("string[pyarrow]") tm.assert_frame_equal(df, df0) @@ -257,13 +329,20 @@ def round_datetime_to_ms(ts): return ts -def test_max_sas_date(datapath): +@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_max_sas_date(datapath, future): # GH 20927 # NB. 
max datetime in SAS dataset is 31DEC9999:23:59:59.999 # but this is read as 29DEC9999:23:59:59.998993 by a buggy # sas7bdat module fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") - df = pd.read_sas(fname, encoding="iso-8859-1") + + warn = FutureWarning if future is None else None + with tm.assert_produces_warning(warn, match=bytes_msg): + with pd.option_context("future.infer_bytes", future): + df = pd.read_sas(fname, encoding="iso-8859-1") # SAS likes to left pad strings with spaces - lstrip before comparing df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x) @@ -292,7 +371,10 @@ def test_max_sas_date(datapath): tm.assert_frame_equal(df, expected) -def test_max_sas_date_iterator(datapath): +@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_max_sas_date_iterator(datapath, future): # GH 20927 # when called as an iterator, only those chunks with a date > pd.Timestamp.max # are returned as datetime.datetime, if this happens that whole chunk is returned @@ -300,18 +382,24 @@ def test_max_sas_date_iterator(datapath): col_order = ["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"] fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") results = [] - for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1): - # SAS likes to left pad strings with spaces - lstrip before comparing - df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x) - # GH 19732: Timestamps imported from sas will incur floating point errors - try: - df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") - except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: - df = df.map(round_datetime_to_ms) - except AttributeError: - df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms) - df.reset_index(inplace=True, drop=True) - results.append(df) + + warn = FutureWarning if future is None else None + with tm.assert_produces_warning(warn, match=bytes_msg): + with 
pd.option_context("future.infer_bytes", future): + for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1): + # SAS likes to left pad strings with spaces - lstrip before comparing + df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x) + # GH 19732: Timestamps imported from sas will incur floating + # point errors + try: + df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") + except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: + df = df.map(round_datetime_to_ms) + except AttributeError: + df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms) + df.reset_index(inplace=True, drop=True) + results.append(df) + expected = [ pd.DataFrame( { @@ -383,12 +471,22 @@ def test_rle_rdc_exceptions( pd.read_sas(io.BytesIO(data), format="sas7bdat") -def test_0x40_control_byte(datapath): +@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_0x40_control_byte(datapath, future): # GH 31243 fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat") - df = pd.read_sas(fname, encoding="ascii") + + warn = FutureWarning if future is None else None + with tm.assert_produces_warning(warn, match=bytes_msg): + with pd.option_context("future.infer_bytes", future): + df = pd.read_sas(fname, encoding="ascii") fname = datapath("io", "sas", "data", "0x40controlbyte.csv") df0 = pd.read_csv(fname, dtype="object") + if future: + # TODO: shouldn't read_csv infer to pyarrow? 
+ df0 = df0.astype("string[pyarrow]") tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 435b9bdade944..b8b61f1a7785a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -316,13 +316,25 @@ def test_read_expands_user_home_dir( ), ], ) - def test_read_fspath_all(self, reader, module, path, datapath): + @pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] + ) + def test_read_fspath_all(self, reader, module, path, datapath, future): pytest.importorskip(module) path = datapath(*path) mypath = CustomFSPath(path) - result = reader(mypath) - expected = reader(path) + + msg = "type inference with a sequence of `bytes` objects" + warn = None + if reader is pd.read_sas and future is None: + warn = FutureWarning + + with pd.option_context("future.infer_bytes", future): + with tm.assert_produces_warning(warn, match=msg): + result = reader(mypath) + with tm.assert_produces_warning(warn, match=msg): + expected = reader(path) if path.endswith(".pickle"): # categorical diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 571d9d5536e20..b0bdc7be2fac6 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -82,7 +82,8 @@ def test_orc_reader_basic(dirpath): "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), "float1": np.array([1.0, 2.0], dtype="float32"), "double1": np.array([-15.0, -5.0], dtype="float64"), - "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), + # TODO: same thing with bytes[pyarrow] + "bytes1": pd.Series([b"\x00\x01\x02\x03\x04", b""], dtype="object"), "string1": np.array(["hi", "bye"], dtype="object"), } expected = pd.DataFrame.from_dict(data) @@ -153,7 +154,9 @@ def test_orc_reader_date_low(dirpath): dtype="object", ), } - expected = pd.DataFrame.from_dict(data) + msg = "Pandas type inference with a sequence of `datetime.date` 
objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = pd.DataFrame.from_dict(data) inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc") got = read_orc(inputfile).iloc[:10] @@ -194,7 +197,9 @@ def test_orc_reader_date_high(dirpath): dtype="object", ), } - expected = pd.DataFrame.from_dict(data) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = pd.DataFrame.from_dict(data) inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc") got = read_orc(inputfile).iloc[:10] @@ -255,7 +260,8 @@ def test_orc_roundtrip_file(dirpath): "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), "float1": np.array([1.0, 2.0], dtype="float32"), "double1": np.array([-15.0, -5.0], dtype="float64"), - "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), + # TODO: same thing with bytes[pyarrow] dtype + "bytes1": pd.Series([b"\x00\x01\x02\x03\x04", b""], dtype="object"), "string1": np.array(["hi", "bye"], dtype="object"), } expected = pd.DataFrame.from_dict(data) @@ -279,7 +285,8 @@ def test_orc_roundtrip_bytesio(): "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), "float1": np.array([1.0, 2.0], dtype="float32"), "double1": np.array([-15.0, -5.0], dtype="float64"), - "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), + # TODO: same thing with bytes[pyarrow] dtype + "bytes1": pd.Series([b"\x00\x01\x02\x03\x04", b""], dtype="object"), "string1": np.array(["hi", "bye"], dtype="object"), } expected = pd.DataFrame.from_dict(data) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 10fce6b5bf43d..6b1ee13d7ab74 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -122,7 +122,8 @@ def df_full(): "string": list("abc"), "string_with_nan": ["a", np.nan, "c"], "string_with_none": ["a", None, "c"], - "bytes": 
[b"foo", b"bar", b"baz"], + # TODO: same thing with bytes[pyarrow] here + "bytes": pd.Series([b"foo", b"bar", b"baz"], dtype=object), "unicode": ["foo", "bar", "baz"], "int": list(range(1, 4)), "uint": np.arange(3, 6).astype("u1"), @@ -1054,7 +1055,7 @@ def test_columns_dtypes_not_invalid(self, pa): check_round_trip(df, pa) # bytes - df.columns = [b"foo", b"bar"] + df.columns = pd.Index([b"foo", b"bar"], dtype=object) with pytest.raises(NotImplementedError, match="|S3"): # Bytes fails on read_parquet check_round_trip(df, pa) @@ -1093,7 +1094,7 @@ def test_columns_dtypes_invalid(self, fp): self.check_error_on_write(df, fp, err, msg) # bytes - df.columns = [b"foo", b"bar"] + df.columns = pd.Index([b"foo", b"bar"], dtype=object) self.check_error_on_write(df, fp, err, msg) # python object diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 75fcef09535d4..116fde9b75af8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2239,9 +2239,12 @@ def test_datetime_NaT(self): def test_datetime_date(self): # test support for datetime.date - df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) assert df.to_sql("test_date", self.conn, index=False) == 2 - res = read_sql_table("test_date", self.conn) + with tm.assert_produces_warning(FutureWarning, match=msg): + res = read_sql_table("test_date", self.conn) result = res["a"] expected = to_datetime(df["a"]) # comes back as datetime64 @@ -2249,9 +2252,14 @@ def test_datetime_date(self): def test_datetime_time(self, sqlite_buildin): # test support for datetime.time - df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) + + warn_msg = "Pandas type inference with a sequence of `datetime.time`" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + 
df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) assert df.to_sql("test_time", self.conn, index=False) == 2 - res = read_sql_table("test_time", self.conn) + + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + res = read_sql_table("test_time", self.conn) tm.assert_frame_equal(res, df) # GH8341 @@ -2267,7 +2275,9 @@ def test_datetime_time(self, sqlite_buildin): res = sql.read_sql_query("SELECT * FROM test_time3", self.conn) ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) tm.assert_frame_equal(ref, res) - res = sql.read_sql_table("test_time3", self.conn) + + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + res = sql.read_sql_table("test_time3", self.conn) tm.assert_frame_equal(df, res) def test_mixed_dtype_insert(self): @@ -3092,7 +3102,9 @@ def test_execute_sql(self): def test_datetime_date(self): # test support for datetime.date - df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) assert df.to_sql("test_date", self.conn, index=False) == 2 res = read_sql_query("SELECT * FROM test_date", self.conn) if self.flavor == "sqlite": @@ -3104,13 +3116,18 @@ def test_datetime_date(self): @pytest.mark.parametrize("tz_aware", [False, True]) def test_datetime_time(self, tz_aware): # test support for datetime.time, GH #8341 + + warn_msg = "Pandas type inference with a sequence of `datetime.time` objects" if not tz_aware: tz_times = [time(9, 0, 0), time(9, 1, 30)] + warn = FutureWarning else: tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific") tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) + warn = None - df = DataFrame(tz_times, columns=["a"]) + with tm.assert_produces_warning(warn, match=warn_msg): + df = DataFrame(tz_times, columns=["a"]) assert df.to_sql("test_time", 
self.conn, index=False) == 2 res = read_sql_query("SELECT * FROM test_time", self.conn) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 01762e39c36c1..832f6a24623d5 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -673,11 +673,25 @@ def test_plot_scatter(self): def test_raise_error_on_datetime_time_data(self): # GH 8113, datetime.time type is not supported by matplotlib in scatter df = DataFrame(np.random.randn(10), columns=["a"]) - df["dtime"] = date_range(start="2014-01-01", freq="h", periods=10).time + warn_msg = ( + "Pandas type inference with a sequence of `datetime.time` " + "objects is deprecated" + ) + + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + df["dtime"] = date_range(start="2014-01-01", freq="h", periods=10).time + msg = "must be a string or a (real )?number, not 'datetime.time'" with pytest.raises(TypeError, match=msg): - df.plot(kind="scatter", x="dtime", y="a") + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + # warns bc it calls infer_objects inside df.plot + df.plot(kind="scatter", x="dtime", y="a") + + with pd.option_context("future.infer_time", True): + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(None): + df.plot(kind="scatter", x="dtime", y="a") def test_scatterplot_datetime_data(self): # GH 30391 @@ -1216,9 +1230,12 @@ def test_specified_props_kwd_plot_box(self, props, expected): assert result[expected][0].get_color() == "C1" def test_unordered_ts(self): + idx = pd.Index( + [date(2012, 10, 1), date(2012, 9, 1), date(2012, 8, 1)], dtype=object + ) df = DataFrame( np.array([3.0, 2.0, 1.0]), - index=[date(2012, 10, 1), date(2012, 9, 1), date(2012, 8, 1)], + index=idx, columns=["test"], ) ax = df.plot() diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index dda71328d4e6c..30cb04186efe6 100644 --- 
a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1008,6 +1008,7 @@ def test_time(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) + ts = Index(ts, dtype=object) df = DataFrame( {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts ) @@ -1031,7 +1032,10 @@ def test_time(self): def test_time_change_xlim(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() - ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) + ts = Index( + np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]), + dtype=object, + ) df = DataFrame( {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts ) @@ -1073,6 +1077,7 @@ def test_time_musec(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() ts = np.array([(t + timedelta(microseconds=int(x))).time() for x in deltas]) + ts = Index(ts, dtype=object) df = DataFrame( {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts ) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 9ec0071ba9afa..d5ff454fe9645 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -131,34 +131,43 @@ def test_concat_multiindex_datetime_object_index(self): dtype="object", ) - s = Series( - ["a", "b"], - index=MultiIndex.from_arrays( + msg = "Pandas type inference with a sequence of `datetime.date` objects" + with tm.assert_produces_warning(FutureWarning, match=msg): + # TODO: should this be not-inferring since we already specified + # object dtype? 
+ mi = MultiIndex.from_arrays( [ [1, 2], idx[:-1], ], names=["first", "second"], - ), - ) - s2 = Series( + ) + s = Series( ["a", "b"], - index=MultiIndex.from_arrays( + index=mi, + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + mi2 = MultiIndex.from_arrays( [[1, 2], idx[::2]], names=["first", "second"], - ), - ) - mi = MultiIndex.from_arrays( - [[1, 2, 2], idx], - names=["first", "second"], + ) + s2 = Series( + ["a", "b"], + index=mi2, ) - assert mi.levels[1].dtype == object + with tm.assert_produces_warning(FutureWarning, match=msg): + mi3 = MultiIndex.from_arrays( + [[1, 2, 2], idx], + names=["first", "second"], + ) + assert mi3.levels[1].dtype == object expected = DataFrame( [["a", "a"], ["b", np.nan], [np.nan, "b"]], - index=mi, + index=mi3, ) - result = concat([s, s2], axis=1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = concat([s, s2], axis=1) tm.assert_frame_equal(result, expected) def test_concat_NaT_series(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 896f1a9be52be..e27a798554467 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2036,31 +2036,42 @@ def test_dtype_on_categorical_dates(self): # GH 16900 # dates should not be coerced to ints - df = DataFrame( - [[date(2001, 1, 1), 1.1], [date(2001, 1, 2), 1.3]], columns=["date", "num2"] - ) - df["date"] = df["date"].astype("category") - - df2 = DataFrame( - [[date(2001, 1, 1), 1.3], [date(2001, 1, 3), 1.4]], columns=["date", "num4"] + msg = ( + "Pandas type inference with a sequence of `datetime.date` " + "objects is deprecated" ) - df2["date"] = df2["date"].astype("category") + with tm.assert_produces_warning(FutureWarning, match=msg): + df = DataFrame( + [[date(2001, 1, 1), 1.1], [date(2001, 1, 2), 1.3]], + columns=["date", "num2"], + ) + df["date"] = df["date"].astype("category") - expected_outer = DataFrame( - [ - 
[pd.Timestamp("2001-01-01").date(), 1.1, 1.3], - [pd.Timestamp("2001-01-02").date(), 1.3, np.nan], - [pd.Timestamp("2001-01-03").date(), np.nan, 1.4], - ], - columns=["date", "num2", "num4"], - ) - result_outer = merge(df, df2, how="outer", on=["date"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + df2 = DataFrame( + [[date(2001, 1, 1), 1.3], [date(2001, 1, 3), 1.4]], + columns=["date", "num4"], + ) + df2["date"] = df2["date"].astype("category") + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected_outer = DataFrame( + [ + [pd.Timestamp("2001-01-01").date(), 1.1, 1.3], + [pd.Timestamp("2001-01-02").date(), 1.3, np.nan], + [pd.Timestamp("2001-01-03").date(), np.nan, 1.4], + ], + columns=["date", "num2", "num4"], + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result_outer = merge(df, df2, how="outer", on=["date"]) tm.assert_frame_equal(result_outer, expected_outer) - expected_inner = DataFrame( - [[pd.Timestamp("2001-01-01").date(), 1.1, 1.3]], - columns=["date", "num2", "num4"], - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected_inner = DataFrame( + [[pd.Timestamp("2001-01-01").date(), 1.1, 1.3]], + columns=["date", "num2", "num4"], + ) result_inner = merge(df, df2, how="inner", on=["date"]) tm.assert_frame_equal(result_inner, expected_inner) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index b6fcb27faf146..13be50ec46496 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1087,11 +1087,15 @@ def test_pivot_integer_columns(self): [1.0], ) ) - df = DataFrame(data) - table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2]) + msg = "type inference with a sequence of `datetime.date` objects is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df = DataFrame(data) + with tm.assert_produces_warning(FutureWarning, match=msg): + table = df.pivot_table(values=4, index=[0, 1, 3], 
columns=[2]) df2 = df.rename(columns=str) - table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"]) tm.assert_frame_equal(table, table2, check_names=False) diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py index cc64a9388fd7c..069b2e10f2ad1 100644 --- a/pandas/tests/series/accessors/test_cat_accessor.py +++ b/pandas/tests/series/accessors/test_cat_accessor.py @@ -212,8 +212,12 @@ def test_dt_accessor_api_for_categorical(self, idx): tm.assert_equal(res, exp) for attr in attr_names: - res = getattr(cat.dt, attr) - exp = getattr(ser.dt, attr) + with warnings.catch_warnings(): + if attr == "time": + # deprecated to return pyarrow time dtype + warnings.simplefilter("ignore", FutureWarning) + res = getattr(cat.dt, attr) + exp = getattr(ser.dt, attr) tm.assert_equal(res, exp) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 5cdeee20f3435..ada580d4f694a 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -12,6 +12,7 @@ import pytz from pandas._libs.tslibs.timezones import maybe_get_tz +from pandas.compat import pa_version_under7p0 from pandas.errors import SettingWithCopyError from pandas.core.dtypes.common import ( @@ -87,10 +88,19 @@ def get_expected(ser, prop): result = result.astype("int64") elif not is_list_like(result) or isinstance(result, DataFrame): return result - return Series(result, index=ser.index, name=ser.name) + return Series(result, index=ser.index, name=ser.name, dtype=result.dtype) + + if name == "time": + msg = ( + "In a future version, this will return an array with pyarrow time dtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + left = getattr(ser.dt, name) + right = 
get_expected(ser, name) + else: + left = getattr(ser.dt, name) + right = get_expected(ser, name) - left = getattr(ser.dt, name) - right = get_expected(ser, name) if not (is_list_like(left) and is_list_like(right)): assert left == right elif isinstance(left, DataFrame): @@ -672,10 +682,31 @@ def test_valid_dt_with_missing_values(self): ) tm.assert_series_equal(result, expected) - result = ser.dt.time + msg = "In a future version, this will return an array with pyarrow time" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.dt.time expected = Series([time(0), time(0), pd.NaT, time(0), time(0)], dtype="object") tm.assert_series_equal(result, expected) + with pd.option_context("future.infer_time", False): + with tm.assert_produces_warning(None): + result = ser.dt.time + tm.assert_series_equal(result, expected) + + if pa_version_under7p0: + return + + with pd.option_context("future.infer_time", True): + with tm.assert_produces_warning(None): + result_pa = ser.dt.time + + import pyarrow as pa + + pa_dtype = pa.time64("ns") + dtype = pd.ArrowDtype(pa_dtype) + expected_pa = expected.astype(dtype) + tm.assert_series_equal(result_pa, expected_pa) + def test_dt_accessor_api(self): # GH 9322 from pandas.core.indexes.accessors import ( @@ -712,9 +743,18 @@ def test_date_tz(self): tz="US/Eastern", ) ser = Series(rng) - expected = Series([date(2014, 4, 4), date(2014, 7, 18), date(2015, 11, 22)]) + + warn_msg = ( + "Pandas type inference with a sequence of `datetime.date` objects " + "is deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + expected = Series([date(2014, 4, 4), date(2014, 7, 18), date(2015, 11, 22)]) tm.assert_series_equal(ser.dt.date, expected) - tm.assert_series_equal(ser.apply(lambda x: x.date()), expected) + + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + res = ser.apply(lambda x: x.date()) + tm.assert_series_equal(res, expected) def test_dt_timetz_accessor(self, tz_naive_fixture): # 
GH21358 @@ -725,7 +765,8 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): ) ser = Series(dtindex) expected = Series( - [time(23, 56, tzinfo=tz), time(21, 24, tzinfo=tz), time(22, 14, tzinfo=tz)] + [time(23, 56, tzinfo=tz), time(21, 24, tzinfo=tz), time(22, 14, tzinfo=tz)], + dtype=object, ) result = ser.dt.timetz tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index d72c8599dfe5e..5085a4b4f742a 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -25,6 +25,7 @@ Timestamp, cut, date_range, + option_context, ) import pandas._testing as tm @@ -377,7 +378,10 @@ def test_astype_cast_object_int(self): tm.assert_series_equal(result, Series(np.arange(1, 5))) - def test_astype_unicode(self): + @pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] + ) + def test_astype_unicode(self, future): # see GH#7758: A bit of magic is required to set # default encoding to utf-8 digits = string.digits @@ -388,11 +392,18 @@ def test_astype_unicode(self): former_encoding = None + warn = FutureWarning if future is None else None + if sys.getdefaultencoding() == "utf-8": # GH#45326 as of 2.0 Series.astype matches Index.astype by handling # bytes with obj.decode() instead of str(obj) item = "野菜食べないとやばい" - ser = Series([item.encode()]) + + msg = "type inference with a sequence of `bytes` objects" + with option_context("future.infer_bytes", future): + with tm.assert_produces_warning(warn, match=msg): + ser = Series([item.encode()]) + result = ser.astype("unicode") expected = Series([item]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index f2ac5f1086625..5a60ab63ce123 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ 
b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -225,12 +227,23 @@ def test_convert_bool_dtype(self): df = pd.DataFrame({"A": pd.array([True])}) tm.assert_frame_equal(df, df.convert_dtypes()) - def test_convert_byte_string_dtype(self): + @pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] + ) + def test_convert_byte_string_dtype(self, future): # GH-43183 byte_str = b"binary-string" - df = pd.DataFrame(data={"A": byte_str}, index=[0]) - result = df.convert_dtypes() + with pd.option_context("future.infer_bytes", future): + warn_msg = "type inference with a `bytes` object" + warn = FutureWarning if future is None else None + with tm.assert_produces_warning(warn, match=warn_msg): + df = pd.DataFrame(data={"A": byte_str}, index=[0]) + + warn_msg2 = "type inference with a sequence of `bytes` objects" + with tm.assert_produces_warning(warn, match=warn_msg2): + result = df.convert_dtypes() + expected = df tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index a0edfae606e3f..8a63b9c4906f6 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -219,9 +219,8 @@ def test_add_with_duplicate_index(self): tm.assert_series_equal(result, expected) def test_add_na_handling(self): - ser = Series( - [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)] - ) + index = Index([date(2012, 1, 1), date(2012, 1, 2)], dtype=object) + ser = Series([Decimal("1.3"), Decimal("2.3")], index=index) result = ser + ser.shift(1) result2 = ser.shift(1) + ser @@ -761,7 +760,13 @@ def test_align_date_objects_with_datetimeindex(self): ts_slice = ts[5:] ts2 = ts_slice.copy() - ts2.index = [x.date() for x in ts2.index] + + warn_msg = ( + "Pandas type inference 
with a sequence of `datetime.date` objects " + "is deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + ts2.index = [x.date() for x in ts2.index] result = ts + ts2 result2 = ts2 + ts diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ceb283ca9e9e7..6f5d9e1424e7c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1043,7 +1043,12 @@ def test_constructor_dtype_datetime64_7(self): # leave datetime.date alone dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object) - series1 = Series(dates2, dates) + warn_msg = ( + "Pandas type inference with a sequence of `datetime.date` objects " + "is deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + series1 = Series(dates2, dates) tm.assert_numpy_array_equal(series1.values, dates2) assert series1.dtype == object diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index c439a5f006922..a20167034e346 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -1,11 +1,14 @@ import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, MultiIndex, Series, _testing as tm, + option_context, ) from pandas.core.strings.accessor import StringMethods @@ -31,7 +34,27 @@ def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype): box = index_or_series inferred_dtype, values = any_skipna_inferred_dtype - t = box(values, dtype=dtype) # explicit dtype to avoid casting + warn_msg = ( + "Pandas type inference with a sequence of `datetime.time` objects " + "is deprecated" + ) + warn = None + if dtype == "category" and inferred_dtype == "time": + warn = FutureWarning + if dtype == "category" and inferred_dtype == "bytes": + warn = FutureWarning + warn_msg = ( + "Pandas type inference with a sequence of `bytes` objects is deprecated" + ) + if dtype == 
"category" and inferred_dtype == "date": + warn = FutureWarning + warn_msg = ( + "Pandas type inference with a sequence of `datetime.date` objects " + "is deprecated" + ) + + with tm.assert_produces_warning(warn, match=warn_msg): + t = box(values, dtype=dtype) # explicit dtype to avoid casting types_passing_constructor = [ "string", @@ -52,12 +75,16 @@ def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype): assert not hasattr(t, "str") +@pytest.mark.parametrize( + "future_bytes", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) @pytest.mark.parametrize("dtype", [object, "category"]) def test_api_per_method( index_or_series, dtype, any_allowed_skipna_inferred_dtype, any_string_method, + future_bytes, request, ): # this test does not check correctness of the different methods, @@ -94,7 +121,14 @@ def test_api_per_method( mark = pytest.mark.xfail(raises=raises, reason=reason) request.node.add_marker(mark) - t = box(values, dtype=dtype) # explicit dtype to avoid casting + warn = None + warn_msg = "type inference with a sequence of `bytes` objects" + if dtype == "category" and inferred_dtype == "bytes" and future_bytes is None: + warn = FutureWarning + + with option_context("future.infer_bytes", future_bytes): + with tm.assert_produces_warning(warn, match=warn_msg): + t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) bytes_allowed = method_name in ["decode", "get", "len", "slice"] diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index d4807a8eedaaa..ec693dbfd4989 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -6,11 +6,14 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, MultiIndex, Series, + option_context, ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods @@ -523,34 +526,54 @@ def 
test_encode_decode(any_string_dtype): tm.assert_series_equal(result, expected) -def test_encode_errors_kwarg(any_string_dtype): - ser = Series(["a", "b", "a\x9d"], dtype=any_string_dtype) - - msg = ( - r"'charmap' codec can't encode character '\\x9d' in position 1: " - "character maps to <undefined>" - ) - with pytest.raises(UnicodeEncodeError, match=msg): - ser.str.encode("cp1252") - - result = ser.str.encode("cp1252", "ignore") - expected = ser.map(lambda x: x.encode("cp1252", "ignore")) - tm.assert_series_equal(result, expected) - +@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_encode_errors_kwarg(any_string_dtype, future): + with option_context("future.infer_bytes", future): + ser = Series(["a", "b", "a\x9d"], dtype=any_string_dtype) + + msg = ( + r"'charmap' codec can't encode character '\\x9d' in position 1: " + "character maps to <undefined>" + ) + with pytest.raises(UnicodeEncodeError, match=msg): + ser.str.encode("cp1252") + + result = ser.str.encode("cp1252", "ignore") + + warn_msg = "type inference with a sequence of `bytes` objects" + warn = FutureWarning if future is None else None + with tm.assert_produces_warning(warn, match=warn_msg): + expected = ser.map(lambda x: x.encode("cp1252", "ignore")) + + if future is True: + expected = expected.astype(object) + tm.assert_series_equal(result, expected) -def test_decode_errors_kwarg(): - ser = Series([b"a", b"b", b"a\x9d"]) - msg = ( - "'charmap' codec can't decode byte 0x9d in position 1: " - "character maps to <undefined>" - ) - with pytest.raises(UnicodeDecodeError, match=msg): - ser.str.decode("cp1252") - - result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")) - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_decode_errors_kwarg(future): + warn_msg = "type inference with a sequence of `bytes` objects" + warn
= FutureWarning if future is None else None + with option_context("future.infer_bytes", future): + with tm.assert_produces_warning(warn, match=warn_msg): + ser = Series([b"a", b"b", b"a\x9d"]) + + msg = ( + "'charmap' codec can't decode byte 0x9d in position 1: " + "character maps to <undefined>" + ) + with pytest.raises(UnicodeDecodeError, match=msg): + ser.str.decode("cp1252") + + result = ser.str.decode("cp1252", "ignore") + expected = ser.map(lambda x: x.decode("cp1252", "ignore")) + if future: + expected = expected.astype("string[pyarrow]") + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -643,9 +666,18 @@ def test_str_accessor_no_new_attributes(any_string_dtype): ser.str.xlabel = "a" -def test_cat_on_bytes_raises(): - lhs = Series(np.array(list("abc"), "S1").astype(object)) - rhs = Series(np.array(list("def"), "S1").astype(object)) +@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_cat_on_bytes_raises(future): + warn_msg = "type inference with a sequence of `bytes` objects" + warn = FutureWarning if future is None else None + + with option_context("future.infer_bytes", future): + with tm.assert_produces_warning(warn, match=warn_msg): + lhs = Series(np.array(list("abc"), "S1").astype(object)) + rhs = Series(np.array(list("def"), "S1").astype(object)) + msg = "Cannot use .str.cat with values of inferred dtype 'bytes'" with pytest.raises(TypeError, match=msg): lhs.str.cat(rhs) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8c5f9a894f2f7..f920c141366c9 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -269,13 +269,18 @@ def test_subsets_multiindex_dtype(self): tm.assert_series_equal(result, expected) def test_datetime_object_multiindex(self): + msg = ( + "Pandas type inference with a sequence of `datetime.date` " + "objects is deprecated" + ) data_dic = { (0, datetime.date(2018, 3, 3)): {"A": 1, "B": 10}, (0,
datetime.date(2018, 3, 4)): {"A": 2, "B": 11}, (1, datetime.date(2018, 3, 3)): {"A": 3, "B": 12}, (1, datetime.date(2018, 3, 4)): {"A": 4, "B": 13}, } - result = DataFrame.from_dict(data_dic, orient="index") + with tm.assert_produces_warning(FutureWarning, match=msg): + result = DataFrame.from_dict(data_dic, orient="index") data = {"A": [1, 2, 3, 4], "B": [10, 11, 12, 13]} index = [ [0, 0, 1, 1], @@ -286,7 +291,8 @@ def test_datetime_object_multiindex(self): datetime.date(2018, 3, 4), ], ] - expected = DataFrame(data=data, index=index) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = DataFrame(data=data, index=index) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/tools/test_to_time.py b/pandas/tests/tools/test_to_time.py index 5046fd9d0edc1..eb987d8a63b39 100644 --- a/pandas/tests/tools/test_to_time.py +++ b/pandas/tests/tools/test_to_time.py @@ -61,9 +61,15 @@ def test_arraylike(self): with pytest.raises(ValueError, match=msg): to_time(arg, format="%I:%M%p", errors="raise") - tm.assert_series_equal( - to_time(Series(arg, name="test")), Series(expected_arr, name="test") + warn_msg = ( + "Pandas type inference with a sequence of `datetime.time` objects " + "is deprecated" ) + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + res_ser = to_time(Series(arg, name="test")) + exp_ser = Series(expected_arr, name="test", dtype=object) + + tm.assert_series_equal(res_ser, exp_ser) res = to_time(np.array(arg)) assert isinstance(res, list) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index c2977b81a9b4a..f75ba2eef8750 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -309,11 +311,22 @@ def test_invalid_key(): hash_pandas_object(Series(list("abc")), hash_key="foo") -def test_already_encoded(index): 
+@pytest.mark.parametrize( + "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None] +) +def test_already_encoded(index, future): # If already encoded, then ok. - obj = Series(list("abc")).str.encode("utf8") - a = hash_pandas_object(obj, index=index) - b = hash_pandas_object(obj, index=index) + msg = "type inference with a sequence of `bytes` objects" + warn = None + if future is None: + warn = FutureWarning + + with pd.option_context("future.infer_bytes", future): + obj = Series(list("abc")).str.encode("utf8") + with tm.assert_produces_warning(warn, match=msg): + a = hash_pandas_object(obj, index=index) + b = hash_pandas_object(obj, index=index) + tm.assert_series_equal(a, b)