Skip to content

DEPR: infer bytes to bytes[pyarrow] #53357

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 28 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
bae9342
POC: infer time objects to ArrowDtype[time]
jbrockmendel May 1, 2023
d0cce3d
skip if no pyarrow
jbrockmendel May 1, 2023
5857d67
dt.time
jbrockmendel May 3, 2023
fa6421c
mypy fixup
jbrockmendel May 3, 2023
d498ea1
Handle construction from scalar
jbrockmendel May 4, 2023
037a0c6
fix timetz sqlite test
jbrockmendel May 4, 2023
94fd426
update
jbrockmendel May 17, 2023
2b80d58
update test
jbrockmendel May 19, 2023
b365fe2
update test
jbrockmendel May 19, 2023
8971611
unstrict xfail
jbrockmendel May 20, 2023
9e0b4bd
lint fixup
jbrockmendel May 20, 2023
3d9ec01
Fix doctest, typo
jbrockmendel May 22, 2023
65a702c
remove extra import
jbrockmendel May 30, 2023
24f848b
doctest warnings
jbrockmendel May 31, 2023
1f01e6e
dt.time
jbrockmendel May 3, 2023
7568a8b
dt.time
jbrockmendel May 3, 2023
0a7562a
Handle construction from scalar
jbrockmendel May 4, 2023
62d03ca
update
jbrockmendel May 17, 2023
47e1601
Fix doctest, typo
jbrockmendel May 22, 2023
423496a
POC: infer time objects to ArrowDtype[time]
jbrockmendel May 1, 2023
ce1bc00
Handle construction from scalar
jbrockmendel May 4, 2023
ae01101
POC: infer time objects to ArrowDtype[time]
jbrockmendel May 1, 2023
208789c
Handle construction from scalar
jbrockmendel May 4, 2023
f6d055d
ENH/DEPR: infer date objects to date[pyarrow] dtype
jbrockmendel May 10, 2023
5ff5333
deprecate inference with scalar date
jbrockmendel May 10, 2023
70470f7
remove extra import
jbrockmendel May 19, 2023
985e7af
whitespace fixup
jbrockmendel May 30, 2023
a3bd444
DEPR: infer bytes to bytes[pyarrow]
jbrockmendel May 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 116 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from typing import (
Literal,
_GenericAlias,
)
import warnings

cimport cython
from cpython.datetime cimport (
Expand Down Expand Up @@ -99,6 +100,8 @@ cdef extern from "pandas/parser/pd_parser.h":

PandasParser_IMPORT

from pandas._config import get_option

from pandas._libs cimport util
from pandas._libs.util cimport (
INT64_MAX,
Expand Down Expand Up @@ -1267,6 +1270,9 @@ cdef class Seen:
bint datetimetz_ # seen_datetimetz
bint period_ # seen_period
bint interval_ # seen_interval
bint time_
bint date_
bint bytes_

def __cinit__(self, bint coerce_numeric=False):
"""
Expand All @@ -1293,6 +1299,9 @@ cdef class Seen:
self.datetimetz_ = False
self.period_ = False
self.interval_ = False
self.time_ = False
self.date_ = False
self.bytes_ = False
self.coerce_numeric = coerce_numeric

cdef bint check_uint64_conflict(self) except -1:
Expand Down Expand Up @@ -2560,6 +2569,11 @@ def maybe_convert_objects(ndarray[object] objects,
else:
seen.object_ = True
break
elif PyDate_Check(val):
if convert_non_numeric:
seen.date_ = True
else:
seen.object_ = True
elif is_period_object(val):
if convert_non_numeric:
seen.period_ = True
Expand All @@ -2583,12 +2597,53 @@ def maybe_convert_objects(ndarray[object] objects,
else:
seen.object_ = True
break
elif isinstance(val, bytes):
if convert_non_numeric:
seen.bytes_ = True
else:
seen.object_ = True
break
elif PyTime_Check(val):
if convert_non_numeric and val.tzinfo is None:
seen.time_ = True
else:
seen.object_ = True
break
else:
seen.object_ = True
break

# we try to coerce datetime w/tz but must all have the same tz
if seen.datetimetz_:
if seen.bytes_:
if is_bytes_array(objects):
opt = get_option("future.infer_bytes")
if opt is True:
import pyarrow as pa

from pandas.core.dtypes.dtypes import ArrowDtype

obj = pa.array(objects)
dtype = ArrowDtype(obj.type)
return dtype.construct_array_type()(obj)
elif opt is False:
# explicitly set to keep the old behavior and avoid the warning
pass
else:
from pandas.util._exceptions import find_stack_level
warnings.warn(
"Pandas type inference with a sequence of `bytes` "
"objects is deprecated. In a future version, this will give "
"bytes[pyarrow] dtype, which will require pyarrow to be "
"installed. To opt in to the new behavior immediately set "
"`pd.set_option('future.infer_bytes', True)`. To keep the "
"old behavior pass `dtype=object`.",
FutureWarning,
stacklevel=find_stack_level(),
)

seen.object_ = True

elif seen.datetimetz_:
# we try to coerce datetime w/tz but must all have the same tz
if is_datetime_with_singletz_array(objects):
from pandas import DatetimeIndex

Expand Down Expand Up @@ -2647,6 +2702,65 @@ def maybe_convert_objects(ndarray[object] objects,

seen.object_ = True

elif seen.time_:
if is_time_array(objects):
# FIXME: need to ensure this is not timetz
opt = get_option("future.infer_time")
if opt is True:
import pyarrow as pa

from pandas.core.dtypes.dtypes import ArrowDtype

obj = pa.array(objects)
dtype = ArrowDtype(obj.type)
return dtype.construct_array_type()(obj)
elif opt is False:
# explicitly set to keep the old behavior and avoid the warning
pass
else:
from pandas.util._exceptions import find_stack_level
warnings.warn(
"Pandas type inference with a sequence of `datetime.time` "
"objects is deprecated. In a future version, this will give "
"time32[pyarrow] dtype, which will require pyarrow to be "
"installed. To opt in to the new behavior immediately set "
"`pd.set_option('future.infer_time', True)`. To keep the "
"old behavior pass `dtype=object`.",
FutureWarning,
stacklevel=find_stack_level(),
)

seen.object_ = True

elif seen.date_:
if is_date_array(objects, skipna=True):
opt = get_option("future.infer_date")
if opt is True:
import pyarrow as pa

from pandas.core.dtypes.dtypes import ArrowDtype

obj = pa.array(objects)
dtype = ArrowDtype(obj.type)
return dtype.construct_array_type()(obj)
elif opt is False:
# explicitly set to keep the old behavior and avoid the warning
pass
else:
from pandas.util._exceptions import find_stack_level
warnings.warn(
"Pandas type inference with a sequence of `datetime.date` "
"objects is deprecated. In a future version, this will give "
"date32[pyarrow] dtype, which will require pyarrow to be "
"installed. To opt in to the new behavior immediately set "
"`pd.set_option('future.infer_date', True)`. To keep the "
"old behavior pass `dtype=object`.",
FutureWarning,
stacklevel=find_stack_level(),
)

seen.object_ = True

elif seen.nat_:
if not seen.object_ and not seen.numeric_ and not seen.bool_:
# all NaT, None, or nan (at least one NaT)
Expand Down
5 changes: 5 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ def pytest_collection_modifyitems(items, config) -> None:
# Warnings from doctests that can be ignored; place reason in comment above.
# Each entry specifies (path, message) - see the ignore_doctest_warning function
ignored_doctest_warnings = [
("DatetimeProperties.time", "with pyarrow time dtype"),
("DatetimeArray.time", "with pyarrow time dtype"),
("DatetimeIndex.time", "with pyarrow time dtype"),
("is_int64_dtype", "is_int64_dtype is deprecated"),
("is_interval_dtype", "is_interval_dtype is deprecated"),
("is_period_dtype", "is_period_dtype is deprecated"),
Expand All @@ -137,6 +140,8 @@ def pytest_collection_modifyitems(items, config) -> None:
("is_sparse", "is_sparse is deprecated"),
("NDFrame.replace", "The 'method' keyword"),
("NDFrame.replace", "Series.replace without 'value'"),
("DatetimeArray.time", "with pyarrow time dtype"),
("DatetimeIndex.time", "with pyarrow time dtype"),
# Docstring divides by zero to show behavior difference
("missing.mask_zero_div_zero", "divide by zero encountered"),
(
Expand Down
10 changes: 9 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,15 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
if pc_func is NotImplemented:
raise NotImplementedError(f"{op.__name__} not implemented.")

result = pc_func(self._pa_array, other)
try:
result = pc_func(self._pa_array, other)
except pa.lib.ArrowNotImplementedError:
if op in [operator.add, roperator.radd, operator.sub, roperator.rsub]:
# By returning NotImplemented we get standard message with a
# TypeError
return NotImplemented
raise

return type(self)(result)

def _logical_method(self, other, op):
Expand Down
36 changes: 33 additions & 3 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

import numpy as np

from pandas._config import get_option

from pandas._libs import (
lib,
tslib,
Expand Down Expand Up @@ -55,9 +57,11 @@
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
DatetimeTZDtype,
ExtensionDtype,
PeriodDtype,
ArrowDtype,
)
from pandas.core.dtypes.missing import isna

Expand All @@ -82,7 +86,10 @@
)

from pandas import DataFrame
from pandas.core.arrays import PeriodArray
from pandas.core.arrays import (
ArrowExtensionArray,
PeriodArray,
)

_midnight = time(0, 0)

Expand Down Expand Up @@ -1335,7 +1342,7 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]:
return result

@property
def time(self) -> npt.NDArray[np.object_]:
def time(self) -> npt.NDArray[np.object_] | ArrowExtensionArray:
"""
Returns numpy array of :class:`datetime.time` objects.

Expand Down Expand Up @@ -1368,7 +1375,30 @@ def time(self) -> npt.NDArray[np.object_]:
# keeping their timezone and not using UTC
timestamps = self._local_timestamps()

return ints_to_pydatetime(timestamps, box="time", reso=self._creso)
result = ints_to_pydatetime(timestamps, box="time", reso=self._creso)

opt = get_option("future.infer_time")
if opt is None:
warnings.warn(
f"The behavior of {type(self).__name__}.time is deprecated. "
"In a future version, this will return an array with pyarrow time "
"dtype instead of object dtype. To opt in to the future behavior, "
"set `pd.set_option('future.infer_time', True)`.",
FutureWarning,
stacklevel=find_stack_level(),
)
elif opt is True:
# TODO: optimize this to avoid going through ints_to_pydatetime
import pyarrow as pa

pa_type = pa.time64(self.unit)
result[self.isna()] = None
obj = pa.array(result, type=pa_type)
dtype = ArrowDtype(obj.type)
out = dtype.construct_array_type()(obj)
return out

return result

@property
def timetz(self) -> npt.NDArray[np.object_]:
Expand Down
28 changes: 28 additions & 0 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,3 +889,31 @@ def register_converter_cb(key) -> None:
styler_environment,
validator=is_instance_factory([type(None), str]),
)


# Opt-in switches for upcoming pyarrow-backed type inference.
#
# Each option is tri-state:
#   None  -> keep the current behavior but emit a FutureWarning (default)
#   True  -> opt in to the future pyarrow-backed inference
#   False -> keep the current behavior and silence the warning
#
# ``cf.config_prefix("future")`` already prepends "future." to every key
# registered inside the block, so the keys are given WITHOUT that prefix.
# Registering "future.infer_time" here would create the option
# "future.future.infer_time", reachable as "future.infer_time" only through
# the option machinery's regex fallback.
with cf.config_prefix("future"):
    cf.register_option(
        "infer_bytes",
        None,
        "Whether to infer sequence of bytes objects as pyarrow bytes "
        "dtype, which will be the default in pandas 3.0 "
        "(at which point this option will be deprecated).",
        validator=is_one_of_factory([True, False, None]),
    )
    cf.register_option(
        "infer_time",
        None,
        "Whether to infer sequence of datetime.time objects as pyarrow time "
        "dtype, which will be the default in pandas 3.0 "
        "(at which point this option will be deprecated).",
        validator=is_one_of_factory([True, False, None]),
    )
    cf.register_option(
        "infer_date",
        None,
        "Whether to infer sequence of datetime.date objects as pyarrow date "
        "dtype, which will be the default in pandas 3.0 "
        "(at which point this option will be deprecated).",
        validator=is_one_of_factory([True, False, None]),
    )
Loading