Skip to content

Commit d27f15f

Browse files
committed
DEPR: infer bytes to bytes[pyarrow]
1 parent 5a10bc7 commit d27f15f

22 files changed

+460
-99
lines changed

pandas/_libs/lib.pyx

+39-2
Original file line numberDiff line numberDiff line change
@@ -1272,6 +1272,7 @@ cdef class Seen:
12721272
bint interval_ # seen_interval
12731273
bint time_
12741274
bint date_
1275+
bint bytes_
12751276

12761277
def __cinit__(self, bint coerce_numeric=False):
12771278
"""
@@ -1300,6 +1301,7 @@ cdef class Seen:
13001301
self.interval_ = False
13011302
self.time_ = False
13021303
self.date_ = False
1304+
self.bytes_ = False
13031305
self.coerce_numeric = coerce_numeric
13041306

13051307
cdef bint check_uint64_conflict(self) except -1:
@@ -2588,6 +2590,12 @@ def maybe_convert_objects(ndarray[object] objects,
25882590
else:
25892591
seen.object_ = True
25902592
break
2593+
elif isinstance(val, bytes):
2594+
if convert_non_numeric:
2595+
seen.bytes_ = True
2596+
else:
2597+
seen.object_ = True
2598+
break
25912599
elif PyTime_Check(val):
25922600
if convert_non_numeric and val.tzinfo is None:
25932601
seen.time_ = True
@@ -2598,8 +2606,37 @@ def maybe_convert_objects(ndarray[object] objects,
25982606
seen.object_ = True
25992607
break
26002608

2601-
# we try to coerce datetime w/tz but must all have the same tz
2602-
if seen.datetimetz_:
2609+
if seen.bytes_:
2610+
if is_bytes_array(objects):
2611+
opt = get_option("future.infer_bytes")
2612+
if opt is True:
2613+
import pyarrow as pa
2614+
2615+
from pandas.core.dtypes.dtypes import ArrowDtype
2616+
2617+
obj = pa.array(objects)
2618+
dtype = ArrowDtype(obj.type)
2619+
return dtype.construct_array_type()(obj)
2620+
elif opt is False:
2621+
# explicitly set to keep the old behavior and avoid the warning
2622+
pass
2623+
else:
2624+
from pandas.util._exceptions import find_stack_level
2625+
warnings.warn(
2626+
"Pandas type inference with a sequence of `bytes` "
2627+
"objects is deprecated. In a future version, this will give "
2628+
"bytes[pyarrow] dtype, which will require pyarrow to be "
2629+
"installed. To opt in to the new behavior immediately set "
2630+
"`pd.set_option('future.infer_bytes', True)`. To keep the "
2631+
"old behavior pass `dtype=object`.",
2632+
FutureWarning,
2633+
stacklevel=find_stack_level(),
2634+
)
2635+
2636+
seen.object_ = True
2637+
2638+
elif seen.datetimetz_:
2639+
# we try to coerce datetime w/tz but must all have the same tz
26032640
if is_datetime_with_singletz_array(objects):
26042641
from pandas import DatetimeIndex
26052642

pandas/core/config_init.py

+8
Original file line numberDiff line numberDiff line change
@@ -892,6 +892,14 @@ def register_converter_cb(key) -> None:
892892

893893

894894
with cf.config_prefix("future"):
895+
cf.register_option(
896+
"future.infer_bytes",
897+
None,
898+
"Whether to infer sequence of bytes objects as pyarrow bytes "
899+
"dtype, which will be the default in pandas 3.0 "
900+
"(at which point this option will be deprecated).",
901+
validator=is_one_of_factory([True, False, None]),
902+
)
895903
cf.register_option(
896904
"future.infer_time",
897905
None,

pandas/core/construction.py

+24
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,30 @@ def array(
415415
stacklevel=find_stack_level(),
416416
)
417417

418+
elif inferred_dtype == "bytes":
419+
opt = get_option("future.infer_bytes")
420+
421+
if opt is True:
422+
import pyarrow as pa
423+
424+
obj = pa.array(data)
425+
dtype = ArrowDtype(obj.type)
426+
return dtype.construct_array_type()(obj)
427+
elif opt is False:
428+
# explicitly set to keep the old behavior and avoid the warning
429+
pass
430+
else:
431+
warnings.warn(
432+
"Pandas type inference with a sequence of `bytes` "
433+
"objects is deprecated. In a future version, this will give "
434+
"bytes[pyarrow] dtype, which will require pyarrow to be "
435+
"installed. To opt in to the new behavior immediately set "
436+
"`pd.set_option('future.infer_bytes', True)`. To keep the "
437+
"old behavior pass `dtype=object`.",
438+
FutureWarning,
439+
stacklevel=find_stack_level(),
440+
)
441+
418442
# Pandas overrides NumPy for
419443
# 1. datetime64[ns,us,ms,s]
420444
# 2. timedelta64[ns,us,ms,s]

pandas/core/dtypes/cast.py

+18
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,25 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
849849
import pyarrow as pa
850850

851851
pa_dtype = pa.date32()
852+
dtype = ArrowDtype(pa_dtype)
853+
854+
elif isinstance(val, bytes):
855+
opt = get_option("future.infer_bytes")
856+
if opt is None:
857+
warnings.warn(
858+
"Pandas type inference with a `bytes` "
859+
"object is deprecated. In a future version, this will give "
860+
"bytes[pyarrow] dtype, which will require pyarrow to be "
861+
"installed. To opt in to the new behavior immediately set "
862+
"`pd.set_option('future.infer_bytes', True)`. To keep the "
863+
"old behavior pass `dtype=object`.",
864+
FutureWarning,
865+
stacklevel=find_stack_level(),
866+
)
867+
elif opt is True:
868+
import pyarrow as pa
852869

870+
pa_dtype = pa.binary()
853871
dtype = ArrowDtype(pa_dtype)
854872

855873
elif is_bool(val):

pandas/core/strings/accessor.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1961,7 +1961,13 @@ def decode(self, encoding, errors: str = "strict"):
19611961
f = lambda x: decoder(x, errors)[0]
19621962
arr = self._data.array
19631963
# assert isinstance(arr, (StringArray,))
1964-
result = arr._str_map(f)
1964+
1965+
if isinstance(arr.dtype, ArrowDtype):
1966+
# TODO: is there a performant way to do this?
1967+
res_values = arr.map(f)
1968+
result = type(arr)._from_sequence(res_values)
1969+
else:
1970+
result = arr._str_map(f)
19651971
return self._wrap_result(result)
19661972

19671973
@forbid_nonstring_types(["bytes"])

pandas/io/pytables.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -5066,7 +5066,16 @@ def _unconvert_string_array(
50665066
dtype = f"U{itemsize}"
50675067

50685068
if isinstance(data[0], bytes):
5069-
data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
5069+
with warnings.catch_warnings():
5070+
# Suppress the FutureWarning about inferring bytes as bytes[pyarrow] dtype
5071+
# TODO: try to avoid this altogether
5072+
warnings.filterwarnings("ignore", category=FutureWarning)
5073+
5074+
data = (
5075+
Series(data, copy=False).str.decode(encoding, errors=errors)._values
5076+
).astype(object, copy=False)
5077+
# TODO: if we have pyarrow str instead of object here to begin
5078+
# with, can we avoid object dtype cast here?
50705079
else:
50715080
data = data.astype(dtype, copy=False).astype(object, copy=False)
50725081

pandas/io/stata.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -2910,7 +2910,13 @@ def _prepare_data(self) -> np.recarray:
29102910
for i, col in enumerate(data):
29112911
typ = typlist[i]
29122912
if typ <= self._max_string_length:
2913-
data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,))
2913+
with warnings.catch_warnings():
2914+
# inferring a sequence of bytes is deprecated and will give
2915+
# bytes[pyarrow] in the future; silence that warning here
2916+
# TODO: can we avoid this altogether?
2917+
warnings.filterwarnings("ignore", category=FutureWarning)
2918+
2919+
data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,))
29142920
stype = f"S{typ}"
29152921
dtypes[col] = stype
29162922
data[col] = data[col].astype(stype)

pandas/tests/dtypes/cast/test_infer_dtype.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,14 @@ def test_infer_dtype_from_scalar_errors():
163163
],
164164
)
165165
def test_infer_dtype_from_scalar(value, expected):
166-
dtype, _ = infer_dtype_from_scalar(value)
166+
msg = "type inference with a `bytes` object is deprecated"
167+
warn = None
168+
if isinstance(value, bytes):
169+
warn = FutureWarning
170+
171+
with tm.assert_produces_warning(warn, match=msg):
172+
dtype, _ = infer_dtype_from_scalar(value)
173+
167174
assert is_dtype_equal(dtype, expected)
168175

169176
with pytest.raises(TypeError, match="must be list-like"):

pandas/tests/dtypes/cast/test_promote.py

+21-3
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,13 @@ def test_maybe_promote_any_with_bytes(any_numpy_dtype):
311311
# output is not a generic bytes, but corresponds to expected_dtype
312312
exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0]
313313

314-
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
314+
msg = "type inference with a `bytes` object"
315+
warn = None
316+
if any_numpy_dtype in ["timedelta64[ns]", "datetime64[ns]"]:
317+
warn = FutureWarning
318+
319+
with tm.assert_produces_warning(warn, match=msg):
320+
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
315321

316322

317323
def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype):
@@ -330,7 +336,13 @@ def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype):
330336
expected_dtype = np.dtype(object)
331337
exp_val_for_scalar = fill_value
332338

333-
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
339+
msg = "type inference with a `bytes` object is deprecated"
340+
warn = None
341+
if any_numpy_dtype is bytes and datetime64_dtype == "datetime64[ns]":
342+
warn = FutureWarning
343+
344+
with tm.assert_produces_warning(warn, match=msg):
345+
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
334346

335347

336348
@pytest.mark.parametrize(
@@ -413,7 +425,13 @@ def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, any_numpy_dtype):
413425
expected_dtype = np.dtype(object)
414426
exp_val_for_scalar = fill_value
415427

416-
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
428+
msg = "type inference with a `bytes` object is deprecated"
429+
warn = None
430+
if any_numpy_dtype is bytes and timedelta64_dtype == "timedelta64[ns]":
431+
warn = FutureWarning
432+
433+
with tm.assert_produces_warning(warn, match=msg):
434+
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
417435

418436

419437
@pytest.mark.parametrize(

pandas/tests/extension/test_arrow.py

+6
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,9 @@ def test_stack(self, data, columns):
739739
warn_msg = (
740740
"Pandas type inference with a sequence of `datetime.date` objects"
741741
)
742+
if pa.types.is_binary(pa_dtype):
743+
warn = FutureWarning
744+
warn_msg = "Pandas type inference with a sequence of `bytes` objects"
742745

743746
with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False):
744747
super().test_stack(data, columns)
@@ -813,6 +816,9 @@ def test_hash_pandas_object_works(self, data, as_frame):
813816
# TODO(#48964) This warning will be avoided by implementing
814817
# ArrowExtensionArray.hash_pandas_object
815818
warn = FutureWarning
819+
elif pa.types.is_binary(pa_dtype):
820+
warn_msg = "Pandas type inference with a sequence of `bytes`"
821+
warn = FutureWarning
816822

817823
with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False):
818824
super().test_hash_pandas_object_works(data, as_frame)

pandas/tests/frame/methods/test_filter.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33

4+
import pandas.util._test_decorators as td
5+
46
import pandas as pd
57
from pandas import DataFrame
68
import pandas._testing as tm
@@ -112,11 +114,18 @@ def test_filter_unicode(self, name, expected):
112114
tm.assert_frame_equal(df.filter(like=name), expected)
113115
tm.assert_frame_equal(df.filter(regex=name), expected)
114116

117+
@pytest.mark.parametrize(
118+
"future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None]
119+
)
115120
@pytest.mark.parametrize("name", ["a", "a"])
116-
def test_filter_bytestring(self, name):
121+
def test_filter_bytestring(self, name, future):
117122
# GH13101
118-
df = DataFrame({b"a": [1, 2], b"b": [3, 4]})
119-
expected = DataFrame({b"a": [1, 2]})
123+
warn = FutureWarning if future is None else None
124+
msg = "type inference with a sequence of `bytes` objects"
125+
with tm.assert_produces_warning(warn, match=msg):
126+
with pd.option_context("future.infer_bytes", future):
127+
df = DataFrame({b"a": [1, 2], b"b": [3, 4]})
128+
expected = DataFrame({b"a": [1, 2]})
120129

121130
tm.assert_frame_equal(df.filter(like=name), expected)
122131
tm.assert_frame_equal(df.filter(regex=name), expected)

pandas/tests/io/formats/test_to_string.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
option_context,
1414
to_datetime,
1515
)
16+
import pandas._testing as tm
1617

1718

1819
def test_repr_embedded_ndarray():
@@ -172,10 +173,13 @@ def test_to_string_unicode_columns(float_frame):
172173

173174

174175
def test_to_string_utf8_columns():
176+
msg = "type inference with a sequence of `bytes` objects"
177+
175178
n = "\u05d0".encode()
176179

177180
with option_context("display.max_rows", 1):
178-
df = DataFrame([1, 2], columns=[n])
181+
with tm.assert_produces_warning(FutureWarning, match=msg):
182+
df = DataFrame([1, 2], columns=[n])
179183
repr(df)
180184

181185

pandas/tests/io/pytables/test_store.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
import numpy as np
1212
import pytest
1313

14+
import pandas.util._test_decorators as td
15+
1416
import pandas as pd
1517
from pandas import (
1618
DataFrame,
@@ -324,16 +326,21 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
324326
tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))
325327

326328

329+
@pytest.mark.parametrize(
330+
"future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None]
331+
)
327332
@pytest.mark.parametrize("format", ["fixed", "table"])
328-
def test_to_hdf_errors(tmp_path, format, setup_path):
333+
def test_to_hdf_errors(tmp_path, format, setup_path, future):
329334
data = ["\ud800foo"]
330-
ser = Series(data, index=Index(data))
331-
path = tmp_path / setup_path
332-
# GH 20835
333-
ser.to_hdf(path, "table", format=format, errors="surrogatepass")
334335

335-
result = read_hdf(path, "table", errors="surrogatepass")
336-
tm.assert_series_equal(result, ser)
336+
with pd.option_context("future.infer_bytes", future):
337+
ser = Series(data, index=Index(data))
338+
path = tmp_path / setup_path
339+
# GH 20835
340+
ser.to_hdf(path, "table", format=format, errors="surrogatepass")
341+
342+
result = read_hdf(path, "table", errors="surrogatepass")
343+
tm.assert_series_equal(result, ser)
337344

338345

339346
def test_create_table_index(setup_path):

0 commit comments

Comments
 (0)