pandas-dev
diff --git a/‎pandas/_libs/lib.pyx
+39-2 b/‎pandas/_libs/lib.pyx
+39-2
diff --git a/‎pandas/core/config_init.py
+8 b/‎pandas/core/config_init.py
+8
diff --git a/‎pandas/core/construction.py
+24 b/‎pandas/core/construction.py
+24
diff --git a/‎pandas/core/dtypes/cast.py
+18 b/‎pandas/core/dtypes/cast.py
+18
diff --git a/‎pandas/core/strings/accessor.py
+7-1 b/‎pandas/core/strings/accessor.py
+7-1
diff --git a/‎pandas/io/pytables.py
+10-1 b/‎pandas/io/pytables.py
+10-1
diff --git a/‎pandas/io/stata.py
+7-1 b/‎pandas/io/stata.py
+7-1
diff --git a/‎pandas/tests/dtypes/cast/test_infer_dtype.py
+8-1 b/‎pandas/tests/dtypes/cast/test_infer_dtype.py
+8-1
diff --git a/‎pandas/tests/dtypes/cast/test_promote.py
+21-3 b/‎pandas/tests/dtypes/cast/test_promote.py
+21-3
diff --git a/‎pandas/tests/extension/test_arrow.py
+6 b/‎pandas/tests/extension/test_arrow.py
+6
diff --git a/‎pandas/tests/frame/methods/test_filter.py
+12-3 b/‎pandas/tests/frame/methods/test_filter.py
+12-3
diff --git a/‎pandas/tests/io/formats/test_to_string.py
+5-1 b/‎pandas/tests/io/formats/test_to_string.py
+5-1
diff --git a/‎pandas/tests/io/pytables/test_store.py
+14-7 b/‎pandas/tests/io/pytables/test_store.py
+14-7
@@ -1272,6 +1272,7 @@ cdef class Seen:
         bint interval_        # seen_interval
         bint time_
         bint date_
+        bint bytes_
 
     def __cinit__(self, bint coerce_numeric=False):
         """
@@ -1300,6 +1301,7 @@ cdef class Seen:
         self.interval_ = False
         self.time_ = False
         self.date_ = False
+        self.bytes_ = False
         self.coerce_numeric = coerce_numeric
 
     cdef bint check_uint64_conflict(self) except -1:
@@ -2588,6 +2590,12 @@ def maybe_convert_objects(ndarray[object] objects,
             else:
                 seen.object_ = True
                 break
+        elif isinstance(val, bytes):
+            if convert_non_numeric:
+                seen.bytes_ = True
+            else:
+                seen.object_ = True
+            break
         elif PyTime_Check(val):
             if convert_non_numeric and val.tzinfo is None:
                 seen.time_ = True
@@ -2598,8 +2606,37 @@ def maybe_convert_objects(ndarray[object] objects,
             seen.object_ = True
             break
 
-    # we try to coerce datetime w/tz but must all have the same tz
-    if seen.datetimetz_:
+    if seen.bytes_:
+        if is_bytes_array(objects):
+            opt = get_option("future.infer_bytes")
+            if opt is True:
+                import pyarrow as pa
+
+                from pandas.core.dtypes.dtypes import ArrowDtype
+
+                obj = pa.array(objects)
+                dtype = ArrowDtype(obj.type)
+                return dtype.construct_array_type()(obj)
+            elif opt is False:
+                # explicitly set to keep the old behavior and avoid the warning
+                pass
+            else:
+                from pandas.util._exceptions import find_stack_level
+                warnings.warn(
+                    "Pandas type inference with a sequence of `bytes` "
+                    "objects is deprecated. In a future version, this will give "
+                    "bytes[pyarrow] dtype, which will require pyarrow to be "
+                    "installed. To opt in to the new behavior immediately set "
+                    "`pd.set_option('future.infer_bytes', True)`. To keep the "
+                    "old behavior pass `dtype=object`.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+
+        seen.object_ = True
+
+    elif seen.datetimetz_:
+        # we try to coerce datetime w/tz but must all have the same tz
         if is_datetime_with_singletz_array(objects):
             from pandas import DatetimeIndex
 
 
@@ -892,6 +892,14 @@ def register_converter_cb(key) -> None:
 
 
 with cf.config_prefix("future"):
+    cf.register_option(
+        "future.infer_bytes",
+        None,
+        "Whether to infer sequence of bytes objects as pyarrow bytes "
+        "dtype, which will be the default in pandas 3.0 "
+        "(at which point this option will be deprecated).",
+        validator=is_one_of_factory([True, False, None]),
+    )
     cf.register_option(
         "future.infer_time",
         None,
 
@@ -415,6 +415,30 @@ def array(
                     stacklevel=find_stack_level(),
                 )
 
+        elif inferred_dtype == "bytes":
+            opt = get_option("future.infer_bytes")
+
+            if opt is True:
+                import pyarrow as pa
+
+                obj = pa.array(data)
+                dtype = ArrowDtype(obj.type)
+                return dtype.construct_array_type()(obj)
+            elif opt is False:
+                # explicitly set to keep the old behavior and avoid the warning
+                pass
+            else:
+                warnings.warn(
+                    "Pandas type inference with a sequence of `bytes` "
+                    "objects is deprecated. In a future version, this will give "
+                    "bytes[pyarrow] dtype, which will require pyarrow to be "
+                    "installed. To opt in to the new behavior immediately set "
+                    "`pd.set_option('future.infer_bytes', True)`. To keep the "
+                    "old behavior pass `dtype=object`.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+
     # Pandas overrides NumPy for
     #   1. datetime64[ns,us,ms,s]
     #   2. timedelta64[ns,us,ms,s]
 
@@ -849,7 +849,25 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
             import pyarrow as pa
 
             pa_dtype = pa.date32()
+            dtype = ArrowDtype(pa_dtype)
+
+    elif isinstance(val, bytes):
+        opt = get_option("future.infer_bytes")
+        if opt is None:
+            warnings.warn(
+                "Pandas type inference with a `bytes` "
+                "object is deprecated. In a future version, this will give "
+                "bytes[pyarrow] dtype, which will require pyarrow to be "
+                "installed. To opt in to the new behavior immediately set "
+                "`pd.set_option('future.infer_bytes', True)`. To keep the "
+                "old behavior pass `dtype=object`.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+        elif opt is True:
+            import pyarrow as pa
 
+            pa_dtype = pa.binary()
             dtype = ArrowDtype(pa_dtype)
 
     elif is_bool(val):
 
@@ -1961,7 +1961,13 @@ def decode(self, encoding, errors: str = "strict"):
             f = lambda x: decoder(x, errors)[0]
         arr = self._data.array
         # assert isinstance(arr, (StringArray,))
-        result = arr._str_map(f)
+
+        if isinstance(arr.dtype, ArrowDtype):
+            # TODO: is there a performant way to do this?
+            res_values = arr.map(f)
+            result = type(arr)._from_sequence(res_values)
+        else:
+            result = arr._str_map(f)
         return self._wrap_result(result)
 
     @forbid_nonstring_types(["bytes"])
 
@@ -5066,7 +5066,16 @@ def _unconvert_string_array(
         dtype = f"U{itemsize}"
 
         if isinstance(data[0], bytes):
-            data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
+            with warnings.catch_warnings():
+                # Silence the deprecation about inferring bytes to bytes[pyarrow] dtype
+                # TODO: try to avoid this altogether; it hides all FutureWarnings
+                warnings.filterwarnings("ignore", category=FutureWarning)
+
+                data = (
+                    Series(data, copy=False).str.decode(encoding, errors=errors)._values
+                ).astype(object, copy=False)
+                # TODO: if we have pyarrow str instead of object here to begin
+                #  with, can we avoid object dtype cast here?
         else:
             data = data.astype(dtype, copy=False).astype(object, copy=False)
 
 
@@ -2910,7 +2910,13 @@ def _prepare_data(self) -> np.recarray:
         for i, col in enumerate(data):
             typ = typlist[i]
             if typ <= self._max_string_length:
-                data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,))
+                with warnings.catch_warnings():
+                    # Silence the deprecated inference of a sequence of bytes,
+                    #  which will infer to bytes[pyarrow]
+                    # TODO: can we avoid this altogether? it hides all FutureWarnings
+                    warnings.filterwarnings("ignore", category=FutureWarning)
+
+                    data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,))
                 stype = f"S{typ}"
                 dtypes[col] = stype
                 data[col] = data[col].astype(stype)
 
@@ -163,7 +163,14 @@ def test_infer_dtype_from_scalar_errors():
     ],
 )
 def test_infer_dtype_from_scalar(value, expected):
-    dtype, _ = infer_dtype_from_scalar(value)
+    msg = "type inference with a `bytes` object is deprecated"
+    warn = None
+    if isinstance(value, bytes):
+        warn = FutureWarning
+
+    with tm.assert_produces_warning(warn, match=msg):
+        dtype, _ = infer_dtype_from_scalar(value)
+
     assert is_dtype_equal(dtype, expected)
 
     with pytest.raises(TypeError, match="must be list-like"):
 
@@ -311,7 +311,13 @@ def test_maybe_promote_any_with_bytes(any_numpy_dtype):
     # output is not a generic bytes, but corresponds to expected_dtype
     exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0]
 
-    _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
+    msg = "type inference with a `bytes` object"
+    warn = None
+    if any_numpy_dtype in ["timedelta64[ns]", "datetime64[ns]"]:
+        warn = FutureWarning
+
+    with tm.assert_produces_warning(warn, match=msg):
+        _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
 
 
 def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype):
@@ -330,7 +336,13 @@ def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype):
         expected_dtype = np.dtype(object)
         exp_val_for_scalar = fill_value
 
-    _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
+    msg = "type inference with a `bytes` object is deprecated"
+    warn = None
+    if any_numpy_dtype is bytes and datetime64_dtype == "datetime64[ns]":
+        warn = FutureWarning
+
+    with tm.assert_produces_warning(warn, match=msg):
+        _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
 
 
 @pytest.mark.parametrize(
@@ -413,7 +425,13 @@ def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, any_numpy_dtype):
         expected_dtype = np.dtype(object)
         exp_val_for_scalar = fill_value
 
-    _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
+    msg = "type inference with a `bytes` object is deprecated"
+    warn = None
+    if any_numpy_dtype is bytes and timedelta64_dtype == "timedelta64[ns]":
+        warn = FutureWarning
+
+    with tm.assert_produces_warning(warn, match=msg):
+        _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
 
 
 @pytest.mark.parametrize(
 
@@ -739,6 +739,9 @@ def test_stack(self, data, columns):
             warn_msg = (
                 "Pandas type inference with a sequence of `datetime.date` objects"
             )
+        if pa.types.is_binary(pa_dtype):
+            warn = FutureWarning
+            warn_msg = "Pandas type inference with a sequence of `bytes` objects"
 
         with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False):
             super().test_stack(data, columns)
@@ -813,6 +816,9 @@ def test_hash_pandas_object_works(self, data, as_frame):
             # TODO(#48964) This warning will be avoided by implementing
             #  ArrowExtensionArray.hash_pandas_object
             warn = FutureWarning
+        elif pa.types.is_binary(pa_dtype):
+            warn_msg = "Pandas type inference with a sequence of `bytes`"
+            warn = FutureWarning
 
         with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False):
             super().test_hash_pandas_object_works(data, as_frame)
 
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import DataFrame
 import pandas._testing as tm
@@ -112,11 +114,18 @@ def test_filter_unicode(self, name, expected):
         tm.assert_frame_equal(df.filter(like=name), expected)
         tm.assert_frame_equal(df.filter(regex=name), expected)
 
+    @pytest.mark.parametrize(
+        "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None]
+    )
     @pytest.mark.parametrize("name", ["a", "a"])
-    def test_filter_bytestring(self, name):
+    def test_filter_bytestring(self, name, future):
         # GH13101
-        df = DataFrame({b"a": [1, 2], b"b": [3, 4]})
-        expected = DataFrame({b"a": [1, 2]})
+        warn = FutureWarning if future is None else None
+        msg = "type inference with a sequence of `bytes` objects"
+        with tm.assert_produces_warning(warn, match=msg):
+            with pd.option_context("future.infer_bytes", future):
+                df = DataFrame({b"a": [1, 2], b"b": [3, 4]})
+                expected = DataFrame({b"a": [1, 2]})
 
         tm.assert_frame_equal(df.filter(like=name), expected)
         tm.assert_frame_equal(df.filter(regex=name), expected)
 
@@ -13,6 +13,7 @@
     option_context,
     to_datetime,
 )
+import pandas._testing as tm
 
 
 def test_repr_embedded_ndarray():
@@ -172,10 +173,13 @@ def test_to_string_unicode_columns(float_frame):
 
 
 def test_to_string_utf8_columns():
+    msg = "type inference with a sequence of `bytes` objects"
+
     n = "\u05d0".encode()
 
     with option_context("display.max_rows", 1):
-        df = DataFrame([1, 2], columns=[n])
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            df = DataFrame([1, 2], columns=[n])
         repr(df)
 
 
 
@@ -11,6 +11,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -324,16 +326,21 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
     tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))
 
 
+@pytest.mark.parametrize(
+    "future", [pytest.param(True, marks=td.skip_if_no("pyarrow")), False, None]
+)
 @pytest.mark.parametrize("format", ["fixed", "table"])
-def test_to_hdf_errors(tmp_path, format, setup_path):
+def test_to_hdf_errors(tmp_path, format, setup_path, future):
     data = ["\ud800foo"]
-    ser = Series(data, index=Index(data))
-    path = tmp_path / setup_path
-    # GH 20835
-    ser.to_hdf(path, "table", format=format, errors="surrogatepass")
 
-    result = read_hdf(path, "table", errors="surrogatepass")
-    tm.assert_series_equal(result, ser)
+    with pd.option_context("future.infer_bytes", future):
+        ser = Series(data, index=Index(data))
+        path = tmp_path / setup_path
+        # GH 20835
+        ser.to_hdf(path, "table", format=format, errors="surrogatepass")
+
+        result = read_hdf(path, "table", errors="surrogatepass")
+        tm.assert_series_equal(result, ser)
 
 
 def test_create_table_index(setup_path):