diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5b77bb9651073..1bedf07103086 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -96,6 +96,7 @@ from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( + check_dtype_backend, validate_ascending, validate_bool_kwarg, validate_fillna_kwargs, @@ -6590,8 +6591,8 @@ def convert_dtypes( .. versionadded:: 1.2.0 dtype_backend : {"numpy_nullable", "pyarrow"}, default "numpy_nullable" - Which dtype_backend to use, e.g. whether a DataFrame should have NumPy - arrays, nullable dtypes are used for all dtypes that have a nullable + Which dtype_backend to use, e.g. whether a DataFrame should use nullable + dtypes for all dtypes that have a nullable implementation when "numpy_nullable" is set, pyarrow is used for all dtypes if "pyarrow" is set. @@ -6710,6 +6711,7 @@ def convert_dtypes( 2 dtype: string """ + check_dtype_backend(dtype_backend) if self.ndim == 1: return self._convert_dtypes( infer_objects, diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 4458c1dc09d41..b114b8a1aa7aa 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -980,7 +980,7 @@ def convert_object_array( ---------- content: List[np.ndarray] dtype: np.dtype or ExtensionDtype - dtype_backend: Controls if nullable dtypes are returned. + dtype_backend: Controls if nullable/pyarrow dtypes are returned. coerce_float: Cast floats that are integers to int. 
Returns diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index c7908dee83617..62976f68cbdd4 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -8,6 +8,7 @@ import numpy as np from pandas._libs import lib +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric from pandas.core.dtypes.common import ( @@ -166,6 +167,8 @@ def to_numeric( if errors not in ("ignore", "raise", "coerce"): raise ValueError("invalid error value specified") + check_dtype_backend(dtype_backend) + is_series = False is_index = False is_scalars = False diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 534b75a8afdd6..e5981e8d15eb7 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -7,6 +7,7 @@ from pandas._libs import lib from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.generic import ABCDataFrame @@ -58,6 +59,8 @@ def read_clipboard( if encoding is not None and encoding.lower().replace("-", "") != "utf8": raise NotImplementedError("reading from clipboard only supports utf-8 encoding") + check_dtype_backend(dtype_backend) + from pandas.io.clipboard import clipboard_get from pandas.io.parsers import read_csv diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 8e5feb577bef6..3c1ecffe21353 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -36,6 +36,7 @@ Appender, doc, ) +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( is_bool, @@ -472,6 +473,8 @@ def read_excel( storage_options: StorageOptions = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | dict[IntStrT, DataFrame]: + check_dtype_backend(dtype_backend) + should_close = False if not isinstance(io, ExcelFile): should_close = True diff --git a/pandas/io/feather_format.py 
b/pandas/io/feather_format.py index c07485d0aae23..4d17173fa0ceb 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -10,6 +10,7 @@ from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc +from pandas.util._validators import check_dtype_backend import pandas as pd from pandas.core.api import DataFrame @@ -105,6 +106,8 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + check_dtype_backend(dtype_backend) + with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: diff --git a/pandas/io/html.py b/pandas/io/html.py index 66c2ea1a1af51..70cdedfd21ada 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -24,6 +24,7 @@ AbstractMethodError, EmptyDataError, ) +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -1170,6 +1171,7 @@ def read_html( f'"{extract_links}"' ) validate_header_arg(header) + check_dtype_backend(dtype_backend) io = stringify_path(io) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 7b4fc8dfa0b37..588ec639bc2fd 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -29,6 +29,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( ensure_str, @@ -747,6 +748,8 @@ def read_json( if orient == "table" and convert_axes: raise ValueError("cannot pass both convert_axes and orient='table'") + check_dtype_backend(dtype_backend) + if dtype is None and orient != "table": # error: Incompatible types in assignment (expression has type "bool", variable # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], @@ -947,14 +950,18 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": 
pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) + + mapping: type[ArrowDtype] | None | Callable if self.dtype_backend == "pyarrow": - return pa_table.to_pandas(types_mapper=ArrowDtype) + mapping = ArrowDtype elif self.dtype_backend == "numpy_nullable": from pandas.io._util import _arrow_dtype_mapping - mapping = _arrow_dtype_mapping() - return pa_table.to_pandas(types_mapper=mapping.get) - return pa_table.to_pandas() + mapping = _arrow_dtype_mapping().get + else: + mapping = None + + return pa_table.to_pandas(types_mapper=mapping) elif self.engine == "ujson": if self.lines: if self.chunksize: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 72423473e019f..10530a34ee218 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -12,6 +12,7 @@ from pandas._libs import lib from pandas.compat import pa_version_under8p0 from pandas.compat._optional import import_optional_dependency +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -98,6 +99,8 @@ def read_orc( orc = import_optional_dependency("pyarrow.orc") + check_dtype_backend(dtype_backend) + with get_handle(path, "rb", is_text=False) as handles: source = handles.handle if is_fsspec_url(path) and filesystem is None: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 52f16f7450f85..0f8bf004b729a 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -16,6 +16,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend import pandas as pd from pandas import ( @@ -515,6 +516,7 @@ def read_parquet( DataFrame """ impl = get_engine(engine) + if use_nullable_dtypes is not lib.no_default: msg = ( "The argument 'use_nullable_dtypes' is deprecated and will be removed " @@ -527,6 +529,7 @@ def read_parquet( warnings.warn(msg, FutureWarning, 
stacklevel=find_stack_level()) else: use_nullable_dtypes = False + check_dtype_backend(dtype_backend) return impl.read( path, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 3d10570e25aec..015f27ed4f2c4 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -34,6 +34,7 @@ ) from pandas.util._decorators import Appender from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( is_file_like, @@ -1366,6 +1367,8 @@ def read_fwf( kwds["colspecs"] = colspecs kwds["infer_nrows"] = infer_nrows kwds["engine"] = "python-fwf" + + check_dtype_backend(dtype_backend) kwds["dtype_backend"] = dtype_backend return _read(filepath_or_buffer, kwds) @@ -2019,6 +2022,8 @@ def _refine_defaults_read( else: raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + check_dtype_backend(dtype_backend) + kwds["dtype_backend"] = dtype_backend return kwds diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 7b6247ec55d95..876eb83890836 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -7,6 +7,7 @@ from pandas._libs import lib from pandas.compat._optional import import_optional_dependency +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.inference import is_list_like @@ -52,6 +53,7 @@ def read_spss( DataFrame """ pyreadstat = import_optional_dependency("pyreadstat") + check_dtype_backend(dtype_backend) if usecols is not None: if not is_list_like(usecols): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 1eb24f4a6c375..8d48d04c738e8 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -39,6 +39,7 @@ DatabaseError, ) from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -327,6 +328,7 @@ def read_sql_table( >>> pd.read_sql_table('table_name', 'postgres:///db_name') # 
doctest:+SKIP """ + check_dtype_backend(dtype_backend) if dtype_backend is lib.no_default: dtype_backend = "numpy" # type: ignore[assignment] assert dtype_backend is not lib.no_default @@ -459,6 +461,7 @@ def read_sql_query( parameter will be converted to UTC. """ + check_dtype_backend(dtype_backend) if dtype_backend is lib.no_default: dtype_backend = "numpy" # type: ignore[assignment] assert dtype_backend is not lib.no_default @@ -624,6 +627,7 @@ def read_sql( 1 1 2010-11-12 """ + check_dtype_backend(dtype_backend) if dtype_backend is lib.no_default: dtype_backend = "numpy" # type: ignore[assignment] assert dtype_backend is not lib.no_default diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 867d7b2c53278..65cc369416352 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -19,6 +19,7 @@ ParserError, ) from pandas.util._decorators import doc +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -1112,6 +1113,7 @@ def read_xml( 1 circle 360 NaN 2 triangle 180 3.0 """ + check_dtype_backend(dtype_backend) return _parse( path_or_buffer=path_or_buffer, diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index ad3be3d4014a7..6076933eecec4 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -124,3 +124,13 @@ def test_pyarrow_dtype_empty_object(self): expected = pd.DataFrame(columns=[0]) result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_frame_equal(result, expected) + + def test_invalid_dtype_backend(self): + # an invalid dtype_backend value must raise ValueError + df = pd.DataFrame({"a": [1, 2, 3]}) + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed."
+ ) + with pytest.raises(ValueError, match=msg): + df.convert_dtypes(dtype_backend="numpy") diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index fde62eb7a91a5..08308ebd2f1cf 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1944,6 +1944,14 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): tm.assert_series_equal(result, expected) + def test_invalid_dtype_backend(self): + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_json("test", dtype_backend="numpy") + def test_invalid_engine(): # GH 48893 diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index fe2de5355a6be..d166946704e13 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -1001,3 +1001,12 @@ def test_dtype_backend(string_storage, dtype_backend): expected["i"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_backend(): + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_fwf("test", dtype_backend="numpy") diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 185dc733df3c2..1a9d99b0b5c1f 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -200,3 +200,13 @@ def test_invalid_file_inputs(request, all_parsers): with pytest.raises(ValueError, match="Invalid"): parser.read_csv([]) + + +def test_invalid_dtype_backend(all_parsers): + parser = all_parsers + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + with pytest.raises(ValueError, match=msg): + parser.read_csv("test", dtype_backend="numpy") diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 94f073b1abb86..baf2bcdc9386f 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -467,3 +467,11 @@ def test_read_clipboard_dtype_backend( expected["g"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + def test_invalid_dtype_backend(self): + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_clipboard(dtype_backend="numpy") diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 203472b0d0953..c5bd8341e1a54 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -212,3 +212,14 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): def test_int_columns_and_index(self): df = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index([3, 4, 5], name="test")) self.check_round_trip(df) + + def test_invalid_dtype_backend(self): + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + df = pd.DataFrame({"int": list(range(1, 4))}) + with tm.ensure_clean("tmp.feather") as path: + df.to_feather(path) + with pytest.raises(ValueError, match=msg): + read_feather(path, dtype_backend="numpy") diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 1595fa86567c9..03f1bcb13d077 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1465,3 +1465,11 @@ def test_extract_links_all_no_header(self): result = self.read_html(data, extract_links="all")[0] expected = DataFrame([[("Google.com", "https://google.com")]]) tm.assert_frame_equal(result, expected) + + def test_invalid_dtype_backend(self): + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_html("test", dtype_backend="numpy") diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index dccdfdc897dc1..36cfe5576adf9 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -409,3 +409,15 @@ def test_to_orc_non_default_index(index): ) with pytest.raises(ValueError, match=msg): df.to_orc() + + +def test_invalid_dtype_backend(): + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + df = pd.DataFrame({"int": list(range(1, 4))}) + with tm.ensure_clean("tmp.orc") as path: + df.to_orc(path) + with pytest.raises(ValueError, match=msg): + read_orc(path, dtype_backend="numpy") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index af35b50ed50d8..1548208c7eeaa 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1210,3 +1210,14 @@ def test_bytes_file_name(self, engine): result = read_parquet(path, engine=engine) tm.assert_frame_equal(result, df) + + def test_invalid_dtype_backend(self, engine): + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + df = pd.DataFrame({"int": list(range(1, 4))}) + with tm.ensure_clean("tmp.parquet") as path: + df.to_parquet(path) + with pytest.raises(ValueError, match=msg): + read_parquet(path, engine=engine, dtype_backend="numpy") diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index fe414f6c3d52c..9e1f6cf7cd8d4 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -102,3 +102,12 @@ def test_spss_umlauts_dtype_backend(datapath, dtype_backend): ) tm.assert_frame_equal(df, expected) + + +def test_invalid_dtype_backend(): + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + pd.read_spss("test", dtype_backend="numpy") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1691b6b72c40b..dc51a5b0a77fb 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2486,6 +2486,19 @@ def test_read_sql_dtype_backend_table(self, string_storage, func, dtype_backend) for result in iterator: tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"]) + def test_read_sql_invalid_dtype_backend_table(self, func): + table = "test" + df = self.dtype_backend_data() + df.to_sql(table, self.conn, index=False, if_exists="replace") + + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed."
+ ) + with pytest.raises(ValueError, match=msg): + getattr(pd, func)(table, self.conn, dtype_backend="numpy") + def dtype_backend_data(self) -> DataFrame: return DataFrame( { diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index eaadcd6cee11b..a53e5f247c73a 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1889,3 +1889,12 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): expected["g"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_backend(): + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_xml("test", dtype_backend="numpy") diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 07569aa21dbe2..4a0b01a275523 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -914,3 +914,13 @@ def test_to_numeric_dtype_backend_error(dtype_backend): dtype = "Float64" expected = Series([np.nan, np.nan, np.nan], dtype=dtype) tm.assert_series_equal(result, expected) + + +def test_invalid_dtype_backend(): + ser = Series([1, 2, 3]) + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+    )
+    with pytest.raises(ValueError, match=msg):
+        to_numeric(ser, dtype_backend="numpy")
diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py
index 7b1eca695c6d6..17ef583febc24 100644
--- a/pandas/util/_validators.py
+++ b/pandas/util/_validators.py
@@ -13,6 +13,8 @@
 
 import numpy as np
 
+from pandas._libs import lib
+
 from pandas.core.dtypes.common import (
     is_bool,
     is_integer,
@@ -437,3 +439,29 @@ def validate_insert_loc(loc: int, length: int) -> int:
     if not 0 <= loc <= length:
         raise IndexError(f"loc must be an integer between -{length} and {length}")
     return loc
+
+
+def check_dtype_backend(dtype_backend) -> None:
+    """
+    Validate a user-supplied ``dtype_backend`` argument.
+
+    Parameters
+    ----------
+    dtype_backend : {"numpy_nullable", "pyarrow"} or lib.no_default
+        Value to validate. ``lib.no_default`` is always accepted.
+
+    Raises
+    ------
+    ValueError
+        If ``dtype_backend`` is not ``lib.no_default`` and is not one of
+        ``"numpy_nullable"`` or ``"pyarrow"``.
+    """
+    if dtype_backend is lib.no_default:
+        return
+    # Membership is tested against a list (not a set) so that an unhashable
+    # input such as a list reaches the ValueError below, not a TypeError.
+    if dtype_backend not in ["numpy_nullable", "pyarrow"]:
+        raise ValueError(
+            f"dtype_backend {dtype_backend} is invalid, only 'numpy_nullable' and "
+            "'pyarrow' are allowed."
+        )