From 1958b53e7fd829df23bec520d4a90d8db3b08074 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Mar 2023 00:37:05 +0100 Subject: [PATCH 01/18] Remove use_nullable_dtypes and add dtype_backend keyword --- doc/source/user_guide/io.rst | 11 +- doc/source/user_guide/pyarrow.rst | 18 +- doc/source/whatsnew/v2.0.0.rst | 37 ++-- pandas/_libs/parsers.pyi | 2 +- pandas/_libs/parsers.pyx | 28 ++- pandas/_typing.py | 1 + pandas/conftest.py | 2 +- pandas/core/arrays/numeric.py | 2 +- pandas/core/config_init.py | 28 --- pandas/core/dtypes/cast.py | 6 +- pandas/core/generic.py | 9 + pandas/core/internals/construction.py | 10 +- pandas/core/series.py | 3 +- pandas/core/tools/numeric.py | 41 +--- pandas/io/clipboards.py | 33 +--- pandas/io/excel/_base.py | 43 ++--- pandas/io/feather_format.py | 33 +--- pandas/io/html.py | 28 +-- pandas/io/json/_json.py | 77 +++----- pandas/io/orc.py | 32 +--- pandas/io/parquet.py | 63 ++++--- pandas/io/parsers/arrow_parser_wrapper.py | 10 +- pandas/io/parsers/base_parser.py | 25 ++- pandas/io/parsers/c_parser_wrapper.py | 13 +- pandas/io/parsers/readers.py | 78 +++----- pandas/io/spss.py | 31 +-- pandas/io/sql.py | 178 +++++++----------- pandas/io/xml.py | 32 +--- .../frame/methods/test_convert_dtypes.py | 12 +- pandas/tests/io/excel/test_readers.py | 25 ++- pandas/tests/io/json/test_pandas.py | 17 +- .../io/parser/dtypes/test_dtypes_basic.py | 77 +++----- pandas/tests/io/parser/test_read_fwf.py | 18 +- pandas/tests/io/parser/test_upcast.py | 12 +- pandas/tests/io/test_clipboard.py | 24 +-- pandas/tests/io/test_feather.py | 10 +- pandas/tests/io/test_html.py | 16 +- pandas/tests/io/test_orc.py | 23 +-- pandas/tests/io/test_parquet.py | 64 +++---- pandas/tests/io/test_spss.py | 5 +- pandas/tests/io/test_sql.py | 104 +++++----- pandas/tests/io/xml/test_xml.py | 21 +-- pandas/tests/tools/test_to_numeric.py | 59 ++---- 43 files changed, 449 insertions(+), 912 deletions(-) diff --git a/doc/source/user_guide/io.rst 
b/doc/source/user_guide/io.rst index 3c3a655626bb6..04892cd0b0cde 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -170,10 +170,9 @@ dtype : Type name or dict of column -> type, default ``None`` the default determines the dtype of the columns which are not explicitly listed. -use_nullable_dtypes : bool = False - Whether or not to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. +dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames. + Which dtype backend to use. If + set to True, nullable dtypes or pyarrow dtypes are used for all dtypes. .. versionadded:: 2.0 @@ -475,7 +474,7 @@ worth trying. os.remove("foo.csv") -Setting ``use_nullable_dtypes=True`` will result in nullable dtypes for every column. +Setting ``dtype_backend="numpy_nullable"`` will result in nullable dtypes for every column. .. ipython:: python @@ -484,7 +483,7 @@ Setting ``use_nullable_dtypes=True`` will result in nullable dtypes for every co 3,4.5,False,b,6,7.5,True,a,12-31-2019, """ - df = pd.read_csv(StringIO(data), use_nullable_dtypes=True, parse_dates=["i"]) + df = pd.read_csv(StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"]) df df.dtypes diff --git a/doc/source/user_guide/pyarrow.rst b/doc/source/user_guide/pyarrow.rst index 876ca9c164823..8531216ecc61e 100644 --- a/doc/source/user_guide/pyarrow.rst +++ b/doc/source/user_guide/pyarrow.rst @@ -145,8 +145,8 @@ functions provide an ``engine`` keyword that can dispatch to PyArrow to accelera df By default, these functions and all other IO reader functions return NumPy-backed data. These readers can return -PyArrow-backed data by specifying the parameter ``use_nullable_dtypes=True`` **and** the global configuration option ``"mode.dtype_backend"`` -set to ``"pyarrow"``. 
A reader does not need to set ``engine="pyarrow"`` to necessarily return PyArrow-backed data. +PyArrow-backed data by specifying the parameter ``dtype_backend="pyarrow"``. A reader does not need to set +``engine="pyarrow"`` to necessarily return PyArrow-backed data. .. ipython:: python @@ -155,20 +155,10 @@ set to ``"pyarrow"``. A reader does not need to set ``engine="pyarrow"`` to nece 1,2.5,True,a,,,,, 3,4.5,False,b,6,7.5,True,a, """) - with pd.option_context("mode.dtype_backend", "pyarrow"): - df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True) + df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow") df_pyarrow.dtypes -To simplify specifying ``use_nullable_dtypes=True`` in several functions, you can set a global option ``nullable_dtypes`` -to ``True``. You will still need to set the global configuration option ``"mode.dtype_backend"`` to ``pyarrow``. - -.. code-block:: ipython - - In [1]: pd.set_option("mode.dtype_backend", "pyarrow") - - In [2]: pd.options.mode.nullable_dtypes = True - -Several non-IO reader functions can also use the ``"mode.dtype_backend"`` option to return PyArrow-backed data including: +Several non-IO reader functions can also use the ``dtype_backend`` argument to return PyArrow-backed data including: * :func:`to_numeric` * :meth:`DataFrame.convert_dtypes` diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c987588097953..ecea272b8a3f5 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -103,12 +103,12 @@ Below is a possibly non-exhaustive list of changes: pd.Index([1, 2, 3], dtype=np.float16) -.. _whatsnew_200.enhancements.io_use_nullable_dtypes_and_dtype_backend: +.. 
_whatsnew_200.enhancements.io_dtype_backend: -Configuration option, ``mode.dtype_backend``, to return pyarrow-backed dtypes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Argument ``dtype_backend``, to return pyarrow-backed or numpy-backed nullable dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``use_nullable_dtypes`` keyword argument has been expanded to the following functions to enable automatic conversion to nullable dtypes (:issue:`36712`) +The following functions gained a new keyword ``dtype_backend`` (:issue:`36712`) * :func:`read_csv` * :func:`read_clipboard` @@ -124,19 +124,13 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_feather` * :func:`read_spss` * :func:`to_numeric` +* :meth:`DataFrame.convert_dtypes` +* :meth:`Series.convert_dtypes` -To simplify opting-in to nullable dtypes for these functions, a new option ``nullable_dtypes`` was added that allows setting -the keyword argument globally to ``True`` if not specified directly. The option can be enabled -through: - -.. ipython:: python - - pd.options.mode.nullable_dtypes = True - -The option will only work for functions with the keyword ``use_nullable_dtypes``. +When this option is set to ``numpy_nullable`` it will return a :class:`DataFrame` that is +backed by nullable dtypes. -Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions -to select the nullable dtypes implementation. +When this keyword is set to ``pyarrow``, then these functions will return pyarrow-backed nullable :class:`ArrowDtype` DataFrames (:issue:`48957`, :issue:`49997`): * :func:`read_csv` * :func:`read_clipboard` @@ -153,16 +147,9 @@ to select the nullable dtypes implementation. 
* :func:`read_feather` * :func:`read_spss` * :func:`to_numeric` - - -And the following methods will also utilize the ``mode.dtype_backend`` option. - * :meth:`DataFrame.convert_dtypes` * :meth:`Series.convert_dtypes` -By default, ``mode.dtype_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also -be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`, :issue:`49997`). - .. ipython:: python import io @@ -170,13 +157,11 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` ( 1,2.5,True,a,,,,, 3,4.5,False,b,6,7.5,True,a, """) - with pd.option_context("mode.dtype_backend", "pandas"): - df = pd.read_csv(data, use_nullable_dtypes=True) + df = pd.read_csv(data, dtype_backend="pyarrow") df.dtypes data.seek(0) - with pd.option_context("mode.dtype_backend", "pyarrow"): - df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow") + df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow", engine="pyarrow") df_pyarrow.dtypes Copy-on-Write improvements diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi index 21a440aa4c849..3b6e4dca47b14 100644 --- a/pandas/_libs/parsers.pyi +++ b/pandas/_libs/parsers.pyi @@ -72,5 +72,5 @@ class TextReader: na_values: dict def _maybe_upcast( - arr, use_nullable_dtypes: bool = ..., dtype_backend: str = ... + arr, use_dtype_backend: bool = ..., dtype_backend: str = ... ) -> np.ndarray: ... 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5bddaa61d3196..2839730ca46bd 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -339,7 +339,6 @@ cdef class TextReader: object index_col object skiprows object dtype - bint use_nullable_dtypes object usecols set unnamed_cols # set[str] str dtype_backend @@ -379,8 +378,7 @@ cdef class TextReader: float_precision=None, bint skip_blank_lines=True, encoding_errors=b"strict", - use_nullable_dtypes=False, - dtype_backend="pandas"): + dtype_backend="numpy"): # set encoding for native Python and C library if isinstance(encoding_errors, str): @@ -501,7 +499,6 @@ cdef class TextReader: # - DtypeObj # - dict[Any, DtypeObj] self.dtype = dtype - self.use_nullable_dtypes = use_nullable_dtypes self.dtype_backend = dtype_backend self.noconvert = set() @@ -928,7 +925,6 @@ cdef class TextReader: bint na_filter = 0 int64_t num_cols dict results - bint use_nullable_dtypes start = self.parser_start @@ -1049,12 +1045,12 @@ cdef class TextReader: # don't try to upcast EAs if ( na_count > 0 and not is_extension_array_dtype(col_dtype) - or self.use_nullable_dtypes + or self.dtype_backend != "numpy" ): - use_nullable_dtypes = self.use_nullable_dtypes and col_dtype is None + use_dtype_backend = self.dtype_backend != "numpy" and col_dtype is None col_res = _maybe_upcast( col_res, - use_nullable_dtypes=use_nullable_dtypes, + use_dtype_backend=use_dtype_backend, dtype_backend=self.dtype_backend, ) @@ -1389,11 +1385,11 @@ _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES)) def _maybe_upcast( - arr, use_nullable_dtypes: bool = False, dtype_backend: str = "pandas" + arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy" ): """Sets nullable dtypes or upcasts if nans are present. - Upcast, if use_nullable_dtypes is false and nans are present so that the + Upcast, if use_dtype_backend is false and nans are present so that the current dtype can not hold the na value. 
We use nullable dtypes if the flag is true for every array. @@ -1402,7 +1398,7 @@ def _maybe_upcast( arr: ndarray Numpy array that is potentially being upcast. - use_nullable_dtypes: bool, default False + use_dtype_backend: bool, default False If true, we cast to the associated nullable dtypes. Returns @@ -1419,7 +1415,7 @@ def _maybe_upcast( if issubclass(arr.dtype.type, np.integer): mask = arr == na_value - if use_nullable_dtypes: + if use_dtype_backend: arr = IntegerArray(arr, mask) else: arr = arr.astype(float) @@ -1428,22 +1424,22 @@ def _maybe_upcast( elif arr.dtype == np.bool_: mask = arr.view(np.uint8) == na_value - if use_nullable_dtypes: + if use_dtype_backend: arr = BooleanArray(arr, mask) else: arr = arr.astype(object) np.putmask(arr, mask, np.nan) elif issubclass(arr.dtype.type, float) or arr.dtype.type == np.float32: - if use_nullable_dtypes: + if use_dtype_backend: mask = np.isnan(arr) arr = FloatingArray(arr, mask) elif arr.dtype == np.object_: - if use_nullable_dtypes: + if use_dtype_backend: arr = StringDtype().construct_array_type()._from_sequence(arr) - if use_nullable_dtypes and dtype_backend == "pyarrow": + if use_dtype_backend and dtype_backend == "pyarrow": import pyarrow as pa if isinstance(arr, IntegerArray) and arr.isna().all(): # use null instead of int64 in pyarrow diff --git a/pandas/_typing.py b/pandas/_typing.py index 6059bced4a7d4..9064764e35423 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -377,3 +377,4 @@ def closed(self) -> bool: Literal["pearson", "kendall", "spearman"], Callable[[np.ndarray, np.ndarray], float] ] AlignJoin = Literal["outer", "inner", "left", "right"] +DtypeBackend = Literal["pyarrow", "numpy_nullable"] diff --git a/pandas/conftest.py b/pandas/conftest.py index 05f473059758c..95bb2078d151c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1274,7 +1274,7 @@ def string_storage(request): @pytest.fixture( params=[ - "pandas", + "numpy_nullable", pytest.param("pyarrow", 
marks=td.skip_if_no("pyarrow")), ] ) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 2d9a3ae63259d..95802b0175f91 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -285,7 +285,7 @@ def _from_sequence_of_strings( ) -> T: from pandas.core.tools.numeric import to_numeric - scalars = to_numeric(strings, errors="raise", use_nullable_dtypes=True) + scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable") return cls._from_sequence(scalars, dtype=dtype, copy=copy) _HANDLED_TYPES = (np.ndarray, numbers.Number) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 3f0366f33a94b..54d1497ad05f3 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -487,13 +487,6 @@ def use_inf_as_na_cb(key) -> None: The default storage for StringDtype. """ -dtype_backend_doc = """ -: string - The nullable dtype implementation to return. Only applicable to certain - operations where documented. Available options: 'pandas', 'pyarrow', - the default is 'pandas'. -""" - with cf.config_prefix("mode"): cf.register_option( "string_storage", @@ -501,27 +494,6 @@ def use_inf_as_na_cb(key) -> None: string_storage_doc, validator=is_one_of_factory(["python", "pyarrow"]), ) - cf.register_option( - "dtype_backend", - "pandas", - dtype_backend_doc, - validator=is_one_of_factory(["pandas", "pyarrow"]), - ) - - -nullable_dtypes_doc = """ -: bool - If nullable dtypes should be returned. This is only applicable to functions - where the ``use_nullable_dtypes`` keyword is implemented. -""" - -with cf.config_prefix("mode"): - cf.register_option( - "nullable_dtypes", - False, - nullable_dtypes_doc, - validator=is_bool, - ) # Set up the io.excel specific reader configuration. 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index cfcae7f40919a..e9da7598d1ebc 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1007,7 +1007,7 @@ def convert_dtypes( convert_boolean: bool = True, convert_floating: bool = True, infer_objects: bool = False, - dtype_backend: Literal["pandas", "pyarrow"] = "pandas", + dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable", ) -> DtypeObj: """ Convert objects to best possible type, and optionally, @@ -1029,10 +1029,10 @@ def convert_dtypes( infer_objects : bool, defaults False Whether to also infer objects to float/int if possible. Is only hit if the object array contains pd.NA. - dtype_backend : str, default "pandas" + dtype_backend : str, default "numpy_nullable" Nullable dtype implementation to use. - * "pandas" returns numpy-backed nullable types + * "numpy_nullable" returns numpy-backed nullable types * "pyarrow" returns pyarrow-backed nullable types using ``ArrowDtype`` Returns diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 003e4cc5b8b23..c191ccd122e09 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -52,6 +52,7 @@ CompressionOptions, Dtype, DtypeArg, + DtypeBackend, DtypeObj, FilePath, FillnaOptions, @@ -6547,6 +6548,7 @@ def convert_dtypes( convert_integer: bool_t = True, convert_boolean: bool_t = True, convert_floating: bool_t = True, + dtype_backend: DtypeBackend = "numpy_nullable", ) -> NDFrameT: """ Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``. @@ -6567,6 +6569,11 @@ def convert_dtypes( dtypes if the floats can be faithfully casted to integers. .. versionadded:: 1.2.0 + dtype_backend : {"numpy_nullable", "pyarrow"}, default "numpy_nullable" + Which dtype_backend to use, e.g. whether a DataFrame should have nullable + extension dtypes or pyarrow dtypes. + + .. 
versionadded:: 2.0 Returns ------- @@ -6686,6 +6693,7 @@ def convert_dtypes( convert_integer, convert_boolean, convert_floating, + dtype_backend=dtype_backend, ) else: results = [ @@ -6695,6 +6703,7 @@ def convert_dtypes( convert_integer, convert_boolean, convert_floating, + dtype_backend=dtype_backend, ) for col_name, col in self.items() ] diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f2d216d834b59..4458c1dc09d41 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -970,7 +970,7 @@ def _validate_or_indexify_columns( def convert_object_array( content: list[npt.NDArray[np.object_]], dtype: DtypeObj | None, - use_nullable_dtypes: bool = False, + dtype_backend: str = "numpy", coerce_float: bool = False, ) -> list[ArrayLike]: """ @@ -980,7 +980,7 @@ def convert_object_array( ---------- content: List[np.ndarray] dtype: np.dtype or ExtensionDtype - use_nullable_dtypes: Controls if nullable dtypes are returned. + dtype_backend: Controls if nullable dtypes are returned. coerce_float: Cast floats that are integers to int. Returns @@ -994,7 +994,7 @@ def convert(arr): arr = lib.maybe_convert_objects( arr, try_float=coerce_float, - convert_to_nullable_dtype=use_nullable_dtypes, + convert_to_nullable_dtype=dtype_backend != "numpy", ) # Notes on cases that get here 2023-02-15 # 1) we DO get here when arr is all Timestamps and dtype=None @@ -1007,9 +1007,9 @@ def convert(arr): if arr.dtype == np.dtype("O"): # i.e. 
maybe_convert_objects didn't convert arr = maybe_infer_to_datetimelike(arr) - if use_nullable_dtypes and arr.dtype == np.dtype("O"): + if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): arr = StringDtype().construct_array_type()._from_sequence(arr) - elif use_nullable_dtypes and isinstance(arr, np.ndarray): + elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): if is_integer_dtype(arr.dtype): arr = IntegerArray(arr, np.zeros(arr.shape, dtype=np.bool_)) elif is_bool_dtype(arr.dtype): diff --git a/pandas/core/series.py b/pandas/core/series.py index 95ee3f1af58f1..42dc6a2a8f6f3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -155,6 +155,7 @@ CorrelationMethod, DropKeep, Dtype, + DtypeBackend, DtypeObj, FilePath, FillnaOptions, @@ -5439,6 +5440,7 @@ def _convert_dtypes( convert_integer: bool = True, convert_boolean: bool = True, convert_floating: bool = True, + dtype_backend: DtypeBackend = "numpy_nullable", ) -> Series: input_series = self if infer_objects: @@ -5447,7 +5449,6 @@ def _convert_dtypes( input_series = input_series.copy(deep=None) if convert_string or convert_integer or convert_boolean or convert_floating: - dtype_backend = get_option("mode.dtype_backend") inferred_dtype = convert_dtypes( input_series._values, convert_string, diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 7517d5278e52a..38034436d579d 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -7,11 +7,6 @@ import numpy as np -from pandas._config import ( - get_option, - using_nullable_dtypes, -) - from pandas._libs import lib from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -38,6 +33,7 @@ if TYPE_CHECKING: from pandas._typing import ( DateTimeErrorChoices, + DtypeBackend, npt, ) @@ -46,7 +42,7 @@ def to_numeric( arg, errors: DateTimeErrorChoices = "raise", downcast: Literal["integer", "signed", "unsigned", "float"] | None = None, - use_nullable_dtypes: bool | lib.NoDefault = 
lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ): """ Convert argument to a numeric type. @@ -91,20 +87,11 @@ def to_numeric( the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. - use_nullable_dtypes : bool = False - Whether or not to use nullable dtypes as default when converting data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: + dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). - - .. versionadded:: 2.0.0 + .. 
versionadded:: 2.0 Returns ------- @@ -175,12 +162,6 @@ def to_numeric( if errors not in ("ignore", "raise", "coerce"): raise ValueError("invalid error value specified") - _use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - is_series = False is_index = False is_scalars = False @@ -228,11 +209,11 @@ def to_numeric( values = ensure_object(values) coerce_numeric = errors not in ("ignore", "raise") try: - values, new_mask = lib.maybe_convert_numeric( + values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] # noqa values, set(), coerce_numeric=coerce_numeric, - convert_to_masked_nullable=_use_nullable_dtypes, + convert_to_masked_nullable=dtype_backend is not lib.no_default, ) except (ValueError, TypeError): if errors == "raise": @@ -242,7 +223,7 @@ def to_numeric( # Remove unnecessary values, is expected later anyway and enables # downcasting values = values[~new_mask] - elif _use_nullable_dtypes and new_mask is None: + elif dtype_backend is not lib.no_default and new_mask is None: new_mask = np.zeros(values.shape, dtype=np.bool_) # attempt downcast only if the data has been successfully converted @@ -302,9 +283,7 @@ def to_numeric( klass = FloatingArray values = klass(data, mask) - if get_option("mode.dtype_backend") == "pyarrow" or isinstance( - values_dtype, pd.ArrowDtype - ): + if dtype_backend == "pyarrow" or isinstance(values_dtype, pd.ArrowDtype): values = ArrowExtensionArray(values.__arrow_array__()) if is_series: diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index fa87e02793b55..15693b11f0b46 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -2,10 +2,9 @@ from __future__ import annotations from io import StringIO +from typing import TYPE_CHECKING import warnings -from pandas._config import using_nullable_dtypes - from pandas._libs import lib from pandas.util._exceptions import find_stack_level @@ -16,10 +15,13 @@ 
option_context, ) +if TYPE_CHECKING: + from pandas._typing import DtypeBackend + def read_clipboard( sep: str = r"\s+", - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, **kwargs, ): # pragma: no cover r""" @@ -31,18 +33,9 @@ def read_clipboard( A string or regex delimiter. The default of '\s+' denotes one or more whitespace characters. - use_nullable_dtypes : bool = False - Whether or not to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. 
versionadded:: 2.0 @@ -61,12 +54,6 @@ def read_clipboard( if encoding is not None and encoding.lower().replace("-", "") != "utf8": raise NotImplementedError("reading from clipboard only supports utf-8 encoding") - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - from pandas.io.clipboard import clipboard_get from pandas.io.parsers import read_csv @@ -113,9 +100,7 @@ def read_clipboard( stacklevel=find_stack_level(), ) - return read_csv( - StringIO(text), sep=sep, use_nullable_dtypes=use_nullable_dtypes, **kwargs - ) + return read_csv(StringIO(text), sep=sep, dtype_backend=dtype_backend, **kwargs) def to_clipboard( diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 065d8992ab0ac..1619a678ecfc8 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -23,10 +23,7 @@ ) import zipfile -from pandas._config import ( - config, - using_nullable_dtypes, -) +from pandas._config import config from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES @@ -72,6 +69,7 @@ from pandas._typing import ( DtypeArg, + DtypeBackend, FilePath, IntStrT, ReadBuffer, @@ -280,18 +278,9 @@ .. versionadded:: 1.2.0 -use_nullable_dtypes : bool, default False - Whether or not to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. Dtype takes precedence if given. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). +dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. 
whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. versionadded:: 2.0 @@ -399,7 +388,7 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> DataFrame: ... @@ -438,7 +427,7 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> dict[IntStrT, DataFrame]: ... @@ -477,7 +466,7 @@ def read_excel( comment: str | None = None, skipfooter: int = 0, storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | dict[IntStrT, DataFrame]: should_close = False if not isinstance(io, ExcelFile): @@ -489,12 +478,6 @@ def read_excel( "an ExcelFile - ExcelFile already has the engine set" ) - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - try: data = io.parse( sheet_name=sheet_name, @@ -519,7 +502,7 @@ def read_excel( decimal=decimal, comment=comment, skipfooter=skipfooter, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) finally: # make sure to close opened file handles @@ -723,7 +706,7 @@ def parse( decimal: str = ".", comment: str | None = None, skipfooter: int = 0, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, **kwds, ): validate_header_arg(header) @@ -882,7 +865,7 @@ def parse( comment=comment, skipfooter=skipfooter, usecols=usecols, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, **kwds, ) @@ -1547,7 +1530,7 @@ def parse( thousands: str | None = None, comment: str | None = None, skipfooter: int = 0, - 
use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, **kwds, ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]: """ @@ -1579,7 +1562,7 @@ def parse( thousands=thousands, comment=comment, skipfooter=skipfooter, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, **kwds, ) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 63aba65274de4..0295ade472962 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -7,14 +7,11 @@ Sequence, ) -from pandas._config import using_nullable_dtypes - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc import pandas as pd -from pandas import get_option from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs @@ -22,6 +19,7 @@ if TYPE_CHECKING: from pandas._typing import ( + DtypeBackend, FilePath, ReadBuffer, StorageOptions, @@ -70,7 +68,7 @@ def read_feather( columns: Sequence[Hashable] | None = None, use_threads: bool = True, storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ): """ Load a feather-format object from the file path. @@ -90,18 +88,9 @@ def read_feather( .. versionadded:: 1.2.0 - use_nullable_dtypes : bool = False - Whether or not to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). 
+ dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. versionadded:: 2.0 @@ -112,27 +101,19 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: - if not use_nullable_dtypes: + if dtype_backend is lib.no_default: return feather.read_feather( handles.handle, columns=columns, use_threads=bool(use_threads) ) - dtype_backend = get_option("mode.dtype_backend") - pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) ) - if dtype_backend == "pandas": + if dtype_backend == "numpy_nullable": from pandas.io._util import _arrow_dtype_mapping return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) diff --git a/pandas/io/html.py b/pandas/io/html.py index 25eb6bd0bbc90..15604101f2121 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -18,8 +18,6 @@ cast, ) -from pandas._config import using_nullable_dtypes - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -48,6 +46,7 @@ if TYPE_CHECKING: from pandas._typing import ( BaseBuffer, + DtypeBackend, FilePath, ReadBuffer, ) @@ -1007,7 +1006,7 @@ def read_html( keep_default_na: bool = True, displayed_only: bool = True, extract_links: Literal[None, "header", "footer", "body", "all"] = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1108,18 +1107,9 @@ def read_html( .. 
versionadded:: 1.5.0 - use_nullable_dtypes : bool = False - Whether to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. versionadded:: 2.0 @@ -1177,12 +1167,6 @@ def read_html( ) validate_header_arg(header) - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - io = stringify_path(io) return _parse( @@ -1202,5 +1186,5 @@ def read_html( keep_default_na=keep_default_na, displayed_only=displayed_only, extract_links=extract_links, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 28138249ec30b..91f150be9776c 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -20,11 +20,6 @@ import numpy as np -from pandas._config import ( - get_option, - using_nullable_dtypes, -) - from pandas._libs import lib from pandas._libs.json import ( dumps, @@ -77,6 +72,7 @@ from pandas._typing import ( CompressionOptions, DtypeArg, + DtypeBackend, FilePath, IndexLabel, JSONEngine, @@ -407,7 +403,7 @@ def read_json( compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., engine: JSONEngine = ..., ) -> JsonReader[Literal["frame"]]: ... 
@@ -432,7 +428,7 @@ def read_json( compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., engine: JSONEngine = ..., ) -> JsonReader[Literal["series"]]: ... @@ -457,7 +453,7 @@ def read_json( compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., engine: JSONEngine = ..., ) -> Series: ... @@ -482,7 +478,7 @@ def read_json( compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., engine: JSONEngine = ..., ) -> DataFrame: ... @@ -510,7 +506,7 @@ def read_json( compression: CompressionOptions = "infer", nrows: int | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, engine: JSONEngine = "ujson", ) -> DataFrame | Series | JsonReader: """ @@ -651,18 +647,9 @@ def read_json( .. versionadded:: 1.2.0 - use_nullable_dtypes : bool = False - Whether or not to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. 
versionadded:: 2.0 @@ -756,12 +743,6 @@ def read_json( if orient == "table" and convert_axes: raise ValueError("cannot pass both convert_axes and orient='table'") - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - if dtype is None and orient != "table": # error: Incompatible types in assignment (expression has type "bool", variable # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], @@ -789,7 +770,7 @@ def read_json( nrows=nrows, storage_options=storage_options, encoding_errors=encoding_errors, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, engine=engine, ) @@ -826,7 +807,7 @@ def __init__( nrows: int | None, storage_options: StorageOptions = None, encoding_errors: str | None = "strict", - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, engine: JSONEngine = "ujson", ) -> None: self.orient = orient @@ -847,7 +828,7 @@ def __init__( self.nrows = nrows self.encoding_errors = encoding_errors self.handles: IOHandles[str] | None = None - self.use_nullable_dtypes = use_nullable_dtypes + self.dtype_backend = dtype_backend if self.engine not in {"pyarrow", "ujson"}: raise ValueError( @@ -962,15 +943,13 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) - if self.use_nullable_dtypes: - if get_option("mode.dtype_backend") == "pyarrow": - return pa_table.to_pandas(types_mapper=ArrowDtype) + if self.dtype_backend == "pyarrow": + return pa_table.to_pandas(types_mapper=ArrowDtype) + elif self.dtype_backend == "numpy_nullable": + from pandas.io._util import _arrow_dtype_mapping - elif get_option("mode.dtype_backend") == "pandas": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - return pa_table.to_pandas(types_mapper=mapping.get) + mapping = 
_arrow_dtype_mapping() + return pa_table.to_pandas(types_mapper=mapping.get) return pa_table.to_pandas() elif self.engine == "ujson": if self.lines: @@ -986,8 +965,10 @@ def read(self) -> DataFrame | Series: obj = self._get_object_parser(self._combine_lines(data_lines)) else: obj = self._get_object_parser(self.data) - if self.use_nullable_dtypes: - return obj.convert_dtypes(infer_objects=False) + if self.dtype_backend is not lib.no_default: + return obj.convert_dtypes( + infer_objects=False, dtype_backend=self.dtype_backend + ) else: return obj @@ -1005,7 +986,7 @@ def _get_object_parser(self, json) -> DataFrame | Series: "keep_default_dates": self.keep_default_dates, "precise_float": self.precise_float, "date_unit": self.date_unit, - "use_nullable_dtypes": self.use_nullable_dtypes, + "dtype_backend": self.dtype_backend, } obj = None if typ == "frame": @@ -1064,8 +1045,10 @@ def __next__(self) -> DataFrame | Series: self.close() raise ex - if self.use_nullable_dtypes: - return obj.convert_dtypes(infer_objects=False) + if self.dtype_backend is not lib.no_default: + return obj.convert_dtypes( + infer_objects=False, dtype_backend=self.dtype_backend + ) else: return obj @@ -1103,7 +1086,7 @@ def __init__( keep_default_dates: bool = False, precise_float: bool = False, date_unit=None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> None: self.json = json @@ -1128,7 +1111,7 @@ def __init__( self.date_unit = date_unit self.keep_default_dates = keep_default_dates self.obj: DataFrame | Series | None = None - self.use_nullable_dtypes = use_nullable_dtypes + self.dtype_backend = dtype_backend def check_keys_split(self, decoded) -> None: """ @@ -1206,7 +1189,7 @@ def _try_convert_data( if result: return new_data, True - if self.use_nullable_dtypes and not isinstance(data, ABCIndex): + if self.dtype_backend is not lib.no_default and not isinstance(data, ABCIndex): # Fall through for conversion later on return data, True 
elif data.dtype == "object": diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 8db5d4608d3fd..a4ac9fe9d3042 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,11 +9,6 @@ Literal, ) -from pandas._config import ( - get_option, - using_nullable_dtypes, -) - from pandas._libs import lib from pandas.compat import pa_version_under8p0 from pandas.compat._optional import import_optional_dependency @@ -34,6 +29,7 @@ if TYPE_CHECKING: from pandas._typing import ( + DtypeBackend, FilePath, ReadBuffer, WriteBuffer, @@ -45,7 +41,7 @@ def read_orc( path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, filesystem=None, **kwargs, ) -> DataFrame: @@ -65,18 +61,9 @@ def read_orc( Output always follows the ordering of the file and not the columns list. This mirrors the original behaviour of :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`. - use_nullable_dtypes : bool, default False - Whether or not to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. 
versionadded:: 2.0 @@ -106,12 +93,6 @@ def read_orc( orc = import_optional_dependency("pyarrow.orc") - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - with get_handle(path, "rb", is_text=False) as handles: source = handles.handle if is_fsspec_url(path) and filesystem is None: @@ -125,8 +106,7 @@ def read_orc( pa_table = orc.read_table( source=source, columns=columns, filesystem=filesystem, **kwargs ) - if use_nullable_dtypes: - dtype_backend = get_option("mode.dtype_backend") + if dtype_backend is not lib.no_default: if dtype_backend == "pyarrow": df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) else: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 7ad3ba295068b..04a83c94ac737 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -8,14 +8,14 @@ Any, Literal, ) +import warnings from warnings import catch_warnings -from pandas._config import using_nullable_dtypes - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level import pandas as pd from pandas import ( @@ -36,6 +36,7 @@ if TYPE_CHECKING: from pandas._typing import ( + DtypeBackend, FilePath, ReadBuffer, StorageOptions, @@ -221,19 +222,20 @@ def read( path, columns=None, use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions = None, **kwargs, ) -> DataFrame: kwargs["use_pandas_metadata"] = True - dtype_backend = get_option("mode.dtype_backend") to_pandas_kwargs = {} - if use_nullable_dtypes: - if dtype_backend == "pandas": - from pandas.io._util import _arrow_dtype_mapping + if dtype_backend == "numpy_nullable": + from pandas.io._util import _arrow_dtype_mapping - mapping = _arrow_dtype_mapping() - to_pandas_kwargs["types_mapper"] = mapping.get + mapping 
= _arrow_dtype_mapping() + to_pandas_kwargs["types_mapper"] = mapping.get + elif dtype_backend == "pyarrow": + to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa manager = get_option("mode.data_manager") if manager == "array": @@ -249,13 +251,7 @@ def read( pa_table = self.api.parquet.read_table( path_or_handle, columns=columns, **kwargs ) - if dtype_backend == "pandas": - result = pa_table.to_pandas(**to_pandas_kwargs) - elif dtype_backend == "pyarrow": - # Incompatible types in assignment (expression has type - # "Type[ArrowDtype]", target has type overloaded function - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa - result = pa_table.to_pandas(**to_pandas_kwargs) + result = pa_table.to_pandas(**to_pandas_kwargs) if manager == "array": result = result._as_manager("array", copy=False) @@ -326,6 +322,7 @@ def read( ) -> DataFrame: parquet_kwargs: dict[str, Any] = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) + dtype_backend = kwargs.pop("dtype_backend", lib.no_default) if Version(self.api.__version__) >= Version("0.7.1"): # We are disabling nullable dtypes for fastparquet pending discussion parquet_kwargs["pandas_nulls"] = False @@ -334,6 +331,11 @@ def read( "The 'use_nullable_dtypes' argument is not supported for the " "fastparquet engine" ) + if dtype_backend is not lib.no_default: + raise ValueError( + "The 'dtype_backend' argument is not supported for the " + "fastparquet engine" + ) path = stringify_path(path) handles = None if is_fsspec_url(path): @@ -454,6 +456,7 @@ def read_parquet( columns: list[str] | None = None, storage_options: StorageOptions = None, use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, **kwargs, ) -> DataFrame: """ @@ -492,17 +495,13 @@ def read_parquet( Note: this is an experimental option, and behaviour (e.g. additional support dtypes) may change without notice. - .. 
versionadded:: 1.2.0 + .. deprecated:: 2.0 - .. note:: + dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). - - .. versionadded:: 2.0.0 + .. versionadded:: 2.0 **kwargs Any additional kwargs are passed to the engine. @@ -512,17 +511,21 @@ def read_parquet( DataFrame """ impl = get_engine(engine) - - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) + if use_nullable_dtypes is not lib.no_default: + warnings.warn( + "The argument 'use_nullable_dtypes' is deprecated. Use " + "dtype_backend='numpy_nullable' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + use_nullable_dtypes = False return impl.read( path, columns=columns, storage_options=storage_options, use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, **kwargs, ) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index e71a1732da037..b98a31e3f940b 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -7,10 +7,7 @@ from pandas.core.dtypes.inference import is_integer import pandas as pd -from pandas import ( - DataFrame, - get_option, -) +from pandas import DataFrame from pandas.io.parsers.base_parser import ParserBase @@ -152,10 +149,7 @@ def read(self) -> DataFrame: parse_options=pyarrow_csv.ParseOptions(**self.parse_options), convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), ) - if ( - self.kwds["use_nullable_dtypes"] - and 
get_option("mode.dtype_backend") == "pyarrow" - ): + if self.kwds["dtype_backend"] == "pyarrow": frame = table.to_pandas(types_mapper=pd.ArrowDtype) else: frame = table.to_pandas() diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 90ad0d04e0ea7..7a90fadcb9c6f 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -13,7 +13,6 @@ Hashable, Iterable, List, - Literal, Mapping, Sequence, Tuple, @@ -25,8 +24,6 @@ import numpy as np -from pandas._config.config import get_option - from pandas._libs import ( lib, parsers, @@ -127,7 +124,7 @@ def __init__(self, kwds) -> None: self.dtype = copy(kwds.get("dtype", None)) self.converters = kwds.get("converters") - self.use_nullable_dtypes = kwds.get("use_nullable_dtypes", False) + self.dtype_backend = kwds.get("dtype_backend") self.true_values = kwds.get("true_values") self.false_values = kwds.get("false_values") @@ -691,10 +688,10 @@ def _infer_types( np.putmask(values, mask, np.nan) return values, na_count - use_nullable_dtypes: Literal[True] | Literal[False] = ( - self.use_nullable_dtypes and no_dtype_specified + dtype_backend = self.dtype_backend + non_default_dtype_backend = ( + no_dtype_specified and dtype_backend is not lib.no_default ) - dtype_backend = get_option("mode.dtype_backend") result: ArrayLike if try_num_bool and is_object_dtype(values.dtype): @@ -704,7 +701,7 @@ def _infer_types( values, na_values, False, - convert_to_masked_nullable=use_nullable_dtypes, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] # noqa ) except (ValueError, TypeError): # e.g. 
encountering datetime string gets ValueError @@ -712,7 +709,7 @@ def _infer_types( na_count = parsers.sanitize_objects(values, na_values) result = values else: - if use_nullable_dtypes: + if non_default_dtype_backend: if result_mask is None: result_mask = np.zeros(result.shape, dtype=np.bool_) @@ -740,19 +737,19 @@ def _infer_types( np.asarray(values), true_values=self.true_values, false_values=self.false_values, - convert_to_masked_nullable=use_nullable_dtypes, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] # noqa ) - if result.dtype == np.bool_ and use_nullable_dtypes: + if result.dtype == np.bool_ and non_default_dtype_backend: if bool_mask is None: bool_mask = np.zeros(result.shape, dtype=np.bool_) result = BooleanArray(result, bool_mask) - elif result.dtype == np.object_ and use_nullable_dtypes: + elif result.dtype == np.object_ and non_default_dtype_backend: # read_excel sends array of datetime objects inferred_type = lib.infer_dtype(result) if inferred_type != "datetime": result = StringDtype().construct_array_type()._from_sequence(values) - if use_nullable_dtypes and dtype_backend == "pyarrow": + if dtype_backend == "pyarrow": pa = import_optional_dependency("pyarrow") if isinstance(result, np.ndarray): result = ArrowExtensionArray(pa.array(result, from_pandas=True)) @@ -1186,7 +1183,7 @@ def converter(*date_cols, col: Hashable): "skip_blank_lines": True, "encoding_errors": "strict", "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR, - "use_nullable_dtypes": False, + "dtype_backend": lib.no_default, } diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index e87a34ddee0ff..4b8bc5c402157 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -11,9 +11,10 @@ import numpy as np -from pandas._config.config import get_option - -from pandas._libs import parsers +from pandas._libs import ( + lib, + parsers, +) from pandas.compat._optional import 
import_optional_dependency from pandas.errors import DtypeWarning from pandas.util._exceptions import find_stack_level @@ -83,9 +84,9 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: kwds.pop(key, None) kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None)) - dtype_backend = get_option("mode.dtype_backend") - kwds["dtype_backend"] = dtype_backend - if dtype_backend == "pyarrow": + if "dtype_backend" not in kwds or kwds["dtype_backend"] is lib.no_default: + kwds["dtype_backend"] = "numpy" + if kwds["dtype_backend"] == "pyarrow": # Fail here loudly instead of in cython after reading import_optional_dependency("pyarrow") self._reader = parsers.TextReader(src, **kwds) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index af0d34745a317..88ff826d2bf81 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -24,8 +24,6 @@ import numpy as np -from pandas._config import using_nullable_dtypes - from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES from pandas.errors import ( @@ -71,6 +69,7 @@ CompressionOptions, CSVEngine, DtypeArg, + DtypeBackend, FilePath, IndexLabel, ReadCsvBuffer, @@ -403,18 +402,9 @@ .. versionadded:: 1.2 -use_nullable_dtypes : bool = False - Whether or not to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). +dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. 
versionadded:: 2.0 @@ -644,7 +634,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -701,7 +691,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -758,7 +748,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> DataFrame: ... @@ -815,7 +805,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> DataFrame | TextFileReader: ... @@ -888,7 +878,7 @@ def read_csv( memory_map: bool = False, float_precision: Literal["high", "legacy"] | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: if infer_datetime_format is not lib.no_default: warnings.warn( @@ -914,7 +904,7 @@ def read_csv( on_bad_lines, names, defaults={"delimiter": ","}, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) kwds.update(kwds_defaults) @@ -973,7 +963,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> TextFileReader: ... 
@@ -1030,7 +1020,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> TextFileReader: ... @@ -1087,7 +1077,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> DataFrame: ... @@ -1144,7 +1134,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> DataFrame | TextFileReader: ... @@ -1217,7 +1207,7 @@ def read_table( memory_map: bool = False, float_precision: str | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: if infer_datetime_format is not lib.no_default: warnings.warn( @@ -1244,7 +1234,7 @@ def read_table( on_bad_lines, names, defaults={"delimiter": "\t"}, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) kwds.update(kwds_defaults) @@ -1257,7 +1247,7 @@ def read_fwf( colspecs: Sequence[tuple[int, int]] | str | None = "infer", widths: Sequence[int] | None = None, infer_nrows: int = 100, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, **kwds, ) -> DataFrame | TextFileReader: r""" @@ -1289,20 +1279,9 @@ def read_fwf( infer_nrows : int, default 100 The number of rows to consider when letting the parser determine the `colspecs`. - use_nullable_dtypes : bool = False - Whether or not to use nullable dtypes as default when reading data. 
If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). - This is only implemented for the ``pyarrow`` or ``python`` - engines. + dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. versionadded:: 2.0 @@ -1330,12 +1309,6 @@ def read_fwf( if colspecs not in (None, "infer") and widths is not None: raise ValueError("You must specify only one of 'widths' and 'colspecs'") - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - # Compute 'colspecs' from 'widths', if specified. if widths is not None: colspecs, col = [], 0 @@ -1368,7 +1341,7 @@ def read_fwf( kwds["colspecs"] = colspecs kwds["infer_nrows"] = infer_nrows kwds["engine"] = "python-fwf" - kwds["use_nullable_dtypes"] = use_nullable_dtypes + kwds["dtype_backend"] = dtype_backend return _read(filepath_or_buffer, kwds) @@ -1907,7 +1880,7 @@ def _refine_defaults_read( on_bad_lines: str | Callable, names: Sequence[Hashable] | None | lib.NoDefault, defaults: dict[str, Any], - use_nullable_dtypes: bool | lib.NoDefault, + dtype_backend: DtypeBackend | lib.NoDefault, ): """Validate/refine default values of input parameters of read_csv, read_table. 
@@ -2021,12 +1994,7 @@ def _refine_defaults_read( else: raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - kwds["use_nullable_dtypes"] = use_nullable_dtypes + kwds["dtype_backend"] = dtype_backend return kwds diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 4f898b3e2402d..a80e4497999e6 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -5,8 +5,6 @@ Sequence, ) -from pandas._config import using_nullable_dtypes - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -17,6 +15,8 @@ if TYPE_CHECKING: from pathlib import Path + from pandas._typing import DtypeBackend + from pandas import DataFrame @@ -24,7 +24,7 @@ def read_spss( path: str | Path, usecols: Sequence[str] | None = None, convert_categoricals: bool = True, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame: """ Load an SPSS file from the file path, returning a DataFrame. @@ -37,18 +37,9 @@ def read_spss( Return a subset of the columns. If None, return all columns. convert_categoricals : bool, default is True Convert categorical columns into pd.Categorical. - use_nullable_dtypes : bool = False - Whether to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. 
whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. versionadded:: 2.0 @@ -58,12 +49,6 @@ def read_spss( """ pyreadstat = import_optional_dependency("pyreadstat") - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - if usecols is not None: if not is_list_like(usecols): raise TypeError("usecols must be list-like.") @@ -72,6 +57,6 @@ def read_spss( df, _ = pyreadstat.read_sav( stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals ) - if use_nullable_dtypes: - df = df.convert_dtypes() + if dtype_backend is not lib.no_default: + df = df.convert_dtypes(dtype_backend=dtype_backend) return df diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 340e181121bdb..5950f080e84a3 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -32,8 +32,6 @@ import numpy as np -from pandas._config import using_nullable_dtypes - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -72,6 +70,7 @@ from pandas._typing import ( DateTimeErrorChoices, DtypeArg, + DtypeBackend, IndexLabel, ) @@ -144,16 +143,15 @@ def _convert_arrays_to_dataframe( data, columns, coerce_float: bool = True, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame: content = lib.to_object_array_tuples(data) arrays = convert_object_array( list(content.T), dtype=None, coerce_float=coerce_float, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) - dtype_backend = get_option("mode.dtype_backend") if dtype_backend == "pyarrow": pa = import_optional_dependency("pyarrow") arrays = [ @@ -172,12 +170,10 @@ def _wrap_result( coerce_float: bool = True, parse_dates=None, dtype: DtypeArg | None = None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ): """Wrap result set of query in a 
DataFrame.""" - frame = _convert_arrays_to_dataframe( - data, columns, coerce_float, use_nullable_dtypes - ) + frame = _convert_arrays_to_dataframe(data, columns, coerce_float, dtype_backend) if dtype: frame = frame.astype(dtype) @@ -235,7 +231,7 @@ def read_sql_table( parse_dates: list[str] | dict[str, str] | None = ..., columns: list[str] | None = ..., chunksize: None = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> DataFrame: ... @@ -250,7 +246,7 @@ def read_sql_table( parse_dates: list[str] | dict[str, str] | None = ..., columns: list[str] | None = ..., chunksize: int = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> Iterator[DataFrame]: ... @@ -264,7 +260,7 @@ def read_sql_table( parse_dates: list[str] | dict[str, str] | None = None, columns: list[str] | None = None, chunksize: int | None = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL database table into a DataFrame. @@ -301,18 +297,9 @@ def read_sql_table( chunksize : int, default None If specified, returns an iterator where `chunksize` is the number of rows to include in each chunk. - use_nullable_dtypes : bool = False - Whether to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. 
whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. versionadded:: 2.0 @@ -335,11 +322,9 @@ def read_sql_table( -------- >>> pd.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP """ - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) + + if dtype_backend is lib.no_default: + dtype_backend = "numpy" # type: ignore[assignment] with pandasSQL_builder(con, schema=schema, need_transaction=True) as pandas_sql: if not pandas_sql.has_table(table_name): @@ -352,7 +337,7 @@ def read_sql_table( parse_dates=parse_dates, columns=columns, chunksize=chunksize, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) if table is not None: @@ -371,7 +356,7 @@ def read_sql_query( parse_dates: list[str] | dict[str, str] | None = ..., chunksize: None = ..., dtype: DtypeArg | None = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> DataFrame: ... @@ -386,7 +371,7 @@ def read_sql_query( parse_dates: list[str] | dict[str, str] | None = ..., chunksize: int = ..., dtype: DtypeArg | None = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., ) -> Iterator[DataFrame]: ... @@ -400,7 +385,7 @@ def read_sql_query( parse_dates: list[str] | dict[str, str] | None = None, chunksize: int | None = None, dtype: DtypeArg | None = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL query into a DataFrame. @@ -444,18 +429,9 @@ def read_sql_query( {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’}. .. versionadded:: 1.3.0 - use_nullable_dtypes : bool = False - Whether to use nullable dtypes as default when reading data. 
If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. versionadded:: 2.0 @@ -473,11 +449,9 @@ def read_sql_query( Any datetime values with time zone information parsed via the `parse_dates` parameter will be converted to UTC. """ - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) + + if dtype_backend is lib.no_default: + dtype_backend = "numpy" # type: ignore[assignment] with pandasSQL_builder(con) as pandas_sql: return pandas_sql.read_query( @@ -488,7 +462,7 @@ def read_sql_query( parse_dates=parse_dates, chunksize=chunksize, dtype=dtype, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) @@ -502,7 +476,7 @@ def read_sql( parse_dates=..., columns: list[str] = ..., chunksize: None = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., dtype: DtypeArg | None = None, ) -> DataFrame: ... @@ -518,7 +492,7 @@ def read_sql( parse_dates=..., columns: list[str] = ..., chunksize: int = ..., - use_nullable_dtypes: bool | lib.NoDefault = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., dtype: DtypeArg | None = None, ) -> Iterator[DataFrame]: ... 
@@ -533,7 +507,7 @@ def read_sql( parse_dates=None, columns: list[str] | None = None, chunksize: int | None = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, dtype: DtypeArg | None = None, ) -> DataFrame | Iterator[DataFrame]: """ @@ -582,18 +556,9 @@ def read_sql( chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. - use_nullable_dtypes : bool = False - Whether to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. 
versionadded:: 2.0 dtype : Type name or dict of columns @@ -644,11 +609,9 @@ def read_sql( 0 0 2012-11-10 1 1 2010-11-12 """ - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) + + if dtype_backend is lib.no_default: + dtype_backend = "numpy" # type: ignore[assignment] with pandasSQL_builder(con) as pandas_sql: if isinstance(pandas_sql, SQLiteDatabase): @@ -659,7 +622,7 @@ def read_sql( coerce_float=coerce_float, parse_dates=parse_dates, chunksize=chunksize, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, # type: ignore[arg-type] dtype=dtype, ) @@ -677,7 +640,7 @@ def read_sql( parse_dates=parse_dates, columns=columns, chunksize=chunksize, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) else: return pandas_sql.read_query( @@ -687,7 +650,7 @@ def read_sql( coerce_float=coerce_float, parse_dates=parse_dates, chunksize=chunksize, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, dtype=dtype, ) @@ -1055,7 +1018,7 @@ def _query_iterator( columns, coerce_float: bool = True, parse_dates=None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ): """Return generator through chunked result set.""" has_read_data = False @@ -1071,11 +1034,11 @@ def _query_iterator( has_read_data = True self.frame = _convert_arrays_to_dataframe( - data, columns, coerce_float, use_nullable_dtypes + data, columns, coerce_float, dtype_backend ) self._harmonize_columns( - parse_dates=parse_dates, use_nullable_dtypes=use_nullable_dtypes + parse_dates=parse_dates, dtype_backend=dtype_backend ) if self.index is not None: @@ -1090,7 +1053,7 @@ def read( parse_dates=None, columns=None, chunksize=None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame | Iterator[DataFrame]: from sqlalchemy import select @@ -1113,16 +1076,16 @@ def read( 
column_names, coerce_float=coerce_float, parse_dates=parse_dates, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) else: data = result.fetchall() self.frame = _convert_arrays_to_dataframe( - data, column_names, coerce_float, use_nullable_dtypes + data, column_names, coerce_float, dtype_backend ) self._harmonize_columns( - parse_dates=parse_dates, use_nullable_dtypes=use_nullable_dtypes + parse_dates=parse_dates, dtype_backend=dtype_backend ) if self.index is not None: @@ -1207,7 +1170,9 @@ def _create_table_setup(self): return Table(self.name, meta, *columns, schema=schema) def _harmonize_columns( - self, parse_dates=None, use_nullable_dtypes: bool = False + self, + parse_dates=None, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> None: """ Make the DataFrame's column types align with the SQL table @@ -1248,11 +1213,11 @@ def _harmonize_columns( # Convert tz-aware Datetime SQL columns to UTC utc = col_type is DatetimeTZDtype self.frame[col_name] = _handle_date_column(df_col, utc=utc) - elif not use_nullable_dtypes and col_type is float: + elif dtype_backend == "numpy" and col_type is float: # floats support NA, can always convert! 
self.frame[col_name] = df_col.astype(col_type, copy=False) - elif not use_nullable_dtypes and len(df_col) == df_col.count(): + elif dtype_backend == "numpy" and len(df_col) == df_col.count(): # No NA values, can convert ints and bools if col_type is np.dtype("int64") or col_type is bool: self.frame[col_name] = df_col.astype(col_type, copy=False) @@ -1379,7 +1344,7 @@ def read_table( columns=None, schema: str | None = None, chunksize: int | None = None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame | Iterator[DataFrame]: raise NotImplementedError @@ -1393,7 +1358,7 @@ def read_query( params=None, chunksize: int | None = None, dtype: DtypeArg | None = None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame | Iterator[DataFrame]: pass @@ -1587,7 +1552,7 @@ def read_table( columns=None, schema: str | None = None, chunksize: int | None = None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame | Iterator[DataFrame]: """ Read SQL database table into a DataFrame. @@ -1620,18 +1585,9 @@ def read_table( chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. - use_nullable_dtypes : bool = False - Whether to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy + Which dtype_backend to use, e.g. 
whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. versionadded:: 2.0 @@ -1655,7 +1611,7 @@ def read_table( parse_dates=parse_dates, columns=columns, chunksize=chunksize, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) @staticmethod @@ -1668,7 +1624,7 @@ def _query_iterator( coerce_float: bool = True, parse_dates=None, dtype: DtypeArg | None = None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ): """Return generator through chunked result set""" has_read_data = False @@ -1684,7 +1640,7 @@ def _query_iterator( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) break @@ -1696,7 +1652,7 @@ def _query_iterator( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) def read_query( @@ -1708,7 +1664,7 @@ def read_query( params=None, chunksize: int | None = None, dtype: DtypeArg | None = None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame | Iterator[DataFrame]: """ Read SQL query into a DataFrame. 
@@ -1770,7 +1726,7 @@ def read_query( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) else: data = result.fetchall() @@ -1781,7 +1737,7 @@ def read_query( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) return frame @@ -2243,7 +2199,7 @@ def _query_iterator( coerce_float: bool = True, parse_dates=None, dtype: DtypeArg | None = None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ): """Return generator through chunked result set""" has_read_data = False @@ -2270,7 +2226,7 @@ def _query_iterator( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) def read_query( @@ -2282,7 +2238,7 @@ def read_query( params=None, chunksize: int | None = None, dtype: DtypeArg | None = None, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame | Iterator[DataFrame]: cursor = self.execute(sql, params) columns = [col_desc[0] for col_desc in cursor.description] @@ -2296,7 +2252,7 @@ def read_query( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) else: data = self._fetchall_as_list(cursor) @@ -2309,7 +2265,7 @@ def read_query( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) return frame diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 25ef098881a92..03b31250fd6cb 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -12,8 +12,6 @@ Sequence, ) -from pandas._config import using_nullable_dtypes - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -45,6 
+43,7 @@ CompressionOptions, ConvertersArg, DtypeArg, + DtypeBackend, FilePath, ParseDatesArg, ReadBuffer, @@ -778,7 +777,7 @@ def _parse( iterparse: dict[str, list[str]] | None, compression: CompressionOptions, storage_options: StorageOptions, - use_nullable_dtypes: bool = False, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, **kwargs, ) -> DataFrame: """ @@ -848,7 +847,7 @@ def _parse( dtype=dtype, converters=converters, parse_dates=parse_dates, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, **kwargs, ) @@ -875,7 +874,7 @@ def read_xml( iterparse: dict[str, list[str]] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, - use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame: r""" Read XML document into a ``DataFrame`` object. @@ -987,18 +986,9 @@ def read_xml( {storage_options} - use_nullable_dtypes : bool = False - Whether or not to use nullable dtypes as default when reading data. If - set to True, nullable dtypes are used for all dtypes that have a nullable - implementation, even if no nulls are present. - - .. note:: - - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable extension dtypes or pyarrow dtypes. .. 
versionadded:: 2.0 @@ -1119,12 +1109,6 @@ def read_xml( 2 triangle 180 3.0 """ - use_nullable_dtypes = ( - use_nullable_dtypes - if use_nullable_dtypes is not lib.no_default - else using_nullable_dtypes() - ) - return _parse( path_or_buffer=path_or_buffer, xpath=xpath, @@ -1141,5 +1125,5 @@ def read_xml( iterparse=iterparse, compression=compression, storage_options=storage_options, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 919d25f5ba993..ad3be3d4014a7 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -56,8 +56,7 @@ def test_pyarrow_dtype_backend(self): "f": pd.Series(pd.timedelta_range("1D", periods=3)), } ) - with pd.option_context("mode.dtype_backend", "pyarrow"): - result = df.convert_dtypes() + result = df.convert_dtypes(dtype_backend="pyarrow") expected = pd.DataFrame( { "a": pd.arrays.ArrowExtensionArray( @@ -93,8 +92,7 @@ def test_pyarrow_dtype_backend(self): def test_pyarrow_dtype_backend_already_pyarrow(self): pytest.importorskip("pyarrow") expected = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]") - with pd.option_context("mode.dtype_backend", "pyarrow"): - result = expected.convert_dtypes() + result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_frame_equal(result, expected) def test_pyarrow_dtype_backend_from_pandas_nullable(self): @@ -107,8 +105,7 @@ def test_pyarrow_dtype_backend_from_pandas_nullable(self): "d": pd.Series([None, 100.5, 200], dtype="Float64"), } ) - with pd.option_context("mode.dtype_backend", "pyarrow"): - result = df.convert_dtypes() + result = df.convert_dtypes(dtype_backend="pyarrow") expected = pd.DataFrame( { "a": pd.arrays.ArrowExtensionArray( @@ -125,6 +122,5 @@ def test_pyarrow_dtype_empty_object(self): # GH 50970 pytest.importorskip("pyarrow") expected = pd.DataFrame(columns=[0]) - with 
pd.option_context("mode.dtype_backend", "pyarrow"): - result = expected.convert_dtypes() + result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 66e55fe86d964..2ab71c26a05c4 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -535,8 +535,7 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) - @pytest.mark.parametrize("option", [True, False]) - def test_use_nullable_dtypes(self, read_ext, dtype_backend, option): + def test_dtype_backend(self, read_ext, dtype_backend): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") @@ -557,14 +556,9 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend, option): ) with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, "test", index=False) - with pd.option_context("mode.dtype_backend", dtype_backend): - if not option: - result = pd.read_excel( - file_path, sheet_name="test", use_nullable_dtypes=True - ) - else: - with pd.option_context("mode.nullable_dtypes", True): - result = pd.read_excel(file_path, sheet_name="test") + result = pd.read_excel( + file_path, sheet_name="test", dtype_backend=dtype_backend + ) if dtype_backend == "pyarrow": import pyarrow as pa @@ -586,7 +580,7 @@ def test_use_nullable_dtypes(self, read_ext, dtype_backend, option): expected = df tm.assert_frame_equal(result, expected) - def test_use_nullabla_dtypes_and_dtype(self, read_ext): + def test_dtype_backend_and_dtype(self, read_ext): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") @@ -595,12 +589,15 @@ def test_use_nullabla_dtypes_and_dtype(self, read_ext): with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, "test", index=False) result 
= pd.read_excel( - file_path, sheet_name="test", use_nullable_dtypes=True, dtype="float64" + file_path, + sheet_name="test", + dtype_backend="numpy_nullable", + dtype="float64", ) tm.assert_frame_equal(result, df) @td.skip_if_no("pyarrow") - def test_use_nullable_dtypes_string(self, read_ext, string_storage): + def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") @@ -617,7 +614,7 @@ def test_use_nullable_dtypes_string(self, read_ext, string_storage): with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, "test", index=False) result = pd.read_excel( - file_path, sheet_name="test", use_nullable_dtypes=True + file_path, sheet_name="test", dtype_backend="numpy_nullable" ) if string_storage == "python": diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7e2d123c72b01..fde62eb7a91a5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1867,8 +1867,7 @@ def test_json_uint64(self): @pytest.mark.parametrize( "orient", ["split", "records", "values", "index", "columns"] ) - @pytest.mark.parametrize("option", [True, False]) - def test_read_json_nullable(self, string_storage, dtype_backend, orient, option): + def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient): # GH#50750 pa = pytest.importorskip("pyarrow") df = DataFrame( @@ -1894,12 +1893,7 @@ def test_read_json_nullable(self, string_storage, dtype_backend, orient, option) out = df.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - if option: - with pd.option_context("mode.nullable_dtypes", option): - result = read_json(out, orient=orient) - else: - result = read_json(out, use_nullable_dtypes=True, orient=orient) + result = read_json(out, dtype_backend=dtype_backend, orient=orient) expected = 
DataFrame( { @@ -1937,10 +1931,9 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): out = ser.to_json(orient=orient) with pd.option_context("mode.string_storage", string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - result = read_json( - out, use_nullable_dtypes=True, orient=orient, typ="series" - ) + result = read_json( + out, dtype_backend=dtype_backend, orient=orient, typ="series" + ) expected = Series([1, np.nan, 3], dtype="Int64") diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 21fec973897c0..d0e5cd02767bf 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -403,7 +403,7 @@ def test_dtypes_defaultdict_invalid(all_parsers): @pytest.mark.usefixtures("pyarrow_xfail") -def test_use_nullable_dtypes(all_parsers): +def test_dtype_backend(all_parsers): # GH#36712 parser = all_parsers @@ -413,7 +413,7 @@ def test_use_nullable_dtypes(all_parsers): 3,4.5,False,b,6,7.5,True,a,12-31-2019, """ result = parser.read_csv( - StringIO(data), use_nullable_dtypes=True, parse_dates=["i"] + StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"] ) expected = DataFrame( { @@ -432,7 +432,7 @@ def test_use_nullable_dtypes(all_parsers): tm.assert_frame_equal(result, expected) -def test_use_nullabla_dtypes_and_dtype(all_parsers): +def test_dtype_backend_and_dtype(all_parsers): # GH#36712 parser = all_parsers @@ -441,13 +441,15 @@ def test_use_nullabla_dtypes_and_dtype(all_parsers): 1,2.5 , """ - result = parser.read_csv(StringIO(data), use_nullable_dtypes=True, dtype="float64") + result = parser.read_csv( + StringIO(data), dtype_backend="numpy_nullable", dtype="float64" + ) expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]}) tm.assert_frame_equal(result, expected) @pytest.mark.usefixtures("pyarrow_xfail") -def test_use_nullable_dtypes_string(all_parsers, 
string_storage): +def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -458,7 +460,7 @@ def test_use_nullable_dtypes_string(all_parsers, string_storage): a,x b, """ - result = parser.read_csv(StringIO(data), use_nullable_dtypes=True) + result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable") if string_storage == "python": expected = DataFrame( @@ -477,18 +479,20 @@ def test_use_nullable_dtypes_string(all_parsers, string_storage): tm.assert_frame_equal(result, expected) -def test_use_nullable_dtypes_ea_dtype_specified(all_parsers): +def test_dtype_backend_ea_dtype_specified(all_parsers): # GH#491496 data = """a,b 1,2 """ parser = all_parsers - result = parser.read_csv(StringIO(data), dtype="Int64", use_nullable_dtypes=True) + result = parser.read_csv( + StringIO(data), dtype="Int64", dtype_backend="numpy_nullable" + ) expected = DataFrame({"a": [1], "b": 2}, dtype="Int64") tm.assert_frame_equal(result, expected) -def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request): +def test_dtype_backend_pyarrow(all_parsers, request): # GH#36712 pa = pytest.importorskip("pyarrow") parser = all_parsers @@ -498,43 +502,24 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request): 1,2.5,True,a,,,,,12-31-2019, 3,4.5,False,b,6,7.5,True,a,12-31-2019, """ - with pd.option_context("mode.dtype_backend", "pyarrow"): - result = parser.read_csv( - StringIO(data), use_nullable_dtypes=True, parse_dates=["i"] - ) - expected = DataFrame( - { - "a": pd.Series([1, 3], dtype="int64[pyarrow]"), - "b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"), - "c": pd.Series([True, False], dtype="bool[pyarrow]"), - "d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())), - "e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"), - "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"), - "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"), - "h": pd.Series( - [pd.NA if engine != "pyarrow" else "", "a"], - 
dtype=pd.ArrowDtype(pa.string()), - ), - "i": pd.Series([Timestamp("2019-12-31")] * 2), - "j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"), - } - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.usefixtures("pyarrow_xfail") -def test_use_nullable_dtypes_option(all_parsers): - # GH#50748 - - parser = all_parsers - - data = """a -1 -3 -""" - with pd.option_context("mode.nullable_dtypes", True): - result = parser.read_csv(StringIO(data)) - expected = DataFrame({"a": pd.Series([1, 3], dtype="Int64")}) + result = parser.read_csv(StringIO(data), dtype_backend="pyarrow", parse_dates=["i"]) + expected = DataFrame( + { + "a": pd.Series([1, 3], dtype="int64[pyarrow]"), + "b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"), + "c": pd.Series([True, False], dtype="bool[pyarrow]"), + "d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())), + "e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"), + "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"), + "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"), + "h": pd.Series( + [pd.NA if engine != "pyarrow" else "", "a"], + dtype=pd.ArrowDtype(pa.string()), + ), + "i": pd.Series([Timestamp("2019-12-31")] * 2), + "j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"), + } + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 47379aaab6feb..fe2de5355a6be 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -959,7 +959,7 @@ def test_widths_and_usecols(): tm.assert_frame_equal(result, expected) -def test_use_nullable_dtypes(string_storage, dtype_backend): +def test_dtype_backend(string_storage, dtype_backend): # GH#50289 if string_storage == "python": arr = StringArray(np.array(["a", "b"], dtype=np.object_)) @@ -973,8 +973,7 @@ def test_use_nullable_dtypes(string_storage, dtype_backend): 1 2.5 True a 3 4.5 False b True 6 7.5 a""" with pd.option_context("mode.string_storage", 
string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - result = read_fwf(StringIO(data), use_nullable_dtypes=True) + result = read_fwf(StringIO(data), dtype_backend=dtype_backend) expected = DataFrame( { @@ -1002,16 +1001,3 @@ def test_use_nullable_dtypes(string_storage, dtype_backend): expected["i"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) - - -def test_use_nullable_dtypes_option(): - # GH#50748 - - data = """a -1 -3""" - with pd.option_context("mode.nullable_dtypes", True): - result = read_fwf(StringIO(data)) - - expected = DataFrame({"a": pd.Series([1, 3], dtype="Int64")}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index b1419cfda1382..558822b84620a 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -25,7 +25,7 @@ def test_maybe_upcast(any_real_numpy_dtype): dtype = np.dtype(any_real_numpy_dtype) na_value = na_values[dtype] arr = np.array([1, 2, na_value], dtype=dtype) - result = _maybe_upcast(arr, use_nullable_dtypes=True) + result = _maybe_upcast(arr, use_dtype_backend=True) expected_mask = np.array([False, False, True]) if issubclass(dtype.type, np.integer): @@ -42,7 +42,7 @@ def test_maybe_upcast_no_na(any_real_numpy_dtype): pytest.skip() arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype) - result = _maybe_upcast(arr, use_nullable_dtypes=True) + result = _maybe_upcast(arr, use_dtype_backend=True) expected_mask = np.array([False, False, False]) if issubclass(np.dtype(any_real_numpy_dtype).type, np.integer): @@ -58,7 +58,7 @@ def test_maybe_upcaste_bool(): dtype = np.bool_ na_value = na_values[dtype] arr = np.array([True, False, na_value], dtype="uint8").view(dtype) - result = _maybe_upcast(arr, use_nullable_dtypes=True) + result = _maybe_upcast(arr, use_dtype_backend=True) expected_mask = np.array([False, False, True]) expected = BooleanArray(arr, 
mask=expected_mask) @@ -69,7 +69,7 @@ def test_maybe_upcaste_bool_no_nan(): # GH#36712 dtype = np.bool_ arr = np.array([True, False, False], dtype="uint8").view(dtype) - result = _maybe_upcast(arr, use_nullable_dtypes=True) + result = _maybe_upcast(arr, use_dtype_backend=True) expected_mask = np.array([False, False, False]) expected = BooleanArray(arr, mask=expected_mask) @@ -81,7 +81,7 @@ def test_maybe_upcaste_all_nan(): dtype = np.int64 na_value = na_values[dtype] arr = np.array([na_value, na_value], dtype=dtype) - result = _maybe_upcast(arr, use_nullable_dtypes=True) + result = _maybe_upcast(arr, use_dtype_backend=True) expected_mask = np.array([True, True]) expected = IntegerArray(arr, mask=expected_mask) @@ -96,7 +96,7 @@ def test_maybe_upcast_object(val, string_storage): with pd.option_context("mode.string_storage", string_storage): arr = np.array(["a", "b", val], dtype=np.object_) - result = _maybe_upcast(arr, use_nullable_dtypes=True) + result = _maybe_upcast(arr, use_dtype_backend=True) if string_storage == "python": exp_val = "c" if val == "c" else NA diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index eeadd8bc56c74..94f073b1abb86 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -419,7 +419,7 @@ def test_raw_roundtrip(self, data): subprocess.run(["xsel", "--delete", "--clipboard"], check=True) @pytest.mark.parametrize("engine", ["c", "python"]) - def test_read_clipboard_nullable_dtypes( + def test_read_clipboard_dtype_backend( self, request, mock_clipboard, string_storage, dtype_backend, engine ): # GH#50502 @@ -440,10 +440,7 @@ def test_read_clipboard_nullable_dtypes( mock_clipboard[request.node.name] = text with pd.option_context("mode.string_storage", string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - result = read_clipboard( - sep=",", use_nullable_dtypes=True, engine=engine - ) + result = read_clipboard(sep=",", dtype_backend=dtype_backend, 
engine=engine) expected = DataFrame( { @@ -470,20 +467,3 @@ def test_read_clipboard_nullable_dtypes( expected["g"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("engine", ["c", "python"]) - def test_read_clipboard_nullable_dtypes_option( - self, request, mock_clipboard, engine - ): - # GH#50748 - - text = """a -1 -2""" - mock_clipboard[request.node.name] = text - - with pd.option_context("mode.nullable_dtypes", True): - result = read_clipboard(sep=",", engine=engine) - - expected = DataFrame({"a": Series([1, 2], dtype="Int64")}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index c29ade770a072..203472b0d0953 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -155,8 +155,7 @@ def test_http_path(self, feather_file): res = read_feather(url) tm.assert_frame_equal(expected, res) - @pytest.mark.parametrize("option", [True, False]) - def test_read_json_nullable(self, string_storage, dtype_backend, option): + def test_read_feather_dtype_backend(self, string_storage, dtype_backend): # GH#50765 pa = pytest.importorskip("pyarrow") df = pd.DataFrame( @@ -183,12 +182,7 @@ def test_read_json_nullable(self, string_storage, dtype_backend, option): with tm.ensure_clean() as path: to_feather(df, path) with pd.option_context("mode.string_storage", string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - if option: - with pd.option_context("mode.nullable_dtypes", option): - result = read_feather(path) - else: - result = read_feather(path, use_nullable_dtypes=True) + result = read_feather(path, dtype_backend=dtype_backend) expected = pd.DataFrame( { diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 514d96a4d9ec6..1595fa86567c9 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -137,7 +137,7 @@ def test_to_html_compat(self): res 
= self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] tm.assert_frame_equal(res, df) - def test_use_nullable_dtypes(self, string_storage, dtype_backend): + def test_dtype_backend(self, string_storage, dtype_backend): # GH#50286 df = DataFrame( { @@ -163,8 +163,7 @@ def test_use_nullable_dtypes(self, string_storage, dtype_backend): out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - result = self.read_html(out, use_nullable_dtypes=True)[0] + result = self.read_html(out, dtype_backend=dtype_backend)[0] expected = DataFrame( { @@ -193,17 +192,6 @@ def test_use_nullable_dtypes(self, string_storage, dtype_backend): tm.assert_frame_equal(result, expected) - def test_use_nullable_dtypes_option(self): - # GH#50748 - df = DataFrame({"a": Series([1, np.nan, 3], dtype="Int64")}) - - out = df.to_html(index=False) - with pd.option_context("mode.nullable_dtypes", True): - result = self.read_html(out)[0] - - expected = DataFrame({"a": Series([1, np.nan, 3], dtype="Int64")}) - tm.assert_frame_equal(result, expected) - @pytest.mark.network @tm.network( url=( diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 9db19d4eb8448..35df047915255 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -307,7 +307,7 @@ def test_orc_writer_dtypes_not_supported(df_not_supported): @td.skip_if_no("pyarrow", min_version="7.0.0") -def test_orc_use_nullable_dtypes_pyarrow_backend(): +def test_orc_dtype_backend_pyarrow(): df = pd.DataFrame( { "string": list("abc"), @@ -329,8 +329,7 @@ def test_orc_use_nullable_dtypes_pyarrow_backend(): ) bytes_data = df.copy().to_orc() - with pd.option_context("mode.dtype_backend", "pyarrow"): - result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) + result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow") expected = pd.DataFrame( { @@ -343,7 +342,7 @@ def 
test_orc_use_nullable_dtypes_pyarrow_backend(): @td.skip_if_no("pyarrow", min_version="7.0.0") -def test_orc_use_nullable_dtypes_pandas_backend(): +def test_orc_dtype_backend_numpy_nullable(): # GH#50503 df = pd.DataFrame( { @@ -361,8 +360,7 @@ def test_orc_use_nullable_dtypes_pandas_backend(): ) bytes_data = df.copy().to_orc() - with pd.option_context("mode.dtype_backend", "pandas"): - result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) + result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable") expected = pd.DataFrame( { @@ -386,19 +384,6 @@ def test_orc_use_nullable_dtypes_pandas_backend(): tm.assert_frame_equal(result, expected) -@td.skip_if_no("pyarrow", min_version="7.0.0") -def test_orc_use_nullable_dtypes_option(): - # GH#50748 - df = pd.DataFrame({"int": list(range(1, 4))}) - - bytes_data = df.copy().to_orc() - with pd.option_context("mode.nullable_dtypes", True): - result = read_orc(BytesIO(bytes_data)) - - expected = pd.DataFrame({"int": pd.Series([1, 2, 3], dtype="Int64")}) - tm.assert_frame_equal(result, expected) - - def test_orc_uri_path(): expected = pd.DataFrame({"int": list(range(1, 4))}) with tm.ensure_clean("tmp.orc") as path: diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2124787e8a80e..2276dd0136e87 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -592,7 +592,7 @@ def test_write_column_index_nonstring(self, pa): self.check_error_on_write(df, engine, ValueError, msg) @pytest.mark.skipif(pa_version_under7p0, reason="minimum pyarrow not installed") - def test_use_nullable_dtypes(self, engine, request): + def test_dtype_backend(self, engine, request): import pyarrow.parquet as pq if engine == "fastparquet": @@ -620,7 +620,7 @@ def test_use_nullable_dtypes(self, engine, request): # write manually with pyarrow to write integers pq.write_table(table, path) result1 = read_parquet(path, engine=engine) - result2 = read_parquet(path, engine=engine, 
use_nullable_dtypes=True) + result2 = read_parquet(path, engine=engine, dtype_backend="numpy_nullable") assert result1["a"].dtype == np.dtype("float64") expected = pd.DataFrame( @@ -641,29 +641,6 @@ def test_use_nullable_dtypes(self, engine, request): expected = expected.drop("c", axis=1) tm.assert_frame_equal(result2, expected) - @pytest.mark.skipif(pa_version_under7p0, reason="minimum pyarrow not installed") - def test_use_nullable_dtypes_option(self, engine, request): - # GH#50748 - import pyarrow.parquet as pq - - if engine == "fastparquet": - # We are manually disabling fastparquet's - # nullable dtype support pending discussion - mark = pytest.mark.xfail( - reason="Fastparquet nullable dtype support is disabled" - ) - request.node.add_marker(mark) - - table = pyarrow.table({"a": pyarrow.array([1, 2, 3, None], "int64")}) - with tm.ensure_clean() as path: - # write manually with pyarrow to write integers - pq.write_table(table, path) - with pd.option_context("mode.nullable_dtypes", True): - result2 = read_parquet(path, engine=engine) - - expected = pd.DataFrame({"a": pd.array([1, 2, 3, None], dtype="Int64")}) - tm.assert_frame_equal(result2, expected) - @pytest.mark.parametrize( "dtype", [ @@ -694,7 +671,7 @@ def test_read_empty_array(self, pa, dtype): } ) check_round_trip( - df, pa, read_kwargs={"use_nullable_dtypes": True}, expected=expected + df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected ) @@ -1022,7 +999,7 @@ def test_read_parquet_manager(self, pa, using_array_manager): else: assert isinstance(result._mgr, pd.core.internals.BlockManager) - def test_read_use_nullable_types_pyarrow_config(self, pa, df_full): + def test_read_dtype_backend_pyarrow_config(self, pa, df_full): import pyarrow df = df_full @@ -1044,27 +1021,25 @@ def test_read_use_nullable_types_pyarrow_config(self, pa, df_full): pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels")) ) - with pd.option_context("mode.dtype_backend", "pyarrow"): - 
check_round_trip( - df, - engine=pa, - read_kwargs={"use_nullable_dtypes": True}, - expected=expected, - ) + check_round_trip( + df, + engine=pa, + read_kwargs={"dtype_backend": "pyarrow"}, + expected=expected, + ) - def test_read_use_nullable_types_pyarrow_config_index(self, pa): + def test_read_dtype_backend_pyarrow_config_index(self, pa): df = pd.DataFrame( {"a": [1, 2]}, index=pd.Index([3, 4], name="test"), dtype="int64[pyarrow]" ) expected = df.copy() - with pd.option_context("mode.dtype_backend", "pyarrow"): - check_round_trip( - df, - engine=pa, - read_kwargs={"use_nullable_dtypes": True}, - expected=expected, - ) + check_round_trip( + df, + engine=pa, + read_kwargs={"dtype_backend": True}, + expected=expected, + ) class TestParquetFastParquet(Base): @@ -1213,7 +1188,10 @@ def test_use_nullable_dtypes_not_supported(self, fp): with tm.ensure_clean() as path: df.to_parquet(path) with pytest.raises(ValueError, match="not supported for the fastparquet"): - read_parquet(path, engine="fastparquet", use_nullable_dtypes=True) + with tm.assert_produces_warning(FutureWarning): + read_parquet(path, engine="fastparquet", use_nullable_dtypes=True) + with pytest.raises(ValueError, match="not supported for the fastparquet"): + read_parquet(path, engine="fastparquet", dtype_backend="pyarrow") def test_close_file_handle_on_read_error(self): with tm.ensure_clean("test.parquet") as path: diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 7b19d2dafb34e..fe414f6c3d52c 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -82,12 +82,11 @@ def test_spss_usecols(datapath): pd.read_spss(fname, usecols="VAR00002") -def test_spss_umlauts_use_nullable_dtypes(datapath, dtype_backend): +def test_spss_umlauts_dtype_backend(datapath, dtype_backend): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "spss", "umlauts.sav") - with pd.option_context("mode.dtype_backend", dtype_backend): - df = 
pd.read_spss(fname, convert_categoricals=False, use_nullable_dtypes=True) + df = pd.read_spss(fname, convert_categoricals=False, dtype_backend=dtype_backend) expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}, dtype="Int64") if dtype_backend == "pyarrow": diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index b6f88746d53ea..1691b6b72c40b 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -32,6 +32,7 @@ import numpy as np import pytest +from pandas._libs import lib import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( @@ -2437,75 +2438,55 @@ def test_get_engine_auto_error_message(self): pass # TODO(GH#36893) fill this in when we add more engines - @pytest.mark.parametrize("option", [True, False]) @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_nullable_dtypes( - self, string_storage, func, option, dtype_backend - ): + def test_read_sql_dtype_backend(self, string_storage, func, dtype_backend): # GH#50048 table = "test" - df = self.nullable_data() + df = self.dtype_backend_data() df.to_sql(table, self.conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - if option: - with pd.option_context("mode.nullable_dtypes", True): - result = getattr(pd, func)(f"Select * from {table}", self.conn) - else: - result = getattr(pd, func)( - f"Select * from {table}", self.conn, use_nullable_dtypes=True - ) - expected = self.nullable_expected(string_storage, dtype_backend) + result = getattr(pd, func)( + f"Select * from {table}", self.conn, dtype_backend=dtype_backend + ) + expected = self.dtype_backend_expected(string_storage, dtype_backend) tm.assert_frame_equal(result, expected) with pd.option_context("mode.string_storage", string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - iterator = getattr(pd, func)( - f"Select * from 
{table}", - self.conn, - use_nullable_dtypes=True, - chunksize=3, - ) - expected = self.nullable_expected(string_storage, dtype_backend) - for result in iterator: - tm.assert_frame_equal(result, expected) + iterator = getattr(pd, func)( + f"Select * from {table}", + self.conn, + dtype_backend=dtype_backend, + chunksize=3, + ) + expected = self.dtype_backend_expected(string_storage, dtype_backend) + for result in iterator: + tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("option", [True, False]) @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_nullable_dtypes_table( - self, string_storage, func, option, dtype_backend - ): + def test_read_sql_dtype_backend_table(self, string_storage, func, dtype_backend): # GH#50048 table = "test" - df = self.nullable_data() + df = self.dtype_backend_data() df.to_sql(table, self.conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - if option: - with pd.option_context("mode.nullable_dtypes", True): - result = getattr(pd, func)(table, self.conn) - else: - result = getattr(pd, func)( - table, self.conn, use_nullable_dtypes=True - ) - expected = self.nullable_expected(string_storage, dtype_backend) + result = getattr(pd, func)(table, self.conn, dtype_backend=dtype_backend) + expected = self.dtype_backend_expected(string_storage, dtype_backend) tm.assert_frame_equal(result, expected) with pd.option_context("mode.string_storage", string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - iterator = getattr(pd, func)( - table, - self.conn, - use_nullable_dtypes=True, - chunksize=3, - ) - expected = self.nullable_expected(string_storage, dtype_backend) - for result in iterator: - tm.assert_frame_equal(result, expected) + iterator = getattr(pd, func)( + table, + self.conn, + dtype_backend=dtype_backend, + chunksize=3, + ) + expected = 
self.dtype_backend_expected(string_storage, dtype_backend) + for result in iterator: + tm.assert_frame_equal(result, expected) - def nullable_data(self) -> DataFrame: + def dtype_backend_data(self) -> DataFrame: return DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -2519,7 +2500,7 @@ def nullable_data(self) -> DataFrame: } ) - def nullable_expected(self, storage, dtype_backend) -> DataFrame: + def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: string_array: StringArray | ArrowStringArray string_array_na: StringArray | ArrowStringArray if storage == "python": @@ -2571,9 +2552,9 @@ def test_chunksize_empty_dtypes(self): ): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("use_nullable_dtypes", [True, False]) + @pytest.mark.parametrize("dtype_backend", [lib.no_default, "numpy_nullable"]) @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_dtype(self, func, use_nullable_dtypes): + def test_read_sql_dtype(self, func, dtype_backend): # GH#50797 table = "test" df = DataFrame({"a": [1, 2, 3], "b": 5}) @@ -2583,13 +2564,14 @@ def test_read_sql_dtype(self, func, use_nullable_dtypes): f"Select * from {table}", self.conn, dtype={"a": np.float64}, - use_nullable_dtypes=use_nullable_dtypes, + dtype_backend=dtype_backend, ) expected = DataFrame( { "a": Series([1, 2, 3], dtype=np.float64), "b": Series( - [5, 5, 5], dtype="int64" if not use_nullable_dtypes else "Int64" + [5, 5, 5], + dtype="int64" if not dtype_backend == "numpy_nullable" else "Int64", ), } ) @@ -2679,9 +2661,9 @@ class Test(BaseModel): assert list(df.columns) == ["id", "string_column"] - def nullable_expected(self, storage, dtype_backend) -> DataFrame: - df = super().nullable_expected(storage, dtype_backend) - if dtype_backend == "pandas": + def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: + df = super().dtype_backend_expected(storage, dtype_backend) + if dtype_backend == "numpy_nullable": df = df.astype({"e": 
"Int64", "f": "Int64"}) else: df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) @@ -2689,7 +2671,7 @@ def nullable_expected(self, storage, dtype_backend) -> DataFrame: return df @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_nullable_dtypes_table(self, string_storage, func): + def test_read_sql_dtype_backend_table(self, string_storage, func): # GH#50048 Not supported for sqlite pass @@ -2720,9 +2702,9 @@ def setup_driver(cls): def test_default_type_conversion(self): pass - def nullable_expected(self, storage, dtype_backend) -> DataFrame: - df = super().nullable_expected(storage, dtype_backend) - if dtype_backend == "pandas": + def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: + df = super().dtype_backend_expected(storage, dtype_backend) + if dtype_backend == "numpy_nullable": df = df.astype({"e": "Int64", "f": "Int64"}) else: df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index b73116519178e..eaadcd6cee11b 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1860,8 +1860,7 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): string_array_na = ArrowStringArray(pa.array(["x", None])) with pd.option_context("mode.string_storage", string_storage): - with pd.option_context("mode.dtype_backend", dtype_backend): - result = read_xml(data, parser=parser, use_nullable_dtypes=True) + result = read_xml(data, parser=parser, dtype_backend=dtype_backend) expected = DataFrame( { @@ -1890,21 +1889,3 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): expected["g"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) - - -def test_use_nullable_dtypes_option(parser): - # GH#50748 - - data = """ - - - 1 - - - 3 - - """ - with pd.option_context("mode.nullable_dtypes", True): - result = read_xml(data, 
parser=parser) - expected = DataFrame({"a": Series([1, 3], dtype="Int64")}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 18b8dd8394133..8f87313f26171 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -9,7 +9,6 @@ DataFrame, Index, Series, - option_context, to_numeric, ) import pandas._testing as tm @@ -805,10 +804,10 @@ def test_to_numeric_large_float_not_downcast_to_float_32(val): @pytest.mark.parametrize( "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")] ) -def test_to_numeric_use_nullable_dtypes(val, dtype): +def test_to_numeric_dtype_backend(val, dtype): # GH#50505 ser = Series([val], dtype=object) - result = to_numeric(ser, use_nullable_dtypes=True) + result = to_numeric(ser, dtype_backend="numpy_nullable") expected = Series([val], dtype=dtype) tm.assert_series_equal(result, expected) @@ -824,7 +823,7 @@ def test_to_numeric_use_nullable_dtypes(val, dtype): (True, "bool[pyarrow]"), ], ) -def test_to_numeric_use_nullable_dtypes_na(val, dtype): +def test_to_numeric_dtype_backend_na(val, dtype): # GH#50505 if "pyarrow" in dtype: pytest.importorskip("pyarrow") @@ -832,21 +831,11 @@ def test_to_numeric_use_nullable_dtypes_na(val, dtype): else: dtype_backend = "pandas" ser = Series([val, None], dtype=object) - with option_context("mode.dtype_backend", dtype_backend): - result = to_numeric(ser, use_nullable_dtypes=True) + result = to_numeric(ser, dtype_backend=dtype_backend) expected = Series([val, pd.NA], dtype=dtype) tm.assert_series_equal(result, expected) -def test_to_numeric_use_nullable_dtypes_option(): - # GH#50748 - ser = Series([1, None], dtype=object) - with option_context("mode.nullable_dtypes", True): - result = to_numeric(ser) - expected = Series([1, pd.NA], dtype="Int64") - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( "val, dtype, downcast", [ @@ -858,16 +847,15 @@ def 
test_to_numeric_use_nullable_dtypes_option(): (1, "int8[pyarrow]", "signed"), ], ) -def test_to_numeric_use_nullable_dtypes_downcasting(val, dtype, downcast): +def test_to_numeric_dtype_backend_downcasting(val, dtype, downcast): # GH#50505 if "pyarrow" in dtype: pytest.importorskip("pyarrow") dtype_backend = "pyarrow" else: - dtype_backend = "pandas" + dtype_backend = "numpy_nullable" ser = Series([val, None], dtype=object) - with option_context("mode.dtype_backend", dtype_backend): - result = to_numeric(ser, use_nullable_dtypes=True, downcast=downcast) + result = to_numeric(ser, dtype_backend=dtype_backend, downcast=downcast) expected = Series([val, pd.NA], dtype=dtype) tm.assert_series_equal(result, expected) @@ -875,13 +863,12 @@ def test_to_numeric_use_nullable_dtypes_downcasting(val, dtype, downcast): @pytest.mark.parametrize( "smaller, dtype_backend", [["UInt8", "pandas"], ["uint8[pyarrow]", "pyarrow"]] ) -def test_to_numeric_use_nullable_dtypes_downcasting_uint(smaller, dtype_backend): +def test_to_numeric_dtype_backend_downcasting_uint(smaller, dtype_backend): # GH#50505 if dtype_backend == "pyarrow": pytest.importorskip("pyarrow") ser = Series([1, pd.NA], dtype="UInt64") - with option_context("mode.dtype_backend", dtype_backend): - result = to_numeric(ser, use_nullable_dtypes=True, downcast="unsigned") + result = to_numeric(ser, dtype_backend=dtype_backend, downcast="unsigned") expected = Series([1, pd.NA], dtype=smaller) tm.assert_series_equal(result, expected) @@ -899,40 +886,30 @@ def test_to_numeric_use_nullable_dtypes_downcasting_uint(smaller, dtype_backend) "bool[pyarrow]", ], ) -def test_to_numeric_use_nullable_dtypes_already_nullable(dtype): +def test_to_numeric_dtype_backend_already_nullable(dtype): # GH#50505 if "pyarrow" in dtype: pytest.importorskip("pyarrow") ser = Series([1, pd.NA], dtype=dtype) - result = to_numeric(ser, use_nullable_dtypes=True) + result = to_numeric(ser, dtype_backend="numpy_nullable") expected = Series([1, pd.NA], 
dtype=dtype) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "use_nullable_dtypes, dtype", [(True, "Float64"), (False, "float64")] -) -def test_to_numeric_use_nullable_dtypes_error( - use_nullable_dtypes, dtype, dtype_backend -): +def test_to_numeric_dtype_backend_error(dtype_backend): # GH#50505 ser = Series(["a", "b", ""]) expected = ser.copy() with pytest.raises(ValueError, match="Unable to parse string"): - with option_context("mode.dtype_backend", dtype_backend): - to_numeric(ser, use_nullable_dtypes=use_nullable_dtypes) + to_numeric(ser, dtype_backend=dtype_backend) - with option_context("mode.dtype_backend", dtype_backend): - result = to_numeric( - ser, use_nullable_dtypes=use_nullable_dtypes, errors="ignore" - ) + result = to_numeric(ser, dtype_backend=dtype_backend, errors="ignore") tm.assert_series_equal(result, expected) - with option_context("mode.dtype_backend", dtype_backend): - result = to_numeric( - ser, use_nullable_dtypes=use_nullable_dtypes, errors="coerce" - ) - if use_nullable_dtypes and dtype_backend == "pyarrow": + result = to_numeric(ser, dtype_backend=dtype_backend, errors="coerce") + if dtype_backend == "pyarrow": dtype = "double[pyarrow]" + else: + dtype = "Float64" expected = Series([np.nan, np.nan, np.nan], dtype=dtype) tm.assert_series_equal(result, expected) From d054ae4a54166c27d67c5826746e7624030ade6e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Mar 2023 00:41:19 +0100 Subject: [PATCH 02/18] Add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index ecea272b8a3f5..0e44a46dc19b9 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -795,6 +795,7 @@ Deprecations - Deprecated :meth:`Grouper.obj`, use :meth:`Groupby.obj` instead (:issue:`51206`) - Deprecated :meth:`Grouper.indexer`, use :meth:`Resampler.indexer` instead (:issue:`51206`) - Deprecated 
:meth:`Grouper.ax`, use :meth:`Resampler.ax` instead (:issue:`51206`) +- Deprecated keyword ``use_nullable_dtypes`` in :func:`read_parquet`, use ``dtype_backend`` instead (:issue:`51853`) - Deprecated :meth:`Series.pad` in favor of :meth:`Series.ffill` (:issue:`33396`) - Deprecated :meth:`Series.backfill` in favor of :meth:`Series.bfill` (:issue:`33396`) - Deprecated :meth:`DataFrame.pad` in favor of :meth:`DataFrame.ffill` (:issue:`33396`) From 0298f60bf8210a39f72239fab524369d05d5fe5b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Mar 2023 01:22:36 +0100 Subject: [PATCH 03/18] Fix --- pandas/io/sql.py | 2 +- pandas/tests/io/test_sql.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 5950f080e84a3..8678ec83b61be 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1217,7 +1217,7 @@ def _harmonize_columns( # floats support NA, can always convert! self.frame[col_name] = df_col.astype(col_type, copy=False) - elif not dtype_backend == "numpy" and len(df_col) == df_col.count(): + elif dtype_backend == "numpy" and len(df_col) == df_col.count(): # No NA values, can convert ints and bools if col_type is np.dtype("int64") or col_type is bool: self.frame[col_name] = df_col.astype(col_type, copy=False) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1691b6b72c40b..7c700df16042c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2191,6 +2191,7 @@ def test_get_schema_create_table(self, test_frame3): tbl = "test_get_schema_create_table" create_sql = sql.get_schema(test_frame3, tbl, con=self.conn) blank_test_df = test_frame3.iloc[:0] + print(blank_test_df.dtypes) self.drop_table(tbl, self.conn) create_sql = text(create_sql) From 111b42a7e32a95d76bfbfd91cdb4cef431e61683 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Mar 2023 01:22:48 +0100 Subject: [PATCH 04/18] Fix --- pandas/tests/io/test_sql.py | 1 - 1 file changed, 1 deletion(-) 
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 7c700df16042c..1691b6b72c40b 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2191,7 +2191,6 @@ def test_get_schema_create_table(self, test_frame3): tbl = "test_get_schema_create_table" create_sql = sql.get_schema(test_frame3, tbl, con=self.conn) blank_test_df = test_frame3.iloc[:0] - print(blank_test_df.dtypes) self.drop_table(tbl, self.conn) create_sql = text(create_sql) From 8e112b121580cc37ed4280050f8721635855983d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 9 Mar 2023 21:24:38 +0100 Subject: [PATCH 05/18] Update doc/source/whatsnew/v2.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 0e44a46dc19b9..b0c72192811c8 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -127,7 +127,7 @@ The following functions gained a new keyword ``dtype_backend`` (:issue:`36712`) * :meth:`DataFrame.convert_dtypes` * :meth:`Series.convert_dtypes` -When this option is set to ``numpy_nullable`` it will return a :class:`DataFrame` that is +When this option is set to ``"numpy_nullable"`` it will return a :class:`DataFrame` that is backed by nullable dtypes. 
When this keyword is set to ``pyarrow``, then these functions will return pyarrow-backed nullable :class:`ArrowDtype` DataFrames (:issue:`48957`, :issue:`49997`): From 998b8074808c5097a6db925d974338389273f313 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 9 Mar 2023 21:24:46 +0100 Subject: [PATCH 06/18] Update doc/source/whatsnew/v2.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b0c72192811c8..a8277a4336a2b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -130,7 +130,7 @@ The following functions gained a new keyword ``dtype_backend`` (:issue:`36712`) When this option is set to ``"numpy_nullable"`` it will return a :class:`DataFrame` that is backed by nullable dtypes. -When this keyword is set to ``pyarrow``, then these functions will return pyarrow-backed nullable :class:`ArrowDtype` DataFrames (:issue:`48957`, :issue:`49997`): +When this keyword is set to ``"pyarrow"``, then these functions will return pyarrow-backed nullable :class:`ArrowDtype` DataFrames (:issue:`48957`, :issue:`49997`): * :func:`read_csv` * :func:`read_clipboard` From 0858a277a341f779a074737ca1e550f399b30722 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Mar 2023 21:26:05 +0100 Subject: [PATCH 07/18] Adjust message --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 04a83c94ac737..ceaaa2888f773 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -514,7 +514,7 @@ def read_parquet( if use_nullable_dtypes is not lib.no_default: warnings.warn( "The argument 'use_nullable_dtypes' is deprecated. 
Use " - "dtype_backend='numpy_nullable' instead.", + "dtype_backend='numpy_nullable' instead of use_nullable_dtype=True.", FutureWarning, stacklevel=find_stack_level(), ) From fb42c2ec1cd8a54fe845dfd56b73cb220a693583 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Mar 2023 22:43:01 +0100 Subject: [PATCH 08/18] Refactor message --- pandas/io/parquet.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index ceaaa2888f773..7bc9f6f26fa8a 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -512,12 +512,15 @@ def read_parquet( """ impl = get_engine(engine) if use_nullable_dtypes is not lib.no_default: - warnings.warn( - "The argument 'use_nullable_dtypes' is deprecated. Use " - "dtype_backend='numpy_nullable' instead of use_nullable_dtype=True.", - FutureWarning, - stacklevel=find_stack_level(), + msg = ( + "The argument 'use_nullable_dtypes' is deprecated and will be removed " + "in a future version." ) + if use_nullable_dtypes is True: + msg += ( + "Use dtype_backend='numpy_nullable' instead of use_nullable_dtype=True." 
+ ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) else: use_nullable_dtypes = False From 04ca96812df357146283b059c1d5d35715e055ec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Mar 2023 23:15:01 +0100 Subject: [PATCH 09/18] ERR: Check that dtype_backend is valid --- pandas/core/generic.py | 2 ++ pandas/core/tools/numeric.py | 3 +++ pandas/io/clipboards.py | 3 +++ pandas/io/excel/_base.py | 3 +++ pandas/io/feather_format.py | 3 +++ pandas/io/html.py | 2 ++ pandas/io/json/_json.py | 3 +++ pandas/io/orc.py | 3 +++ pandas/io/parquet.py | 3 +++ pandas/io/parsers/readers.py | 5 +++++ pandas/io/spss.py | 2 ++ pandas/io/sql.py | 4 ++++ pandas/io/xml.py | 2 ++ pandas/tests/frame/methods/test_convert_dtypes.py | 10 ++++++++++ pandas/tests/io/json/test_pandas.py | 8 ++++++++ pandas/tests/io/parser/test_read_fwf.py | 9 +++++++++ pandas/tests/io/parser/test_unsupported.py | 10 ++++++++++ pandas/tests/io/test_clipboard.py | 8 ++++++++ pandas/tests/io/test_feather.py | 11 +++++++++++ pandas/tests/io/test_html.py | 8 ++++++++ pandas/tests/io/test_orc.py | 12 ++++++++++++ pandas/tests/io/test_parquet.py | 11 +++++++++++ pandas/tests/io/test_spss.py | 9 +++++++++ pandas/tests/io/test_sql.py | 13 +++++++++++++ pandas/tests/io/xml/test_xml.py | 9 +++++++++ pandas/tests/tools/test_to_numeric.py | 10 ++++++++++ pandas/util/_validators.py | 11 +++++++++++ 27 files changed, 177 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c191ccd122e09..86752cb214b60 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -94,6 +94,7 @@ from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( + check_dtype_backend, validate_ascending, validate_bool_kwarg, validate_fillna_kwargs, @@ -6686,6 +6687,7 @@ def convert_dtypes( 2 dtype: string """ + check_dtype_backend(dtype_backend) if self.ndim == 1: return self._convert_dtypes( infer_objects, diff --git 
a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 38034436d579d..bd3ab7692e01a 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -8,6 +8,7 @@ import numpy as np from pandas._libs import lib +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric from pandas.core.dtypes.common import ( @@ -162,6 +163,8 @@ def to_numeric( if errors not in ("ignore", "raise", "coerce"): raise ValueError("invalid error value specified") + check_dtype_backend(dtype_backend) + is_series = False is_index = False is_scalars = False diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 15693b11f0b46..308d25b2be05f 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -7,6 +7,7 @@ from pandas._libs import lib from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.generic import ABCDataFrame @@ -54,6 +55,8 @@ def read_clipboard( if encoding is not None and encoding.lower().replace("-", "") != "utf8": raise NotImplementedError("reading from clipboard only supports utf-8 encoding") + check_dtype_backend(dtype_backend) + from pandas.io.clipboard import clipboard_get from pandas.io.parsers import read_csv diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 1619a678ecfc8..f7b4d89f64c79 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -36,6 +36,7 @@ Appender, doc, ) +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( is_bool, @@ -468,6 +469,8 @@ def read_excel( storage_options: StorageOptions = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | dict[IntStrT, DataFrame]: + check_dtype_backend(dtype_backend) + should_close = False if not isinstance(io, ExcelFile): should_close = True diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 
0295ade472962..e8a8a30839bb3 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -10,6 +10,7 @@ from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc +from pandas.util._validators import check_dtype_backend import pandas as pd from pandas.core.api import DataFrame @@ -101,6 +102,8 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + check_dtype_backend(dtype_backend) + with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: diff --git a/pandas/io/html.py b/pandas/io/html.py index 15604101f2121..fdbe6e7091e25 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -24,6 +24,7 @@ AbstractMethodError, EmptyDataError, ) +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -1166,6 +1167,7 @@ def read_html( f'"{extract_links}"' ) validate_header_arg(header) + check_dtype_backend(dtype_backend) io = stringify_path(io) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 91f150be9776c..e87031dc550c9 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -29,6 +29,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( ensure_str, @@ -743,6 +744,8 @@ def read_json( if orient == "table" and convert_axes: raise ValueError("cannot pass both convert_axes and orient='table'") + check_dtype_backend(dtype_backend) + if dtype is None and orient != "table": # error: Incompatible types in assignment (expression has type "bool", variable # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], diff --git a/pandas/io/orc.py b/pandas/io/orc.py index a4ac9fe9d3042..9bc519abe8a0e 100644 --- a/pandas/io/orc.py +++ 
b/pandas/io/orc.py @@ -12,6 +12,7 @@ from pandas._libs import lib from pandas.compat import pa_version_under8p0 from pandas.compat._optional import import_optional_dependency +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -93,6 +94,8 @@ def read_orc( orc = import_optional_dependency("pyarrow.orc") + check_dtype_backend(dtype_backend) + with get_handle(path, "rb", is_text=False) as handles: source = handles.handle if is_fsspec_url(path) and filesystem is None: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 7bc9f6f26fa8a..bbeaaf187bbab 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -16,6 +16,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend import pandas as pd from pandas import ( @@ -524,6 +525,8 @@ def read_parquet( else: use_nullable_dtypes = False + check_dtype_backend(dtype_backend) + return impl.read( path, columns=columns, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 88ff826d2bf81..7f4e0e8dd0a60 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -32,6 +32,7 @@ ) from pandas.util._decorators import Appender from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( is_file_like, @@ -1341,6 +1342,8 @@ def read_fwf( kwds["colspecs"] = colspecs kwds["infer_nrows"] = infer_nrows kwds["engine"] = "python-fwf" + + check_dtype_backend(dtype_backend) kwds["dtype_backend"] = dtype_backend return _read(filepath_or_buffer, kwds) @@ -1994,6 +1997,8 @@ def _refine_defaults_read( else: raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + check_dtype_backend(dtype_backend) + kwds["dtype_backend"] = dtype_backend return kwds diff --git 
a/pandas/io/spss.py b/pandas/io/spss.py index a80e4497999e6..d2dcf6419c8f9 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -7,6 +7,7 @@ from pandas._libs import lib from pandas.compat._optional import import_optional_dependency +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.inference import is_list_like @@ -48,6 +49,7 @@ def read_spss( DataFrame """ pyreadstat = import_optional_dependency("pyreadstat") + check_dtype_backend(dtype_backend) if usecols is not None: if not is_list_like(usecols): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 8678ec83b61be..c17a6c085e4e9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -39,6 +39,7 @@ DatabaseError, ) from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -323,6 +324,7 @@ def read_sql_table( >>> pd.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP """ + check_dtype_backend(dtype_backend) if dtype_backend is lib.no_default: dtype_backend = "numpy" # type: ignore[assignment] @@ -450,6 +452,7 @@ def read_sql_query( parameter will be converted to UTC. 
""" + check_dtype_backend(dtype_backend) if dtype_backend is lib.no_default: dtype_backend = "numpy" # type: ignore[assignment] @@ -610,6 +613,7 @@ def read_sql( 1 1 2010-11-12 """ + check_dtype_backend(dtype_backend) if dtype_backend is lib.no_default: dtype_backend = "numpy" # type: ignore[assignment] diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 03b31250fd6cb..5fc9498e16d97 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -19,6 +19,7 @@ ParserError, ) from pandas.util._decorators import doc +from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -1108,6 +1109,7 @@ def read_xml( 1 circle 360 NaN 2 triangle 180 3.0 """ + check_dtype_backend(dtype_backend) return _parse( path_or_buffer=path_or_buffer, diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index ad3be3d4014a7..3fda3ae36f923 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -124,3 +124,13 @@ def test_pyarrow_dtype_empty_object(self): expected = pd.DataFrame(columns=[0]) result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_frame_equal(result, expected) + + def test_pyarrow_engine_lines_false(self): + # GH 48893 + df = pd.DataFrame({"a": [1, 2, 3]}) + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + with pytest.raises(ValueError, match=msg): + df.convert_dtypes(dtype_backend="numpy") diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index fde62eb7a91a5..edd07f40487f8 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1944,6 +1944,14 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): tm.assert_series_equal(result, expected) + def test_invalid_dtype_backend(self): + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_json("test", dtype_backend="numpy") + def test_invalid_engine(): # GH 48893 diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index fe2de5355a6be..28e73df7abe17 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -1001,3 +1001,12 @@ def test_dtype_backend(string_storage, dtype_backend): expected["i"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_backend(): + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_fwf("test", dtype_backend="numpy") diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 185dc733df3c2..75028f628f7f9 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -200,3 +200,13 @@ def test_invalid_file_inputs(request, all_parsers): with pytest.raises(ValueError, match="Invalid"): parser.read_csv([]) + + +def test_invalid_dtype_backend(all_parsers): + parser = all_parsers + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + with pytest.raises(ValueError, match=msg): + parser.read_csv("test", dtype_backend="numpy") diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 94f073b1abb86..599303cbbc55b 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -467,3 +467,11 @@ def test_read_clipboard_dtype_backend( expected["g"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + def test_invalid_dtype_backend(self): + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_clipboard(dtype_backend="numpy") diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 203472b0d0953..69b24f11a1b03 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -212,3 +212,14 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): def test_int_columns_and_index(self): df = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index([3, 4, 5], name="test")) self.check_round_trip(df) + + def test_invalid_dtype_backend(self): + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + df = pd.DataFrame({"int": list(range(1, 4))}) + with tm.ensure_clean("tmp.feather") as path: + df.to_feather(path) + with pytest.raises(ValueError, match=msg): + read_feather(path, dtype_backend="numpy") diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 1595fa86567c9..18c8d061aeace 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1465,3 +1465,11 @@ def test_extract_links_all_no_header(self): result = self.read_html(data, extract_links="all")[0] expected = DataFrame([[("Google.com", "https://google.com")]]) tm.assert_frame_equal(result, expected) + + def test_invalid_dtype_backend(self): + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_html("test", dtype_backend="numpy") diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 35df047915255..31e8903a61043 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -391,3 +391,15 @@ def test_orc_uri_path(): uri = pathlib.Path(path).as_uri() result = read_orc(uri) tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_backend(): + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + df = pd.DataFrame({"int": list(range(1, 4))}) + with tm.ensure_clean("tmp.orc") as path: + df.to_orc(path) + with pytest.raises(ValueError, match=msg): + read_orc(path, dtype_backend="numpy") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2276dd0136e87..b7f34371b7071 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1210,3 +1210,14 @@ def test_bytes_file_name(self, engine): result = read_parquet(path, engine=engine) tm.assert_frame_equal(result, df) + + def test_invalid_dtype_backend(self): + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + df = pd.DataFrame({"int": list(range(1, 4))}) + with tm.ensure_clean("tmp.parquet") as path: + df.to_parquet(path) + with pytest.raises(ValueError, match=msg): + read_parquet(path, dtype_backend="numpy") diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index fe414f6c3d52c..05b23889251fb 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -102,3 +102,12 @@ def test_spss_umlauts_dtype_backend(datapath, dtype_backend): ) tm.assert_frame_equal(df, expected) + + +def test_invalid_dtype_backend(): + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + pd.read_spss("test", dtype_backend="numpy") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1691b6b72c40b..63facca718260 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2486,6 +2486,19 @@ def test_read_sql_dtype_backend_table(self, string_storage, func, dtype_backend) for result in iterator: tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"]) + def test_read_sql_invalid_dtype_backend_table(self, func): + table = "test" + df = self.dtype_backend_data() + df.to_sql(table, self.conn, index=False, if_exists="replace") + + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + with pytest.raises(ValueError, match=msg): + getattr(pd, func)(table, self.conn, dtype_backend="numpy") + def dtype_backend_data(self) -> DataFrame: return DataFrame( { diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index eaadcd6cee11b..bf62ba7580087 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1889,3 +1889,12 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): expected["g"] = ArrowExtensionArray(pa.array([None, None])) tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_backend(): + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + read_xml("test", dtype_backend="numpy") diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 8f87313f26171..4e6cc8863ca02 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -913,3 +913,13 @@ def test_to_numeric_dtype_backend_error(dtype_backend): dtype = "Float64" expected = Series([np.nan, np.nan, np.nan], dtype=dtype) tm.assert_series_equal(result, expected) + + +def test_invalid_dtype_backend(): + ser = Series([1, 2, 3]) + msg = ( + "dtype_backend numpy invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + with pytest.raises(ValueError, match=msg): + to_numeric(ser, dtype_backend="numpy") diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 7b1eca695c6d6..248bf59f6171d 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -13,6 +13,8 @@ import numpy as np +from pandas._libs import lib + from pandas.core.dtypes.common import ( is_bool, is_integer, @@ -437,3 +439,12 @@ def validate_insert_loc(loc: int, length: int) -> int: if not 0 <= loc <= length: raise IndexError(f"loc must be an integer between -{length} and {length}") return loc + + +def check_dtype_backend(dtype_backend) -> None: + if dtype_backend is not lib.no_default: + if dtype_backend not in ["numpy_nullable", "pyarrow"]: + raise ValueError( + f"dtype_backend {dtype_backend} invalid, only 'numpy_nullable' and " + f"'pyarrow' are allowed.", + ) From d90989f420e54be4f61b13c0cfbdc5cb6701edff Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Mar 2023 23:54:48 +0100 Subject: [PATCH 10/18] Fix --- pandas/tests/io/test_parquet.py | 2 +- pandas/tests/tools/test_to_numeric.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2276dd0136e87..af35b50ed50d8 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1037,7 +1037,7 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): check_round_trip( df, engine=pa, - read_kwargs={"dtype_backend": True}, + read_kwargs={"dtype_backend": "pyarrow"}, expected=expected, ) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 8f87313f26171..07569aa21dbe2 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -829,7 +829,7 @@ def test_to_numeric_dtype_backend_na(val, dtype): pytest.importorskip("pyarrow") dtype_backend = "pyarrow" else: - dtype_backend = "pandas" + dtype_backend = "numpy_nullable" ser = 
Series([val, None], dtype=object) result = to_numeric(ser, dtype_backend=dtype_backend) expected = Series([val, pd.NA], dtype=dtype) @@ -861,7 +861,8 @@ def test_to_numeric_dtype_backend_downcasting(val, dtype, downcast): @pytest.mark.parametrize( - "smaller, dtype_backend", [["UInt8", "pandas"], ["uint8[pyarrow]", "pyarrow"]] + "smaller, dtype_backend", + [["UInt8", "numpy_nullable"], ["uint8[pyarrow]", "pyarrow"]], ) def test_to_numeric_dtype_backend_downcasting_uint(smaller, dtype_backend): # GH#50505 From f251c8edf1b13db051c95225c9593c7c837a13e4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Mar 2023 18:38:13 +0000 Subject: [PATCH 11/18] Update --- pandas/core/internals/construction.py | 2 +- pandas/io/json/_json.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 4458c1dc09d41..b114b8a1aa7aa 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -980,7 +980,7 @@ def convert_object_array( ---------- content: List[np.ndarray] dtype: np.dtype or ExtensionDtype - dtype_backend: Controls if nullable dtypes are returned. + dtype_backend: Controls if nullable/pyarrow dtypes are returned. coerce_float: Cast floats that are integers to int. 
Returns diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 81786c4e04885..37a107d79c578 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -950,14 +950,18 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) + + mapping: dict | type[ArrowDtype] | None if self.dtype_backend == "pyarrow": - return pa_table.to_pandas(types_mapper=ArrowDtype) + mapping = ArrowDtype elif self.dtype_backend == "numpy_nullable": from pandas.io._util import _arrow_dtype_mapping mapping = _arrow_dtype_mapping() - return pa_table.to_pandas(types_mapper=mapping.get) - return pa_table.to_pandas() + else: + mapping = None + + return pa_table.to_pandas(types_mapper=mapping) elif self.engine == "ujson": if self.lines: if self.chunksize: From 6b83aa096d46ed1a63aa3b00b4ccb58fe81c1205 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Mar 2023 18:52:59 +0000 Subject: [PATCH 12/18] Add get --- pandas/io/json/_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 37a107d79c578..d6a1257e5a246 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -957,7 +957,7 @@ def read(self) -> DataFrame | Series: elif self.dtype_backend == "numpy_nullable": from pandas.io._util import _arrow_dtype_mapping - mapping = _arrow_dtype_mapping() + mapping = _arrow_dtype_mapping().get else: mapping = None From 2eb5c88bce66417518e21a344a0fa65c7bc2efbc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 13 Mar 2023 18:53:39 +0000 Subject: [PATCH 13/18] Update pandas/util/_validators.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/util/_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 
248bf59f6171d..17ef583febc24 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -445,6 +445,6 @@ def check_dtype_backend(dtype_backend) -> None: if dtype_backend is not lib.no_default: if dtype_backend not in ["numpy_nullable", "pyarrow"]: raise ValueError( - f"dtype_backend {dtype_backend} invalid, only 'numpy_nullable' and " + f"dtype_backend {dtype_backend} is invalid, only 'numpy_nullable' and " f"'pyarrow' are allowed.", ) From 2e26467efd088bbf30598504eb7637c70cdc720f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Mar 2023 19:55:36 +0000 Subject: [PATCH 14/18] Fix error message check --- pandas/tests/frame/methods/test_convert_dtypes.py | 2 +- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/io/parser/test_read_fwf.py | 2 +- pandas/tests/io/parser/test_unsupported.py | 2 +- pandas/tests/io/test_clipboard.py | 2 +- pandas/tests/io/test_feather.py | 2 +- pandas/tests/io/test_html.py | 2 +- pandas/tests/io/test_orc.py | 2 +- pandas/tests/io/test_parquet.py | 2 +- pandas/tests/io/test_spss.py | 2 +- pandas/tests/io/test_sql.py | 2 +- pandas/tests/io/xml/test_xml.py | 2 +- pandas/tests/tools/test_to_numeric.py | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 3fda3ae36f923..6076933eecec4 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -129,7 +129,7 @@ def test_pyarrow_engine_lines_false(self): # GH 48893 df = pd.DataFrame({"a": [1, 2, 3]}) msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." 
) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index edd07f40487f8..08308ebd2f1cf 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1946,7 +1946,7 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): def test_invalid_dtype_backend(self): msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 28e73df7abe17..d166946704e13 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -1005,7 +1005,7 @@ def test_dtype_backend(string_storage, dtype_backend): def test_invalid_dtype_backend(): msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 75028f628f7f9..1a9d99b0b5c1f 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -205,7 +205,7 @@ def test_invalid_file_inputs(request, all_parsers): def test_invalid_dtype_backend(all_parsers): parser = all_parsers msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." 
) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 599303cbbc55b..baf2bcdc9386f 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -470,7 +470,7 @@ def test_read_clipboard_dtype_backend( def test_invalid_dtype_backend(self): msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 69b24f11a1b03..c5bd8341e1a54 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -215,7 +215,7 @@ def test_int_columns_and_index(self): def test_invalid_dtype_backend(self): msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) df = pd.DataFrame({"int": list(range(1, 4))}) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 18c8d061aeace..03f1bcb13d077 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1468,7 +1468,7 @@ def test_extract_links_all_no_header(self): def test_invalid_dtype_backend(self): msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 31e8903a61043..1dc3f3156e362 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -395,7 +395,7 @@ def test_orc_uri_path(): def test_invalid_dtype_backend(): msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." 
) df = pd.DataFrame({"int": list(range(1, 4))}) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d0291fe1bbb5f..a1fef45f59838 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1213,7 +1213,7 @@ def test_bytes_file_name(self, engine): def test_invalid_dtype_backend(self): msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) df = pd.DataFrame({"int": list(range(1, 4))}) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 05b23889251fb..9e1f6cf7cd8d4 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -106,7 +106,7 @@ def test_spss_umlauts_dtype_backend(datapath, dtype_backend): def test_invalid_dtype_backend(): msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 63facca718260..dc51a5b0a77fb 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2493,7 +2493,7 @@ def test_read_sql_invalid_dtype_backend_table(self, func): df.to_sql(table, self.conn, index=False, if_exists="replace") msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." 
) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index bf62ba7580087..a53e5f247c73a 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1893,7 +1893,7 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): def test_invalid_dtype_backend(): msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 14a79940f0ba6..4a0b01a275523 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -919,7 +919,7 @@ def test_to_numeric_dtype_backend_error(dtype_backend): def test_invalid_dtype_backend(): ser = Series([1, 2, 3]) msg = ( - "dtype_backend numpy invalid, only 'numpy_nullable' and " + "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) with pytest.raises(ValueError, match=msg): From 2186e5ba17d060bd51db2086277c7bcf603ccf25 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Mar 2023 23:30:19 +0100 Subject: [PATCH 15/18] Update docstring --- pandas/core/tools/numeric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 62976f68cbdd4..b9555b67f441f 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -88,9 +88,9 @@ def to_numeric( the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. - dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames - Which dtype_backend to use, e.g. 
whether a DataFrame should have NumPy - arrays, nullable dtypes are used for all dtypes that have a nullable + dtype_backend : {"numpy_nullable", "pyarrow"}, default "numpy_nullable" + Which dtype_backend to use, e.g. whether a DataFrame should have nullable + dtypes that are used for all dtypes that have a nullable implementation when "numpy_nullable" is set, pyarrow is used for all dtypes if "pyarrow" is set. From 612df732713f9c06f6d34aa65a00c07f6ac62af7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Mar 2023 23:31:22 +0100 Subject: [PATCH 16/18] Update docstring --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8b431dbf1bcd7..1bedf07103086 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6591,8 +6591,8 @@ def convert_dtypes( .. versionadded:: 1.2.0 dtype_backend : {"numpy_nullable", "pyarrow"}, default "numpy_nullable" - Which dtype_backend to use, e.g. whether a DataFrame should have NumPy - arrays, nullable dtypes are used for all dtypes that have a nullable + Which dtype_backend to use, e.g. whether a DataFrame should use nullable + dtypes for all dtypes that have a nullable implementation when "numpy_nullable" is set, pyarrow is used for all dtypes if "pyarrow" is set. From 40c4fb5c43d3e6a9cbbbc314f1cfd57e16772eaa Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Mar 2023 23:31:44 +0100 Subject: [PATCH 17/18] Revert "Update docstring" This reverts commit 2186e5ba17d060bd51db2086277c7bcf603ccf25. 
--- pandas/core/tools/numeric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index b9555b67f441f..62976f68cbdd4 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -88,9 +88,9 @@ def to_numeric( the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. - dtype_backend : {"numpy_nullable", "pyarrow"}, default "numpy_nullable" - Which dtype_backend to use, e.g. whether a DataFrame should have nullable - dtypes that are used for all dtypes that have a nullable + dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable dtypes are used for all dtypes that have a nullable implementation when "numpy_nullable" is set, pyarrow is used for all dtypes if "pyarrow" is set. From 73592bec51032e3fa817be889487ebc5ac61c788 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 14 Mar 2023 11:54:06 +0100 Subject: [PATCH 18/18] Fix mypy and skip when no pyarrow --- pandas/io/json/_json.py | 2 +- pandas/tests/io/test_parquet.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index d6a1257e5a246..588ec639bc2fd 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -951,7 +951,7 @@ def read(self) -> DataFrame | Series: pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) - mapping: dict | type[ArrowDtype] | None + mapping: type[ArrowDtype] | None | Callable if self.dtype_backend == "pyarrow": mapping = ArrowDtype elif self.dtype_backend == "numpy_nullable": diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a1fef45f59838..1548208c7eeaa 100644 --- a/pandas/tests/io/test_parquet.py +++ 
b/pandas/tests/io/test_parquet.py @@ -1211,7 +1211,7 @@ def test_bytes_file_name(self, engine): result = read_parquet(path, engine=engine) tm.assert_frame_equal(result, df) - def test_invalid_dtype_backend(self): + def test_invalid_dtype_backend(self, engine): msg = ( "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed."