Skip to content

ENH: Add global nullable option #50748

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jan 24, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
* :func:`read_feather`
* :func:`to_numeric`

To simplify opting in globally, a new option ``nullable_dtypes`` was added that allows setting
the keyword argument to ``True`` globally when it is not specified directly. The option can be
enabled through:

.. ipython:: python

    pd.options.mode.nullable_dtypes = True

The option only takes effect in functions that implement the keyword ``use_nullable_dtypes``.

Additionally, a new global configuration, ``mode.dtype_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
to select the nullable dtypes implementation.

Expand Down
5 changes: 5 additions & 0 deletions pandas/_config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,8 @@
def using_copy_on_write():
    """
    Report whether the ``copy_on_write`` mode option is active.

    Reads the ``"mode"`` section of the global configuration. The
    ``copy_on_write`` value is returned unchanged when it is falsy;
    otherwise the result is whether ``data_manager`` equals ``"block"``.
    """
    mode_config = _global_config["mode"]
    cow_flag = mode_config["copy_on_write"]
    if not cow_flag:
        # Short-circuit: hand back the falsy option value without looking
        # at the data manager setting.
        return cow_flag
    return mode_config["data_manager"] == "block"


def using_nullable_dtypes():
    """
    Return the current value of the ``nullable_dtypes`` mode option.

    The value is looked up in the ``"mode"`` section of the global
    configuration on every call.
    """
    return _global_config["mode"]["nullable_dtypes"]
16 changes: 16 additions & 0 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,22 @@ def use_inf_as_na_cb(key) -> None:
validator=is_one_of_factory(["pandas", "pyarrow"]),
)


# Documentation text handed to ``cf.register_option`` for the
# ``nullable_dtypes`` option registered below.
nullable_dtypes_doc = """
: bool
If nullable dtypes should be returned. This is only applicable to functions
where ``use_nullable_dtypes`` is implemented.
"""

# Register ``nullable_dtypes`` under the "mode" prefix, passing ``False`` as
# its value argument and ``is_bool`` as the validator.
with cf.config_prefix("mode"):
    cf.register_option(
        "nullable_dtypes",
        False,
        nullable_dtypes_doc,
        validator=is_bool,
    )


# Set up the io.excel specific reader configuration.
reader_engine_doc = """
: string
Expand Down
16 changes: 12 additions & 4 deletions pandas/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import numpy as np

from pandas._config import using_nullable_dtypes

from pandas._libs import lib
from pandas._typing import (
DateTimeErrorChoices,
Expand Down Expand Up @@ -36,7 +38,7 @@ def to_numeric(
arg,
errors: DateTimeErrorChoices = "raise",
downcast: Literal["integer", "signed", "unsigned", "float"] | None = None,
use_nullable_dtypes: bool = False,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
):
"""
Convert argument to a numeric type.
Expand Down Expand Up @@ -155,6 +157,12 @@ def to_numeric(
if errors not in ("ignore", "raise", "coerce"):
raise ValueError("invalid error value specified")

_use_nullable_dtypes = (
use_nullable_dtypes
if use_nullable_dtypes is not lib.no_default
else using_nullable_dtypes()
)

is_series = False
is_index = False
is_scalars = False
Expand Down Expand Up @@ -199,11 +207,11 @@ def to_numeric(
values = ensure_object(values)
coerce_numeric = errors not in ("ignore", "raise")
try:
values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload]
values, new_mask = lib.maybe_convert_numeric(
values,
set(),
coerce_numeric=coerce_numeric,
convert_to_masked_nullable=use_nullable_dtypes,
convert_to_masked_nullable=_use_nullable_dtypes,
)
except (ValueError, TypeError):
if errors == "raise":
Expand All @@ -213,7 +221,7 @@ def to_numeric(
# Remove unnecessary values, is expected later anyway and enables
# downcasting
values = values[~new_mask]
elif use_nullable_dtypes and new_mask is None:
elif _use_nullable_dtypes and new_mask is None:
new_mask = np.zeros(values.shape, dtype=np.bool_)

# attempt downcast only if the data has been successfully converted
Expand Down
13 changes: 12 additions & 1 deletion pandas/io/clipboards.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
from io import StringIO
import warnings

from pandas._config import using_nullable_dtypes

from pandas._libs import lib
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.generic import ABCDataFrame
Expand All @@ -15,7 +18,9 @@


def read_clipboard(
sep: str = r"\s+", use_nullable_dtypes: bool = False, **kwargs
sep: str = r"\s+",
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
**kwargs,
): # pragma: no cover
r"""
Read text from clipboard and pass to read_csv.
Expand Down Expand Up @@ -56,6 +61,12 @@ def read_clipboard(
if encoding is not None and encoding.lower().replace("-", "") != "utf8":
raise NotImplementedError("reading from clipboard only supports utf-8 encoding")

use_nullable_dtypes = (
use_nullable_dtypes
if use_nullable_dtypes is not lib.no_default
else using_nullable_dtypes()
)

from pandas.io.clipboard import clipboard_get
from pandas.io.parsers import read_csv

Expand Down
18 changes: 14 additions & 4 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,12 @@
)
import zipfile

from pandas._config import config
from pandas._config import (
config,
using_nullable_dtypes,
)

from pandas._libs import lib
from pandas._libs.parsers import STR_NA_VALUES
from pandas._typing import (
DtypeArg,
Expand Down Expand Up @@ -380,7 +384,7 @@ def read_excel(
comment: str | None = ...,
skipfooter: int = ...,
storage_options: StorageOptions = ...,
use_nullable_dtypes: bool = ...,
use_nullable_dtypes: bool | lib.NoDefault = ...,
) -> DataFrame:
...

Expand Down Expand Up @@ -419,7 +423,7 @@ def read_excel(
comment: str | None = ...,
skipfooter: int = ...,
storage_options: StorageOptions = ...,
use_nullable_dtypes: bool = ...,
use_nullable_dtypes: bool | lib.NoDefault = ...,
) -> dict[IntStrT, DataFrame]:
...

Expand Down Expand Up @@ -458,7 +462,7 @@ def read_excel(
comment: str | None = None,
skipfooter: int = 0,
storage_options: StorageOptions = None,
use_nullable_dtypes: bool = False,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
) -> DataFrame | dict[IntStrT, DataFrame]:

should_close = False
Expand All @@ -471,6 +475,12 @@ def read_excel(
"an ExcelFile - ExcelFile already has the engine set"
)

use_nullable_dtypes = (
use_nullable_dtypes
if use_nullable_dtypes is not lib.no_default
else using_nullable_dtypes()
)

try:
data = io.parse(
sheet_name=sheet_name,
Expand Down
11 changes: 10 additions & 1 deletion pandas/io/feather_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
Sequence,
)

from pandas._config import using_nullable_dtypes

from pandas._libs import lib
from pandas._typing import (
FilePath,
ReadBuffer,
Expand Down Expand Up @@ -103,7 +106,7 @@ def read_feather(
columns: Sequence[Hashable] | None = None,
use_threads: bool = True,
storage_options: StorageOptions = None,
use_nullable_dtypes: bool = False,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
):
"""
Load a feather-format object from the file path.
Expand Down Expand Up @@ -143,6 +146,12 @@ def read_feather(
import_optional_dependency("pyarrow")
from pyarrow import feather

use_nullable_dtypes = (
use_nullable_dtypes
if use_nullable_dtypes is not lib.no_default
else using_nullable_dtypes()
)

with get_handle(
path, "rb", storage_options=storage_options, is_text=False
) as handles:
Expand Down
11 changes: 10 additions & 1 deletion pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
cast,
)

from pandas._config import using_nullable_dtypes

from pandas._libs import lib
from pandas._typing import (
FilePath,
ReadBuffer,
Expand Down Expand Up @@ -1043,7 +1046,7 @@ def read_html(
keep_default_na: bool = True,
displayed_only: bool = True,
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
use_nullable_dtypes: bool = False,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
) -> list[DataFrame]:
r"""
Read HTML tables into a ``list`` of ``DataFrame`` objects.
Expand Down Expand Up @@ -1213,6 +1216,12 @@ def read_html(
)
validate_header_arg(header)

use_nullable_dtypes = (
use_nullable_dtypes
if use_nullable_dtypes is not lib.no_default
else using_nullable_dtypes()
)

io = stringify_path(io)

return _parse(
Expand Down
11 changes: 10 additions & 1 deletion pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@

import numpy as np

from pandas._config import using_nullable_dtypes

from pandas._libs import lib
from pandas._libs.json import (
dumps,
loads,
Expand Down Expand Up @@ -496,7 +499,7 @@ def read_json(
compression: CompressionOptions = "infer",
nrows: int | None = None,
storage_options: StorageOptions = None,
use_nullable_dtypes: bool = False,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
) -> DataFrame | Series | JsonReader:
"""
Convert a JSON string to pandas object.
Expand Down Expand Up @@ -732,6 +735,12 @@ def read_json(
if orient == "table" and convert_axes:
raise ValueError("cannot pass both convert_axes and orient='table'")

use_nullable_dtypes = (
use_nullable_dtypes
if use_nullable_dtypes is not lib.no_default
else using_nullable_dtypes()
)

if dtype is None and orient != "table":
# error: Incompatible types in assignment (expression has type "bool", variable
# has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
Expand Down
14 changes: 12 additions & 2 deletions pandas/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@
Literal,
)

from pandas._config import get_option
from pandas._config import (
get_option,
using_nullable_dtypes,
)

from pandas._libs import lib
from pandas._typing import (
FilePath,
ReadBuffer,
Expand All @@ -33,7 +37,7 @@
def read_orc(
path: FilePath | ReadBuffer[bytes],
columns: list[str] | None = None,
use_nullable_dtypes: bool = False,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
**kwargs,
) -> DataFrame:
"""
Expand Down Expand Up @@ -86,6 +90,12 @@ def read_orc(

orc = import_optional_dependency("pyarrow.orc")

use_nullable_dtypes = (
use_nullable_dtypes
if use_nullable_dtypes is not lib.no_default
else using_nullable_dtypes()
)

with get_handle(path, "rb", is_text=False) as handles:
orc_file = orc.ORCFile(handles.handle)
pa_table = orc_file.read(columns=columns, **kwargs)
Expand Down
11 changes: 10 additions & 1 deletion pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
)
from warnings import catch_warnings

from pandas._config import using_nullable_dtypes

from pandas._libs import lib
from pandas._typing import (
FilePath,
ReadBuffer,
Expand Down Expand Up @@ -453,7 +456,7 @@ def read_parquet(
engine: str = "auto",
columns: list[str] | None = None,
storage_options: StorageOptions = None,
use_nullable_dtypes: bool = False,
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
**kwargs,
) -> DataFrame:
"""
Expand Down Expand Up @@ -511,6 +514,12 @@ def read_parquet(
"""
impl = get_engine(engine)

use_nullable_dtypes = (
use_nullable_dtypes
if use_nullable_dtypes is not lib.no_default
else using_nullable_dtypes()
)

return impl.read(
path,
columns=columns,
Expand Down
Loading