String dtype: rename the storage options and add na_value keyword in StringDtype() #59330

Merged: 17 commits, Jul 29, 2024
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyx
@@ -2702,7 +2702,7 @@ def maybe_convert_objects(ndarray[object] objects,
if using_string_dtype() and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(storage="pyarrow_numpy")
dtype = StringDtype(storage="pyarrow", na_value=np.nan)
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
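For illustration (not part of the diff): a minimal sketch of the dtype rename this hunk relies on, assuming a pandas build that includes this PR and pyarrow >= 10.0.1 installed. The old "pyarrow_numpy" spelling is still accepted but remapped to the new storage/na_value pair.

import numpy as np
import pandas as pd

# Both spellings should describe the same dtype after the remapping in StringDtype.__init__.
new = pd.StringDtype(storage="pyarrow", na_value=np.nan)
old = pd.StringDtype(storage="pyarrow_numpy")  # slated for deprecation
print(new == old)    # expected: True
print(new.na_value)  # expected: nan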
4 changes: 2 additions & 2 deletions pandas/_testing/__init__.py
@@ -509,14 +509,14 @@ def shares_memory(left, right) -> bool:
if (
isinstance(left, ExtensionArray)
and is_string_dtype(left.dtype)
and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined]
and left.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left = cast("ArrowExtensionArray", left)
if (
isinstance(right, ExtensionArray)
and is_string_dtype(right.dtype)
and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined]
and right.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
right = cast("ArrowExtensionArray", right)
left_pa_data = left._pa_array
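Illustrative sketch (not part of the diff) of why the membership test can become a plain equality check, assuming this PR and pyarrow installed: both missing-value variants now report the same storage.

import numpy as np
import pandas as pd

na_arr = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow"))
nan_arr = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan))
# "pyarrow_numpy" no longer appears as a storage value, so == "pyarrow" covers both.
print(na_arr.dtype.storage, nan_arr.dtype.storage)  # expected: pyarrow pyarrow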
6 changes: 2 additions & 4 deletions pandas/core/arrays/arrow/array.py
@@ -575,10 +575,8 @@ def __getitem__(self, item: PositionalIndexer):
if isinstance(item, np.ndarray):
if not len(item):
# Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
if self._dtype.name == "string" and self._dtype.storage in (
"pyarrow",
"pyarrow_numpy",
):
if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
# TODO(infer_string) should this be large_string?
Review comment (Member): Yeah I think this was overlooked when the large_string transition happened. Might be nice if this pyarrow type was an attribute on StringDtype?

pa_dtype = pa.string()
else:
pa_dtype = self._dtype.pyarrow_dtype
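A quick sketch (not part of the diff, assumes pyarrow installed) of the empty-indexer path this hunk touches: indexing with a zero-length integer array should keep the string dtype.

import numpy as np
import pandas as pd

arr = pd.array(["a", "b", "c"], dtype=pd.StringDtype("pyarrow"))
empty = arr[np.array([], dtype=np.intp)]  # hits the len(item) == 0 branch above
print(len(empty), empty.dtype == arr.dtype)  # expected: 0 True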
89 changes: 68 additions & 21 deletions pandas/core/arrays/string_.py
@@ -9,7 +9,10 @@

import numpy as np

from pandas._config import get_option
from pandas._config import (
get_option,
using_string_dtype,
)

from pandas._libs import (
lib,
@@ -81,8 +84,10 @@ class StringDtype(StorageExtensionDtype):

Parameters
----------
storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
storage : {"python", "pyarrow"}, optional
If not given, the value of ``pd.options.mode.string_storage``.
na_value : {np.nan, pd.NA}, default pd.NA
Whether the dtype follows NaN or NA missing value semantics.

Attributes
----------
@@ -113,30 +118,67 @@ class StringDtype(StorageExtensionDtype):
# follows NumPy semantics, which uses nan.
@property
def na_value(self) -> libmissing.NAType | float: # type: ignore[override]
if self.storage == "pyarrow_numpy":
return np.nan
else:
return libmissing.NA
return self._na_value

_metadata = ("storage",)
_metadata = ("storage", "_na_value") # type: ignore[assignment]

def __init__(self, storage=None) -> None:
def __init__(
self,
storage: str | None = None,
na_value: libmissing.NAType | float = libmissing.NA,
) -> None:
# infer defaults
if storage is None:
infer_string = get_option("future.infer_string")
if infer_string:
storage = "pyarrow_numpy"
if using_string_dtype():
storage = "pyarrow"
else:
storage = get_option("mode.string_storage")
if storage not in {"python", "pyarrow", "pyarrow_numpy"}:

if storage == "pyarrow_numpy":
# TODO raise a deprecation warning
storage = "pyarrow"
na_value = np.nan

# validate options
if storage not in {"python", "pyarrow"}:
raise ValueError(
f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
f"Got {storage} instead."
f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
)
if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1:
if storage == "pyarrow" and pa_version_under10p1:
raise ImportError(
"pyarrow>=10.0.1 is required for PyArrow backed StringArray."
)

if isinstance(na_value, float) and np.isnan(na_value):
# when passed a NaN value, always set to np.nan to ensure we use
# a consistent NaN value (and we can use `dtype.na_value is np.nan`)
na_value = np.nan
elif na_value is not libmissing.NA:
raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}")

self.storage = storage
self._na_value = na_value

def __eq__(self, other: object) -> bool:
# we need to override the base class __eq__ because na_value (NA or NaN)
# cannot be checked with normal `==`
if isinstance(other, str):
if other == self.name:
return True
try:
other = self.construct_from_string(other)
except TypeError:
return False
if isinstance(other, type(self)):
return self.storage == other.storage and self.na_value is other.na_value
return False

def __hash__(self) -> int:
# need to override __hash__ as well because of overriding __eq__
return super().__hash__()

def __reduce__(self):
return StringDtype, (self.storage, self.na_value)

@property
def type(self) -> type[str]:
Expand Down Expand Up @@ -181,6 +223,7 @@ def construct_from_string(cls, string) -> Self:
elif string == "string[pyarrow]":
return cls(storage="pyarrow")
elif string == "string[pyarrow_numpy]":
# TODO deprecate
return cls(storage="pyarrow_numpy")
else:
raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
@@ -205,7 +248,7 @@ def construct_array_type( # type: ignore[override]

if self.storage == "python":
return StringArray
elif self.storage == "pyarrow":
elif self.storage == "pyarrow" and self._na_value is libmissing.NA:
return ArrowStringArray
else:
return ArrowStringArrayNumpySemantics
@@ -217,13 +260,17 @@ def __from_arrow__(
Construct StringArray from pyarrow Array/ChunkedArray.
"""
if self.storage == "pyarrow":
from pandas.core.arrays.string_arrow import ArrowStringArray
if self._na_value is libmissing.NA:
from pandas.core.arrays.string_arrow import ArrowStringArray

return ArrowStringArray(array)
else:
from pandas.core.arrays.string_arrow import (
ArrowStringArrayNumpySemantics,
)

return ArrowStringArray(array)
elif self.storage == "pyarrow_numpy":
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
return ArrowStringArrayNumpySemantics(array)

return ArrowStringArrayNumpySemantics(array)
else:
import pyarrow

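To summarise the new constructor, equality, and array-type dispatch in one place, a sketch assuming this PR and pyarrow >= 10.0.1 (the printed values are the expected behaviour, not output captured from the PR):

import pickle

import numpy as np
import pandas as pd

na_dtype = pd.StringDtype("pyarrow")                    # na_value defaults to pd.NA
nan_dtype = pd.StringDtype("pyarrow", na_value=np.nan)  # NumPy-style missing values

print(na_dtype == "string[pyarrow]")     # expected: True (string alias still matches)
print(na_dtype == nan_dtype)             # expected: False (same storage, different na_value)
print(na_dtype.construct_array_type())   # expected: ArrowStringArray
print(nan_dtype.construct_array_type())  # expected: ArrowStringArrayNumpySemantics

# __reduce__ carries na_value, so pickling round-trips the NaN variant.
print(pickle.loads(pickle.dumps(nan_dtype)) == nan_dtype)  # expected: True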
11 changes: 5 additions & 6 deletions pandas/core/arrays/string_arrow.py
@@ -131,6 +131,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr
# base class "ArrowExtensionArray" defined the type as "ArrowDtype")
_dtype: StringDtype # type: ignore[assignment]
_storage = "pyarrow"
_na_value: libmissing.NAType | float = libmissing.NA

def __init__(self, values) -> None:
_chk_pyarrow_available()
@@ -140,7 +141,7 @@ def __init__(self, values) -> None:
values = pc.cast(values, pa.large_string())

super().__init__(values)
self._dtype = StringDtype(storage=self._storage)
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)

if not pa.types.is_large_string(self._pa_array.type) and not (
pa.types.is_dictionary(self._pa_array.type)
@@ -187,10 +188,7 @@ def _from_sequence(

if dtype and not (isinstance(dtype, str) and dtype == "string"):
dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage in (
"pyarrow",
"pyarrow_numpy",
)
assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"

if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype in ensure_string_array and
@@ -597,7 +595,8 @@ def _rank(


class ArrowStringArrayNumpySemantics(ArrowStringArray):
_storage = "pyarrow_numpy"
_storage = "pyarrow"
_na_value = np.nan

@classmethod
def _result_converter(cls, values, na=None):
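Sketch (not part of the diff, assumes pyarrow installed): with both classes now reporting storage "pyarrow", only the dtype's na_value distinguishes the two array types.

import pyarrow as pa

from pandas.core.arrays.string_arrow import (
    ArrowStringArray,
    ArrowStringArrayNumpySemantics,
)

data = pa.array(["a", None, "c"], type=pa.large_string())
print(ArrowStringArray(data).dtype.na_value)                # expected: <NA>
print(ArrowStringArrayNumpySemantics(data).dtype.na_value)  # expected: nan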
4 changes: 2 additions & 2 deletions pandas/core/construction.py
@@ -574,7 +574,7 @@ def sanitize_array(
if isinstance(data, str) and using_string_dtype() and original_dtype is None:
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype("pyarrow_numpy")
dtype = StringDtype("pyarrow", na_value=np.nan)
data = construct_1d_arraylike_from_scalar(data, len(index), dtype)

return data
@@ -608,7 +608,7 @@ def sanitize_array(
elif data.dtype.kind == "U" and using_string_dtype():
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(storage="pyarrow_numpy")
dtype = StringDtype(storage="pyarrow", na_value=np.nan)
subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)

if subarr is data and copy:
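Sketch of the two sanitize_array paths changed here (assumes this PR, pyarrow installed, and the future string option): string scalars and numpy unicode arrays should both land on the NaN-semantics pyarrow dtype.

import numpy as np
import pandas as pd

with pd.option_context("future.infer_string", True):
    s = pd.Series("x", index=range(3))              # scalar broadcast path
    u = pd.Series(np.array(["a", "b"], dtype="U"))  # numpy unicode path
    print(s.dtype.storage, s.dtype.na_value)  # expected: pyarrow nan
    print(u.dtype == s.dtype)                 # expected: True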
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
@@ -801,7 +801,7 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
if using_string_dtype():
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(storage="pyarrow_numpy")
dtype = StringDtype(storage="pyarrow", na_value=np.nan)

elif isinstance(val, (np.datetime64, dt.datetime)):
try:
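Sketch of the scalar-inference branch above (assumes this PR and pyarrow installed); infer_dtype_from_scalar is internal, so this only shows which dtype the branch now produces.

import pandas as pd
from pandas.core.dtypes.cast import infer_dtype_from_scalar

with pd.option_context("future.infer_string", True):
    dtype, _ = infer_dtype_from_scalar("hello")
    print(dtype.storage, dtype.na_value)  # expected: pyarrow nan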
3 changes: 2 additions & 1 deletion pandas/core/indexes/base.py
@@ -5453,9 +5453,10 @@ def equals(self, other: Any) -> bool:

if (
isinstance(self.dtype, StringDtype)
and self.dtype.storage == "pyarrow_numpy"
and self.dtype.na_value is np.nan
and other.dtype != self.dtype
):
# TODO(infer_string) can we avoid this special case?
# special case for object behavior
return other.equals(self.astype(object))

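Sketch of the special case kept here (assumes this PR and pyarrow installed): an index using the NaN-semantics string dtype still compares equal to an object index holding the same values.

import numpy as np
import pandas as pd

left = pd.Index(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan))
right = pd.Index(["a", "b"], dtype=object)
print(left.equals(right))  # expected: True, via the astype(object) fallback above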
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
@@ -302,7 +302,7 @@ def ndarray_to_mgr(
nb = new_block_2d(values, placement=bp, refs=refs)
block_values = [nb]
elif dtype is None and values.dtype.kind == "U" and using_string_dtype():
dtype = StringDtype(storage="pyarrow_numpy")
dtype = StringDtype(storage="pyarrow", na_value=np.nan)

obj_columns = list(values)
block_values = [
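Sketch of the ndarray_to_mgr path (assumes this PR, pyarrow, and the future string option): a 2-D numpy unicode array handed to the DataFrame constructor should come out as NaN-semantics string columns.

import numpy as np
import pandas as pd

with pd.option_context("future.infer_string", True):
    df = pd.DataFrame(np.array([["a", "b"], ["c", "d"]], dtype="U"))
    col = df.dtypes[0]
    print(col.storage, col.na_value)  # expected: pyarrow nan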
3 changes: 2 additions & 1 deletion pandas/core/reshape/encoding.py
@@ -10,6 +10,7 @@

import numpy as np

from pandas._libs import missing as libmissing
from pandas._libs.sparse import IntIndex

from pandas.core.dtypes.common import (
@@ -256,7 +257,7 @@ def _get_dummies_1d(
dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment]
elif (
isinstance(input_dtype, StringDtype)
and input_dtype.storage != "pyarrow_numpy"
and input_dtype.na_value is libmissing.NA
):
dtype = pandas_dtype("boolean") # type: ignore[assignment]
else:
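Sketch (assumes this PR and pyarrow installed): get_dummies now keys the result dtype off the string dtype's missing-value semantics instead of the old storage name.

import numpy as np
import pandas as pd

na_s = pd.Series(["a", "b"], dtype=pd.StringDtype("pyarrow"))
nan_s = pd.Series(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan))
print(pd.get_dummies(na_s).dtypes.iloc[0])   # expected: boolean (nullable)
print(pd.get_dummies(nan_s).dtypes.iloc[0])  # expected: bool (plain numpy)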
3 changes: 1 addition & 2 deletions pandas/core/reshape/merge.py
@@ -2677,8 +2677,7 @@ def _factorize_keys(

elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
isinstance(lk.dtype, StringDtype)
and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"]
isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow"
):
import pyarrow as pa
import pyarrow.compute as pc
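Sketch (assumes this PR and pyarrow installed): both string-dtype variants now take the pyarrow factorization path when merging on string keys.

import numpy as np
import pandas as pd

dtype = pd.StringDtype("pyarrow", na_value=np.nan)
left = pd.DataFrame({"key": pd.array(["a", "b"], dtype=dtype), "x": [1, 2]})
right = pd.DataFrame({"key": pd.array(["b", "c"], dtype=dtype), "y": [3, 4]})
print(pd.merge(left, right, on="key"))  # expected: a single row for key "b"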
9 changes: 6 additions & 3 deletions pandas/core/tools/numeric.py
@@ -7,7 +7,10 @@

import numpy as np

from pandas._libs import lib
from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.cast import maybe_downcast_numeric
@@ -218,7 +221,7 @@ def to_numeric(
coerce_numeric=coerce_numeric,
convert_to_masked_nullable=dtype_backend is not lib.no_default
or isinstance(values_dtype, StringDtype)
and not values_dtype.storage == "pyarrow_numpy",
and values_dtype.na_value is libmissing.NA,
)

if new_mask is not None:
@@ -229,7 +232,7 @@
dtype_backend is not lib.no_default
and new_mask is None
or isinstance(values_dtype, StringDtype)
and not values_dtype.storage == "pyarrow_numpy"
and values_dtype.na_value is libmissing.NA
):
new_mask = np.zeros(values.shape, dtype=np.bool_)

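Sketch (assumes this PR and pyarrow installed): to_numeric should keep returning plain numpy dtypes for the NaN-semantics variant and nullable dtypes for the NA variant, which is what the na_value check above preserves.

import numpy as np
import pandas as pd

na_s = pd.Series(["1", "2"], dtype=pd.StringDtype("pyarrow"))
nan_s = pd.Series(["1", "2"], dtype=pd.StringDtype("pyarrow", na_value=np.nan))
print(pd.to_numeric(na_s).dtype)   # expected: Int64 (nullable)
print(pd.to_numeric(nan_s).dtype)  # expected: int64 (numpy)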
6 changes: 4 additions & 2 deletions pandas/io/_util.py
@@ -2,6 +2,8 @@

from typing import TYPE_CHECKING

import numpy as np

from pandas.compat._optional import import_optional_dependency

import pandas as pd
@@ -32,6 +34,6 @@ def arrow_string_types_mapper() -> Callable:
pa = import_optional_dependency("pyarrow")

return {
pa.string(): pd.StringDtype(storage="pyarrow_numpy"),
pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"),
pa.string(): pd.StringDtype(storage="pyarrow", na_value=np.nan),
pa.large_string(): pd.StringDtype(storage="pyarrow", na_value=np.nan),
}.get
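Sketch (assumes this PR and pyarrow installed): the mapper, which the Arrow-based readers use when the future string option is enabled, now hands pyarrow string types to the NaN-semantics StringDtype.

import pyarrow as pa

from pandas.io._util import arrow_string_types_mapper

mapper = arrow_string_types_mapper()
dtype = mapper(pa.large_string())
print(dtype.storage, dtype.na_value)  # expected: pyarrow nan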