Skip to content

Commit 56ae252

Browse files
committed
WIP: preliminary support for stringdtype
1 parent f676c5f commit 56ae252

File tree

16 files changed

+175
-54
lines changed

16 files changed

+175
-54
lines changed

asv_bench/asv.conf.json

+24-17
Original file line numberDiff line numberDiff line change
@@ -41,23 +41,30 @@
4141
// pip (with all the conda available packages installed first,
4242
// followed by the pip installed packages).
4343
"matrix": {
44-
"Cython": ["0.29.33"],
45-
"matplotlib": [],
46-
"sqlalchemy": [],
47-
"scipy": [],
48-
"numba": [],
49-
"numexpr": [],
50-
"pytables": [null, ""], // platform dependent, see excludes below
51-
"pyarrow": [],
52-
"tables": [null, ""],
53-
"openpyxl": [],
54-
"xlsxwriter": [],
55-
"xlrd": [],
56-
"odfpy": [],
57-
"jinja2": [],
58-
"meson": [],
59-
"meson-python": [],
60-
"python-build": [],
44+
"req": {
45+
"pip+/home/nathan/Documents/numpy": [],
46+
"Cython": ["0.29.33"],
47+
"matplotlib": [],
48+
"sqlalchemy": [],
49+
"scipy": [],
50+
"numba": [],
51+
"numexpr": [],
52+
"pytables": [null, ""], // platform dependent, see excludes below
53+
"pyarrow": [],
54+
"tables": [null, ""],
55+
"openpyxl": [],
56+
"xlsxwriter": [],
57+
"xlrd": [],
58+
"odfpy": [],
59+
"jinja2": [],
60+
"meson": [],
61+
"meson-python": [],
62+
"python-build": [],
63+
"pip+/home/nathan/Documents/numpy-user-dtypes/stringdtype": []
64+
},
65+
"env": {
66+
"NUMPY_EXPERIMENTAL_DTYPE_API": "1"
67+
}
6168
},
6269
"conda_channels": ["conda-forge"],
6370
// Combinations of libraries/python versions can be excluded/included

asv_bench/benchmarks/strings.py

+19-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import warnings
22

33
import numpy as np
4+
from stringdtype import StringDType
45

56
from pandas import (
67
NA,
@@ -14,24 +15,38 @@
1415

1516

1617
class Dtypes:
17-
params = ["str", "string[python]", "string[pyarrow]"]
18+
params = ["str", "string[python]", "string[pyarrow]", StringDType()]
1819
param_names = ["dtype"]
20+
dtype_mapping = {
21+
"str": "str",
22+
"string[python]": object,
23+
"string[pyarrow]": object,
24+
StringDType(): StringDType(),
25+
}
1926

2027
def setup(self, dtype):
2128
try:
22-
self.s = Series(tm.makeStringIndex(10**5), dtype=dtype)
29+
self.s = Series(
30+
tm.makeStringIndex(10**5, dtype=self.dtype_mapping[dtype]),
31+
dtype=dtype,
32+
)
2333
except ImportError:
2434
raise NotImplementedError
2535

2636

2737
class Construction:
2838
params = (
2939
["series", "frame", "categorical_series"],
30-
["str", "string[python]", "string[pyarrow]"],
40+
["str", "string[python]", "string[pyarrow]", StringDType()],
3141
)
3242
param_names = ["pd_type", "dtype"]
3343
pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series}
34-
dtype_mapping = {"str": "str", "string[python]": object, "string[pyarrow]": object}
44+
dtype_mapping = {
45+
"str": "str",
46+
"string[python]": object,
47+
"string[pyarrow]": object,
48+
StringDType(): StringDType(),
49+
}
3550

3651
def setup(self, pd_type, dtype):
3752
series_arr = tm.rands_array(

pandas/_libs/lib.pyx

+8-3
Original file line numberDiff line numberDiff line change
@@ -1391,9 +1391,9 @@ cdef object _try_infer_map(object dtype):
13911391
return None
13921392

13931393

1394-
def infer_dtype(value: object, skipna: bool = True) -> str:
1394+
def infer_dtype(value: object, skipna: bool = True) -> object:
13951395
"""
1396-
Return a string label of the type of a scalar or list-like of values.
1396+
Return the type of a scalar or list-like of values.
13971397

13981398
Parameters
13991399
----------
@@ -1403,7 +1403,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
14031403

14041404
Returns
14051405
-------
1406-
str
1406+
str or dtype object
14071407
Describing the common type of the input data.
14081408
Results can include:
14091409

@@ -1427,6 +1427,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
14271427
- mixed
14281428
- unknown-array
14291429

1430+
For non-legacy NumPy dtypes whose scalar type is a ``str`` subclass, the dtype object itself is returned instead of a string label.
1431+
14301432
Raises
14311433
------
14321434
TypeError
@@ -1529,6 +1531,9 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
15291531
if inferred is not None:
15301532
# Anything other than object-dtype should return here.
15311533
return inferred
1534+
elif not getattr(type(values.dtype), "_legacy", True):
1535+
if issubclass(values.dtype.type, str):
1536+
return values.dtype
15321537

15331538
if values.descr.type_num != NPY_OBJECT:
15341539
# i.e. values.dtype != np.object_

pandas/_testing/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -356,8 +356,8 @@ def getCols(k) -> str:
356356

357357

358358
# make index
359-
def makeStringIndex(k: int = 10, name=None) -> Index:
360-
return Index(rands_array(nchars=10, size=k), name=name)
359+
def makeStringIndex(k: int = 10, name=None, dtype: NpDtype = "O") -> Index:
360+
return Index(rands_array(nchars=10, size=k, dtype=dtype), name=name)
361361

362362

363363
def makeCategoricalIndex(

pandas/core/common.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from pandas.core.dtypes.common import (
3636
is_bool_dtype,
3737
is_integer,
38+
is_legacy_string_dtype,
3839
)
3940
from pandas.core.dtypes.generic import (
4041
ABCExtensionArray,
@@ -243,7 +244,7 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi
243244
# has incompatible type "Iterable[Any]"; expected "Sized"
244245
return construct_1d_object_array_from_listlike(values) # type: ignore[arg-type]
245246

246-
if issubclass(result.dtype.type, str):
247+
if is_legacy_string_dtype(result.dtype):
247248
result = np.asarray(values, dtype=object)
248249

249250
if result.ndim == 2:

pandas/core/construction.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
maybe_promote,
4444
)
4545
from pandas.core.dtypes.common import (
46+
is_legacy_string_dtype,
4647
is_list_like,
4748
is_object_dtype,
4849
pandas_dtype,
@@ -708,7 +709,7 @@ def _sanitize_str_dtypes(
708709

709710
# This is to prevent mixed-type Series getting all casted to
710711
# NumPy string type, e.g. NaN --> '-1#IND'.
711-
if issubclass(result.dtype.type, str):
712+
if is_legacy_string_dtype(result.dtype):
712713
# GH#16605
713714
# If not empty convert the data to dtype
714715
# GH#19853: If data is a scalar, result has already the result

pandas/core/dtypes/astype.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from pandas.errors import IntCastingNaNError
1919

2020
from pandas.core.dtypes.common import (
21+
is_legacy_string_dtype,
2122
is_object_dtype,
2223
is_string_dtype,
2324
pandas_dtype,
@@ -89,7 +90,7 @@ def _astype_nansafe(
8990
res = arr.astype(dtype, copy=copy)
9091
return np.asarray(res)
9192

92-
if issubclass(dtype.type, str):
93+
if issubclass(dtype.type, str) and is_legacy_string_dtype(dtype):
9394
shape = arr.shape
9495
if arr.ndim > 1:
9596
arr = arr.ravel()
@@ -183,7 +184,7 @@ def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> Arra
183184
values = _astype_nansafe(values, dtype, copy=copy)
184185

185186
# in pandas we don't store numpy str dtypes, so convert to object
186-
if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
187+
if isinstance(dtype, np.dtype) and is_legacy_string_dtype(values.dtype):
187188
values = np.array(values, dtype=object)
188189

189190
return values

pandas/core/dtypes/common.py

+40-1
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,7 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool:
515515
"""
516516
Faster alternative to is_string_dtype, assumes we have a np.dtype object.
517517
"""
518-
return dtype == object or dtype.kind in "SU"
518+
return dtype == object or dtype.kind in "SU" or issubclass(dtype.type, str)
519519

520520

521521
def is_string_dtype(arr_or_dtype) -> bool:
@@ -1662,6 +1662,44 @@ def is_all_strings(value: ArrayLike) -> bool:
16621662
return dtype == "string"
16631663

16641664

1665+
def is_legacy_string_dtype(arr_or_dtype, include_bytes=False) -> bool:
1666+
"""Check if the dtype is a numpy legacy string dtype
1667+
1668+
Parameters
1669+
----------
1670+
arr_or_dtype : array-like or dtype
1671+
The array-like or dtype to check
1672+
1673+
include_bytes : bool, default False
1674+
Whether to also return True for legacy bytes dtypes.
1675+
1676+
Returns
1677+
-------
1678+
boolean
1679+
True for legacy numpy dtypes that represent python strings,
1680+
False otherwise. If include_bytes is True, also true for
1681+
legacy bytes dtypes.
1682+
1683+
"""
1684+
if arr_or_dtype is None:
1685+
return False
1686+
1687+
dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype)
1688+
1689+
if not isinstance(dtype, np.dtype):
1690+
return False
1691+
1692+
# The _legacy attribute was added in NumPy 1.25. If the attribute isn't
1693+
# defined on the dtype class, the installed NumPy predates it, so we must be
1694+
# dealing with a legacy dtype.
1695+
is_legacy = getattr(type(dtype), "_legacy", True)
1696+
if not is_legacy:
1697+
return False
1698+
if include_bytes:
1699+
return issubclass(dtype.type, (str, bytes))
1700+
return issubclass(dtype.type, str)
1701+
1702+
16651703
__all__ = [
16661704
"classes",
16671705
"DT64NS_DTYPE",
@@ -1696,6 +1734,7 @@ def is_all_strings(value: ArrayLike) -> bool:
16961734
"is_interval",
16971735
"is_interval_dtype",
16981736
"is_iterator",
1737+
"is_legacy_string_dtype",
16991738
"is_named_tuple",
17001739
"is_nested_list_like",
17011740
"is_number",

pandas/core/dtypes/dtypes.py

-1
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,6 @@ def _from_values_or_dtype(
275275
>>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
276276
CategoricalDtype(categories=['x', 'y'], ordered=False, categories_dtype=object)
277277
"""
278-
279278
if dtype is not None:
280279
# The dtype argument takes precedence over values.dtype (if any)
281280
if isinstance(dtype, str):

pandas/core/dtypes/missing.py

+6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
)
1212

1313
import numpy as np
14+
from stringdtype import StringDType
1415

1516
from pandas._config import get_option
1617

@@ -305,6 +306,11 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo
305306

306307
if dtype.kind in ("S", "U"):
307308
result = np.zeros(values.shape, dtype=bool)
309+
elif type(dtype) is StringDType:
310+
if inf_as_na:
311+
result = ~np.isfinite(values)
312+
else:
313+
result = np.isnan(values)
308314
else:
309315
if values.ndim in {1, 2}:
310316
result = libmissing.isnaobj(values, inf_as_na=inf_as_na)

pandas/core/indexes/base.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import warnings
2222

2323
import numpy as np
24+
from stringdtype import StringDType
2425

2526
from pandas._config import get_option
2627

@@ -506,7 +507,7 @@ def __new__(
506507
if isinstance(data, ABCMultiIndex):
507508
data = data._values
508509

509-
if data.dtype.kind not in "iufcbmM":
510+
if data.dtype.kind not in "iufcbmM" and type(data.dtype) != StringDType:
510511
# GH#11836 we need to avoid having numpy coerce
511512
# things that look like ints/floats to ints unless
512513
# they are actually ints, e.g. '0' and 0.0

pandas/core/internals/blocks.py

+24-9
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
from pandas.core.dtypes.common import (
5858
ensure_platform_int,
5959
is_1d_only_ea_dtype,
60+
is_legacy_string_dtype,
6061
is_list_like,
6162
is_string_dtype,
6263
)
@@ -2317,7 +2318,7 @@ def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
23172318
if isinstance(values, np.ndarray):
23182319
values = ensure_wrapped_if_datetimelike(values)
23192320

2320-
if issubclass(values.dtype.type, str):
2321+
if is_legacy_string_dtype(values.dtype):
23212322
values = np.array(values, dtype=object)
23222323

23232324
if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None:
@@ -2347,15 +2348,29 @@ def get_block_type(dtype: DtypeObj) -> type[Block]:
23472348
# Note: need to be sure PandasArray is unwrapped before we get here
23482349
return ExtensionBlock
23492350

2350-
# We use kind checks because it is much more performant
2351-
# than is_foo_dtype
2352-
kind = dtype.kind
2353-
if kind in "Mm":
2354-
return DatetimeLikeBlock
2355-
elif kind in "fciub":
2356-
return NumericBlock
2351+
dtype_class = type(dtype)
2352+
2353+
# The _is_numeric attribute was added in NumPy 1.25. On older NumPy versions,
2354+
# fall back to dtype.kind checks, using ObjectBlock for any remaining kind.
2355+
try:
2356+
is_numeric = dtype_class._is_numeric
2357+
except AttributeError:
2358+
# We use kind checks because it is much more performant
2359+
# than is_foo_dtype
2360+
kind = dtype.kind
2361+
if kind in "Mm":
2362+
return DatetimeLikeBlock
2363+
elif kind in "fciub":
2364+
return NumericBlock
2365+
else:
2366+
return ObjectBlock
23572367

2358-
return ObjectBlock
2368+
if is_numeric:
2369+
return NumericBlock
2370+
else:
2371+
if is_legacy_string_dtype(dtype):
2372+
return ObjectBlock
2373+
return NumpyBlock
23592374

23602375

23612376
def new_block_2d(

pandas/core/internals/construction.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from pandas.core.dtypes.common import (
2929
is_1d_only_ea_dtype,
3030
is_integer_dtype,
31+
is_legacy_string_dtype,
3132
is_list_like,
3233
is_named_tuple,
3334
is_object_dtype,
@@ -330,7 +331,7 @@ def ndarray_to_mgr(
330331
_check_values_indices_shape_match(values, index, columns)
331332

332333
if typ == "array":
333-
if issubclass(values.dtype.type, str):
334+
if is_legacy_string_dtype(values.dtype):
334335
values = np.array(values, dtype=object)
335336

336337
if dtype is None and is_object_dtype(values.dtype):

0 commit comments

Comments
 (0)