From c24b5b668bff8e73917c6238455a9f547362b20b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 31 Jul 2019 15:52:00 -0500 Subject: [PATCH 01/49] API: Add string extension type This adds a new extension type 'string' for storing string data. The data model is essentially unchanged from master. Strings are still stored in an object-dtype ndarray. Scalar elements are still Python strs, and `np.nan` is still used as the missing value for the string dtype. --- doc/source/getting_started/basics.rst | 9 +- doc/source/reference/arrays.rst | 26 ++- doc/source/user_guide/text.rst | 121 +++++++++++--- doc/source/whatsnew/v1.0.0.rst | 27 +++ pandas/__init__.py | 1 + pandas/arrays/__init__.py | 2 + pandas/core/api.py | 1 + pandas/core/arrays/__init__.py | 1 + pandas/core/arrays/numpy_.py | 6 +- pandas/core/arrays/string_.py | 181 +++++++++++++++++++++ pandas/core/dtypes/missing.py | 1 + pandas/core/strings.py | 118 ++++++++++---- pandas/tests/api/test_api.py | 1 + pandas/tests/arrays/string_/test_string.py | 60 +++++++ pandas/tests/extension/test_string.py | 105 ++++++++++++ pandas/tests/test_strings.py | 25 +++ pandas/util/testing.py | 3 + 17 files changed, 629 insertions(+), 59 deletions(-) create mode 100644 pandas/core/arrays/string_.py create mode 100644 pandas/tests/arrays/string_/test_string.py create mode 100644 pandas/tests/extension/test_string.py diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 3f6f56376861f..bffd2c575e5ba 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1704,7 +1704,8 @@ built-in string methods. For example: .. 
ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], + dtype="string") s.str.lower() Powerful pattern-matching methods are provided as well, but note that @@ -1712,6 +1713,12 @@ pattern-matching generally uses `regular expressions `__ by default (and in some cases always uses them). +.. note:: + + Prior to pandas 1.0, string methods were only available on ``object`` -dtype + ``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated + to strings. See :ref:`text.types` for more. + Please see :ref:`Vectorized String Methods ` for a complete description. diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 7f464bf952bfb..f1a155ca85cbf 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -24,6 +24,7 @@ Intervals :class:`IntervalDtype` :class:`Interval` :ref:`api.array Nullable Integer :class:`Int64Dtype`, ... (none) :ref:`api.arrays.integer_na` Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical` Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse` +Text :class:`StringDtype` :class:`str` :ref:`api.arrays.string` =================== ========================= ================== ============================= Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). @@ -460,6 +461,29 @@ and methods if the :class:`Series` contains sparse values. See :ref:`api.series.sparse` for more. +.. _api.arrays.string: + +Text data +--------- + +When working with text data, where each valid element is a string, we recommend using +:class:`StringDtype` (with the alias ``"string"``). + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + arrays.StringArray + +.. 
autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + StringDtype + +The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.StringArray`. +See :ref:`api.series.str` for more. + .. Dtype attributes which are manually listed in their docstrings: including .. it here to make sure a docstring page is built for them @@ -471,4 +495,4 @@ and methods if the :class:`Series` contains sparse values. See DatetimeTZDtype.unit DatetimeTZDtype.tz PeriodDtype.freq - IntervalDtype.subtype \ No newline at end of file + IntervalDtype.subtype diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index acb5810e5252a..762c16e9a84eb 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -6,8 +6,61 @@ Working with text data ====================== +.. _text.types: + +Text Data Types +--------------- + +.. versionadded:: 1.0.0 + +There are two main ways to store text data + +1. ``object`` -dtype NumPy array. +2. As an :class:`arrays.StringArray` extension type. + +We recommend using :class:`arrays.StringArray` to store text data. + +Prior to pandas 1.0, ``object`` dtype was the only option. This was unfortunate +for many reasons: + +1. You can accidentally store a *mixture* of strings and non-strings in an + ``object`` dtype array. It's better to have a dedicated dtype. +2. ``object`` dtype breaks dtype-specific operations like ``select_dtypes``. + There isn't a clear way to select *just* text while excluding non-text + but still object-dtype columns. +3. When reading code, the contents of an ``object`` dtype array is less clear + than ``string``. + +For backwards-compatibility, ``object`` dtype remains the default type we +infer a list of strings to + +.. ipython:: python + + pd.Series(['a', 'b', 'c']) + +To explicitly request ``string`` dtype, specify the ``dtype`` + +.. 
ipython:: python + + pd.Series(['a', 'b', 'c'], dtype="string") + pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype()) + +Or ``astype`` after the ``Series`` or ``DataFrame`` is created + +.. ipython:: python + + s = pd.Series(['a', 'b', 'c']) + s + s.astype("string") + +Everything that follows in the rest of this document applies equally to +``string`` and ``object`` dtype. + .. _text.string_methods: +String Methods +-------------- + Series and Index are equipped with a set of string processing methods that make it easy to operate on each element of the array. Perhaps most importantly, these methods exclude missing/NA values automatically. These are @@ -16,7 +69,8 @@ the equivalent (scalar) built-in string methods: .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], + dtype="string") s.str.lower() s.str.upper() s.str.len() @@ -90,7 +144,7 @@ Methods like ``split`` return a Series of lists: .. ipython:: python - s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) + s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string") s2.str.split('_') Elements in the split lists can be accessed using ``get`` or ``[]`` notation: @@ -106,6 +160,9 @@ It is easy to expand this to return a DataFrame using ``expand``. s2.str.split('_', expand=True) +When original ``Series`` has :class:`StringDtype`, the output columns will all +be :class:`StringDtype` as well. + It is also possible to limit the number of splits: .. ipython:: python @@ -125,7 +182,8 @@ i.e., from the end of the string to the beginning of the string: .. ipython:: python s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', - '', np.nan, 'CABA', 'dog', 'cat']) + '', np.nan, 'CABA', 'dog', 'cat'], + dtype="string") s3 s3.str.replace('^.a|dog', 'XX-XX ', case=False) @@ -136,7 +194,7 @@ following code will cause trouble because of the regular expression meaning of .. 
ipython:: python # Consider the following badly formatted financial data - dollars = pd.Series(['12', '-$10', '$10,000']) + dollars = pd.Series(['12', '-$10', '$10,000'], dtype="string") # This does what you'd naively expect: dollars.str.replace('$', '') @@ -174,7 +232,7 @@ positional argument (a regex object) and return a string. def repl(m): return m.group(0)[::-1] - pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(pat, repl) + pd.Series(['foo 123', 'bar baz', np.nan], dtype="string").str.replace(pat, repl) # Using regex groups pat = r"(?P\w+) (?P\w+) (?P\w+)" @@ -182,7 +240,7 @@ positional argument (a regex object) and return a string. def repl(m): return m.group('two').swapcase() - pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) + pd.Series(['Foo Bar Baz', np.nan], dtype="string").str.replace(pat, repl) .. versionadded:: 0.20.0 @@ -221,7 +279,7 @@ The content of a ``Series`` (or ``Index``) can be concatenated: .. ipython:: python - s = pd.Series(['a', 'b', 'c', 'd']) + s = pd.Series(['a', 'b', 'c', 'd'], dtype="string") s.str.cat(sep=',') If not specified, the keyword ``sep`` for the separator defaults to the empty string, ``sep=''``: @@ -234,7 +292,7 @@ By default, missing values are ignored. Using ``na_rep``, they can be given a re .. ipython:: python - t = pd.Series(['a', 'b', np.nan, 'd']) + t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string") t.str.cat(sep=',') t.str.cat(sep=',', na_rep='-') @@ -279,7 +337,8 @@ the ``join``-keyword. .. ipython:: python :okwarning: - u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2]) + u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2], + dtype="string") s u s.str.cat(u) @@ -295,7 +354,8 @@ In particular, alignment also means that the different lengths do not need to co .. 
ipython:: python - v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4]) + v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4], + dtype="string") s v s.str.cat(v, join='left', na_rep='-') @@ -351,7 +411,8 @@ of the string, the result will be a ``NaN``. .. ipython:: python s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, - 'CABA', 'dog', 'cat']) + 'CABA', 'dog', 'cat'], + dtype="string") s.str[0] s.str[1] @@ -382,7 +443,8 @@ DataFrame with one column per group. .. ipython:: python - pd.Series(['a1', 'b2', 'c3']).str.extract(r'([ab])(\d)', expand=False) + pd.Series(['a1', 'b2', 'c3'], + dtype="string").str.extract(r'([ab])(\d)', expand=False) Elements that do not match return a row filled with ``NaN``. Thus, a Series of messy strings can be "converted" into a like-indexed Series @@ -395,14 +457,16 @@ Named groups like .. ipython:: python - pd.Series(['a1', 'b2', 'c3']).str.extract(r'(?P[ab])(?P\d)', - expand=False) + pd.Series(['a1', 'b2', 'c3'], + dtype="string").str.extract(r'(?P[ab])(?P\d)', + expand=False) and optional groups like .. ipython:: python - pd.Series(['a1', 'b2', '3']).str.extract(r'([ab])?(\d)', expand=False) + pd.Series(['a1', 'b2', '3'], + dtype="string").str.extract(r'([ab])?(\d)', expand=False) can also be used. Note that any capture group names in the regular expression will be used for column names; otherwise capture group @@ -413,20 +477,23 @@ with one column if ``expand=True``. .. ipython:: python - pd.Series(['a1', 'b2', 'c3']).str.extract(r'[ab](\d)', expand=True) + pd.Series(['a1', 'b2', 'c3'], + dtype="string").str.extract(r'[ab](\d)', expand=True) It returns a Series if ``expand=False``. .. ipython:: python - pd.Series(['a1', 'b2', 'c3']).str.extract(r'[ab](\d)', expand=False) + pd.Series(['a1', 'b2', 'c3'], + dtype="string").str.extract(r'[ab](\d)', expand=False) Calling on an ``Index`` with a regex with exactly one capture group returns a ``DataFrame`` with one column if ``expand=True``. .. 
ipython:: python - s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"]) + s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"], + dtype="string") s s.index.str.extract("(?P[a-zA-Z])", expand=True) @@ -471,7 +538,8 @@ Unlike ``extract`` (which returns only the first match), .. ipython:: python - s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) + s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], + dtype="string") s two_groups = '(?P[a-z])(?P[0-9])' s.str.extract(two_groups, expand=True) @@ -489,7 +557,7 @@ When each subject string in the Series has exactly one match, .. ipython:: python - s = pd.Series(['a3', 'b3', 'c2']) + s = pd.Series(['a3', 'b3', 'c2'], dtype="string") s then ``extractall(pat).xs(0, level='match')`` gives the same result as @@ -510,7 +578,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0). pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups) - pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups) + pd.Series(["a1a2", "b1", "c1"], dtype="string").str.extractall(two_groups) Testing for Strings that match or contain a pattern @@ -521,13 +589,15 @@ You can check whether elements contain a pattern: .. ipython:: python pattern = r'[0-9][a-z]' - pd.Series(['1', '2', '3a', '3b', '03c']).str.contains(pattern) + pd.Series(['1', '2', '3a', '3b', '03c'], + dtype="string").str.contains(pattern) Or whether elements match a pattern: .. ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c']).str.match(pattern) + pd.Series(['1', '2', '3a', '3b', '03c'], + dtype="string").str.match(pattern) The distinction between ``match`` and ``contains`` is strictness: ``match`` relies on strict ``re.match``, while ``contains`` relies on ``re.search``. @@ -537,7 +607,8 @@ an extra ``na`` argument so missing values can be considered True or False: .. 
ipython:: python - s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], + dtype="string") s4.str.contains('A', na=False) .. _text.indicator: @@ -550,7 +621,7 @@ For example if they are separated by a ``'|'``: .. ipython:: python - s = pd.Series(['a', 'a|b', np.nan, 'a|c']) + s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="string") s.str.get_dummies(sep='|') String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``. diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0be4ebc627b30..4e34d0e5efa02 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -21,6 +21,33 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +Dedicated string data type +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added :class:`StringDtype`, an extension type dedicated to string data. +Previously, strings were typically stored in object-dtype NumPy arrays. + +.. ipython:: python + + pd.Series(['abc', None, 'def'], dtype=pd.StringDtype()) + +You can use the alias ``'string'`` as well. + +.. ipython:: python + + s = pd.Series(['abc', None, 'def'], dtype="string") + s + +The usual string accessor methods work. Where appropriate, the return type +of the Series or columns of a DataFrame will also have string dtype. + + s.str.upper() + s.str.split('b', expand=True).dtypes + +We recommend explicitly using the ``string`` data type when working with strings. +See :ref:`text.types` for more. 
+ + - - diff --git a/pandas/__init__.py b/pandas/__init__.py index 6351b508fb0e5..5db1814943cf9 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -66,6 +66,7 @@ PeriodDtype, IntervalDtype, DatetimeTZDtype, + StringDtype, # missing isna, isnull, diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index db01f2a0c674f..9870b5bed076d 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -11,6 +11,7 @@ PandasArray, PeriodArray, SparseArray, + StringArray, TimedeltaArray, ) @@ -22,5 +23,6 @@ "PandasArray", "PeriodArray", "SparseArray", + "StringArray", "TimedeltaArray", ] diff --git a/pandas/core/api.py b/pandas/core/api.py index 73323d93b8215..fabd4ee3414ea 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -21,6 +21,7 @@ DatetimeTZDtype, ) from pandas.core.arrays import Categorical +from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array from pandas.core.groupby import Grouper, NamedAgg from pandas.io.formats.format import set_eng_float_format diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 5c83ed8cf5e24..868118bac6a7b 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -10,4 +10,5 @@ from .numpy_ import PandasArray, PandasDtype # noqa: F401 from .period import PeriodArray, period_array # noqa: F401 from .sparse import SparseArray # noqa: F401 +from .string_ import StringArray # noqa: F401 from .timedeltas import TimedeltaArray # noqa: F401 diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 4e2e37d88eb9a..e3a0ff0eceb1a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -236,7 +236,11 @@ def __setitem__(self, key, value): value = np.asarray(value) values = self._ndarray - t = np.result_type(value, values) + if isinstance(value, str): + # Avoid issues with result_type and typecodes. 
+ t = object + else: + t = np.result_type(value, values) if t != self._ndarray.dtype: values = values.astype(t, casting="safe") values[key] = value diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py new file mode 100644 index 0000000000000..9a7cfb45d68ea --- /dev/null +++ b/pandas/core/arrays/string_.py @@ -0,0 +1,181 @@ +from typing import Type + +import numpy as np + +from pandas._libs import lib + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_array_like + +from pandas.core.arrays import PandasArray +from pandas.core.construction import extract_array + + +@register_extension_dtype +class StringDtype(ExtensionDtype): + """ + Extension dtype for text data. + + .. versionadded:: 1.0.0 + + Examples + -------- + >>> pd.StringDtype() + StringDtype + """ + + @property + def na_value(self): + return np.nan + + @property + def type(self) -> Type: + return str + + @property + def name(self) -> str: + return "string" + + @classmethod + def construct_from_string(cls, string: str): + if string in {"string", "str"}: + return cls() + return super().construct_from_string(string) + + @classmethod + def construct_array_type(cls) -> "Type[StringArray]": + return StringArray + + def __repr__(self) -> str: + return "StringDtype" + + +class StringArray(PandasArray): + """ + Extension array for text data. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + values : ndarray + copy : bool, default False + + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") + + ['This is', 'some text', nan, 'data.'] + Length: 4, dtype: string + + Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string + values. 
+ + >>> pd.array(['1', 1], dtype="string") + Traceback (most recent call last): + ... + ValueError: Must provide strings + """ + + # undo the PandasArray hack + _typ = "extension" + + def __init__(self, values, copy=False): + super().__init__(values, copy=copy) + self._dtype = StringDtype() + self._validate() + + def _validate(self): + """Validate that we only store NA or strings.""" + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): + raise ValueError("Must provide strings") + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + if dtype: + assert dtype == "string" + result = super()._from_sequence(scalars, dtype=object, copy=copy) + # convert None to np.nan + # TODO: it would be nice to do this in _validate / lib.is_string_array + # We are already doing a scan over the values there. + result[result.isna()] = np.nan + return result + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + def __setitem__(self, key, value): + value = extract_array(value, extract_numpy=True) + if isinstance(value, type(self)): + value = value._ndarray + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") + + # validate new items + if scalar_value: + if scalar_value is None: + value = np.nan + elif not (isinstance(value, str) or np.isnan(value)): + raise ValueError( + "Cannot set value '{}' into a StringArray.".format(value) + ) + else: + if not is_array_like(value): + value = np.asarray(value, dtype=object) + if len(value) and not lib.is_string_array(value, skipna=True): + raise ValueError("Must provide strings.") + + super().__setitem__(key, value) + + def fillna(self, value=None, method=None, limit=None): + # TODO: validate dtype + return super().fillna(value, method, limit) + + def astype(self, dtype, copy=True): + 
dtype = pandas_dtype(dtype) + if isinstance(dtype, StringDtype): + if copy: + return self.copy() + return self + return super().astype(dtype, copy) + + def __add__(self, other): + return _add(self, other) + + def __radd__(self, other): + return _add(self, other, reversed=True) + + def _reduce(self, name, skipna=True, **kwargs): + raise TypeError("Cannot perform reduction '{}' with string dtype".format(name)) + + def value_counts(self, dropna=False): + from pandas import value_counts + + return value_counts(self._ndarray, dropna=dropna) + + +def _add(array, other, reversed=False): + if isinstance(other, (ABCIndexClass, ABCSeries)): + return NotImplemented + + mask = array.isna() + if isinstance(other, type(array)): + mask |= other.isna() + other = other._ndarray[~mask] + + valid = ~mask + + out = np.empty_like(array._ndarray, dtype="object") + out[mask] = np.nan + if reversed: + out[valid] = other + array._ndarray[valid] + else: + out[valid] = array._ndarray[valid] + other + + return type(array)(out) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 6f599a6be6021..c4d791c2be20e 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -128,6 +128,7 @@ def isna(obj): def _isna_new(obj): + if is_scalar(obj): return libmissing.checknull(obj) # hack (for now) because MI registers as ndarray diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 25350119f9df5..7edf3abc409e0 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -15,6 +15,7 @@ ensure_object, is_bool_dtype, is_categorical_dtype, + is_extension_array_dtype, is_integer, is_list_like, is_re, @@ -817,12 +818,15 @@ def _str_extract_frame(arr, pat, flags=0): result_index = arr.index except AttributeError: result_index = None - return DataFrame( + result = DataFrame( [groups_or_na(val) for val in arr], columns=columns, index=result_index, dtype=object, ) + if arr.dtype.name == "string": + result = result.astype("string") + return 
result def str_extract(arr, pat, flags=0, expand=True): @@ -912,7 +916,7 @@ def str_extract(arr, pat, flags=0, expand=True): return _str_extract_frame(arr._orig, pat, flags=flags) else: result, name = _str_extract_noexpand(arr._parent, pat, flags=flags) - return arr._wrap_result(result, name=name, expand=expand) + return arr._wrap_result(result, name=name, expand=expand, returns_string=True) def str_extractall(arr, pat, flags=0): @@ -1020,7 +1024,9 @@ def str_extractall(arr, pat, flags=0): index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) - result = arr._constructor_expanddim(match_list, index=index, columns=columns) + result = arr._constructor_expanddim( + match_list, index=index, columns=columns, dtype=arr.dtype + ) return result @@ -1858,11 +1864,18 @@ def wrapper(self, *args, **kwargs): return _forbid_nonstring_types -def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=["bytes"], **kargs): +def _noarg_wrapper( + f, + name=None, + docstring=None, + forbidden_types=["bytes"], + returns_string=True, + **kargs +): @forbid_nonstring_types(forbidden_types, name=name) def wrapper(self): result = _na_map(f, self._parent, **kargs) - return self._wrap_result(result) + return self._wrap_result(result, returns_string=returns_string) wrapper.__name__ = f.__name__ if name is None else name if docstring is not None: @@ -1874,22 +1887,28 @@ def wrapper(self): def _pat_wrapper( - f, flags=False, na=False, name=None, forbidden_types=["bytes"], **kwargs + f, + flags=False, + na=False, + name=None, + forbidden_types=["bytes"], + returns_string=True, + **kwargs ): @forbid_nonstring_types(forbidden_types, name=name) def wrapper1(self, pat): result = f(self._parent, pat) - return self._wrap_result(result) + return self._wrap_result(result, returns_string=returns_string) @forbid_nonstring_types(forbidden_types, name=name) def wrapper2(self, pat, flags=0, **kwargs): result = f(self._parent, pat, flags=flags, **kwargs) - return 
self._wrap_result(result) + return self._wrap_result(result, returns_string=returns_string) @forbid_nonstring_types(forbidden_types, name=name) def wrapper3(self, pat, na=np.nan): result = f(self._parent, pat, na=na) - return self._wrap_result(result) + return self._wrap_result(result, returns_string=returns_string) wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 @@ -1926,6 +1945,7 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data) + self._is_string = data.dtype.name == "string" # .values.categories works for both Series/Index self._parent = data.values.categories if self._is_categorical else data @@ -1967,6 +1987,9 @@ def _validate(data): values = getattr(data, "values", data) # Series / Index values = getattr(values, "categories", values) # categorical / normal + if is_extension_array_dtype(values.dtype): + return str(values.dtype) + try: inferred_dtype = lib.infer_dtype(values, skipna=True) except ValueError: @@ -1992,7 +2015,13 @@ def __iter__(self): g = self.get(i) def _wrap_result( - self, result, use_codes=True, name=None, expand=None, fill_value=np.nan + self, + result, + use_codes=True, + name=None, + expand=None, + fill_value=np.nan, + returns_string=True, ): from pandas import Index, Series, MultiIndex @@ -2069,11 +2098,14 @@ def cons_row(x): index = self._orig.index if expand: cons = self._orig._constructor_expanddim - return cons(result, columns=name, index=index) + result = cons(result, columns=name, index=index) else: # Must be a Series cons = self._orig._constructor - return cons(result, name=name, index=index) + result = cons(result, name=name, index=index) + if self._is_string and returns_string: + result = result.astype("string") + return result def _get_series_list(self, others): """ @@ -2339,7 +2371,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): result = Index(result, dtype=object, 
name=self._orig.name) else: # Series result = Series( - result, dtype=object, index=data.index, name=self._orig.name + result, dtype=self._orig.dtype, index=data.index, name=self._orig.name ) return result @@ -2479,13 +2511,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): @forbid_nonstring_types(["bytes"]) def split(self, pat=None, n=-1, expand=False): result = str_split(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand) + return self._wrap_result(result, expand=expand, returns_string=expand) @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand) + return self._wrap_result(result, expand=expand, returns_string=expand) _shared_docs[ "str_partition" @@ -2586,7 +2618,7 @@ def rsplit(self, pat=None, n=-1, expand=False): def partition(self, sep=" ", expand=True): f = lambda x: x.partition(sep) result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand) + return self._wrap_result(result, expand=expand, returns_string=expand) @Appender( _shared_docs["str_partition"] @@ -2602,7 +2634,7 @@ def partition(self, sep=" ", expand=True): def rpartition(self, sep=" ", expand=True): f = lambda x: x.rpartition(sep) result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand) + return self._wrap_result(result, expand=expand, returns_string=expand) @copy(str_get) def get(self, i): @@ -2621,13 +2653,13 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains( self._parent, pat, case=case, flags=flags, na=na, regex=regex ) - return self._wrap_result(result, fill_value=na) + return self._wrap_result(result, fill_value=na, returns_string=False) @copy(str_match) @forbid_nonstring_types(["bytes"]) def match(self, pat, case=True, flags=0, na=np.nan): result = 
str_match(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na) + return self._wrap_result(result, fill_value=na, returns_string=False) @copy(str_replace) @forbid_nonstring_types(["bytes"]) @@ -2762,13 +2794,14 @@ def slice_replace(self, start=None, stop=None, repl=None): def decode(self, encoding, errors="strict"): # need to allow bytes here result = str_decode(self._parent, encoding, errors) - return self._wrap_result(result) + # TODO: Not sure how to handle this. + return self._wrap_result(result, returns_string=False) @copy(str_encode) @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors="strict"): result = str_encode(self._parent, encoding, errors) - return self._wrap_result(result) + return self._wrap_result(result, returns_string=False) _shared_docs[ "str_strip" @@ -2869,7 +2902,11 @@ def get_dummies(self, sep="|"): data = self._orig.astype(str) if self._is_categorical else self._parent result, name = str_get_dummies(data, sep) return self._wrap_result( - result, use_codes=(not self._is_categorical), name=name, expand=True + result, + use_codes=(not self._is_categorical), + name=name, + expand=True, + returns_string=False, ) @copy(str_translate) @@ -2878,10 +2915,16 @@ def translate(self, table): result = str_translate(self._parent, table) return self._wrap_result(result) - count = _pat_wrapper(str_count, flags=True, name="count") - startswith = _pat_wrapper(str_startswith, na=True, name="startswith") - endswith = _pat_wrapper(str_endswith, na=True, name="endswith") - findall = _pat_wrapper(str_findall, flags=True, name="findall") + count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False) + startswith = _pat_wrapper( + str_startswith, na=True, name="startswith", returns_string=False + ) + endswith = _pat_wrapper( + str_endswith, na=True, name="endswith", returns_string=False + ) + findall = _pat_wrapper( + str_findall, flags=True, name="findall", returns_string=False + ) 
@copy(str_extract) @forbid_nonstring_types(["bytes"]) @@ -2929,7 +2972,7 @@ def extractall(self, pat, flags=0): @forbid_nonstring_types(["bytes"]) def find(self, sub, start=0, end=None): result = str_find(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result) + return self._wrap_result(result, returns_string=False) @Appender( _shared_docs["find"] @@ -2942,7 +2985,7 @@ def find(self, sub, start=0, end=None): @forbid_nonstring_types(["bytes"]) def rfind(self, sub, start=0, end=None): result = str_find(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result) + return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) def normalize(self, form): @@ -3004,7 +3047,7 @@ def normalize(self, form): @forbid_nonstring_types(["bytes"]) def index(self, sub, start=0, end=None): result = str_index(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result) + return self._wrap_result(result, returns_string=False) @Appender( _shared_docs["index"] @@ -3018,7 +3061,7 @@ def index(self, sub, start=0, end=None): @forbid_nonstring_types(["bytes"]) def rindex(self, sub, start=0, end=None): result = str_index(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result) + return self._wrap_result(result, returns_string=False) _shared_docs[ "len" @@ -3067,7 +3110,11 @@ def rindex(self, sub, start=0, end=None): dtype: float64 """ len = _noarg_wrapper( - len, docstring=_shared_docs["len"], forbidden_types=None, dtype=int + len, + docstring=_shared_docs["len"], + forbidden_types=None, + dtype=int, + returns_string=False, ) _shared_docs[ @@ -3339,46 +3386,55 @@ def rindex(self, sub, start=0, end=None): lambda x: x.isalnum(), name="isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], + returns_string=False, ) isalpha = _noarg_wrapper( lambda x: x.isalpha(), name="isalpha", docstring=_shared_docs["ismethods"] % 
_doc_args["isalpha"], + returns_string=False, ) isdigit = _noarg_wrapper( lambda x: x.isdigit(), name="isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], + returns_string=False, ) isspace = _noarg_wrapper( lambda x: x.isspace(), name="isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"], + returns_string=False, ) islower = _noarg_wrapper( lambda x: x.islower(), name="islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"], + returns_string=False, ) isupper = _noarg_wrapper( lambda x: x.isupper(), name="isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"], + returns_string=False, ) istitle = _noarg_wrapper( lambda x: x.istitle(), name="istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"], + returns_string=False, ) isnumeric = _noarg_wrapper( lambda x: x.isnumeric(), name="isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], + returns_string=False, ) isdecimal = _noarg_wrapper( lambda x: x.isdecimal(), name="isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], + returns_string=False, ) @classmethod diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 326bef7f4b480..db7888f4b7dca 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -70,6 +70,7 @@ class TestPDApi(Base): "SparseDataFrame", "SparseDtype", "SparseSeries", + "StringDtype", "Timedelta", "TimedeltaIndex", "Timestamp", diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py new file mode 100644 index 0000000000000..2ab9488461e16 --- /dev/null +++ b/pandas/tests/arrays/string_/test_string.py @@ -0,0 +1,60 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm + + +def test_none_to_nan(): + a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) + assert a[1] is not None + assert np.isnan(a[1]) + + +def test_setitem_validates(): + a = 
pd.arrays.StringArray._from_sequence(["a", "b"]) + with pytest.raises(ValueError, match="10"): + a[0] = 10 + + with pytest.raises(ValueError, match="strings"): + a[:] = np.array([1, 2]) + + +@pytest.mark.parametrize( + "input, method", + [ + (["a", "b", "c"], operator.methodcaller("capitalize")), + (["a", "b", "c"], operator.methodcaller("capitalize")), + (["a b", "a bc. de"], operator.methodcaller("capitalize")), + ], +) +def test_string_methods(input, method): + a = pd.Series(input, dtype="string") + b = pd.Series(input, dtype="object") + result = method(a.str) + expected = method(b.str) + + assert result.dtype.name == "string" + tm.assert_series_equal(result.astype(object), expected) + + +def test_add(): + a = pd.Series(["a", "b", "c", None, None], dtype="string") + b = pd.Series(["x", "y", None, "z", None], dtype="string") + + result = a + b + expected = pd.Series(["ax", "by", None, None, None], dtype="string") + tm.assert_series_equal(result, expected) + + result = a.add(b) + tm.assert_series_equal(result, expected) + + result = a.radd(b) + expected = pd.Series(["xa", "yb", None, None, None], dtype="string") + tm.assert_series_equal(result, expected) + + result = a.add(b, fill_value="-") + expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py new file mode 100644 index 0000000000000..a09b3b424269a --- /dev/null +++ b/pandas/tests/extension/test_string.py @@ -0,0 +1,105 @@ +import random +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.string_ import StringArray, StringDtype +from pandas.tests.extension import base + + +@pytest.fixture +def dtype(): + return StringDtype() + + +@pytest.fixture +def data(): + strings = random.choices(string.ascii_letters, k=100) + while strings[0] == strings[1]: + strings = random.choices(string.ascii_letters, k=100) + + return 
StringArray._from_sequence(strings) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return StringArray._from_sequence([np.nan, "A"]) + + +@pytest.fixture +def data_for_sorting(): + return StringArray._from_sequence(["B", "C", "A"]) + + +@pytest.fixture +def data_missing_for_sorting(): + return StringArray._from_sequence(["B", np.nan, "A"]) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def data_for_grouping(): + return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"]) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestReduce(base.BaseNoReduceTests): + pass + + +class TestMethods(base.BaseMethodsTests): + pass + + +class TestCasting(base.BaseCastingTests): + pass + + +class TestComparisonOps(base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + result = getattr(s, op_name)(other) + expected = getattr(s.astype(object), op_name)(other) + self.assert_series_equal(result, expected) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + self._compare_other(s, data, op_name, "abc") + + +class TestParsing(base.BaseParsingTests): + pass diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index bc8dc7272a83a..0b51fd8682913 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -6,6 +6,8 @@ from numpy.random import randint import pytest +from pandas._libs import lib + from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna import pandas.core.strings as strings import 
pandas.util.testing as tm @@ -3269,3 +3271,26 @@ def test_casefold(self): result = s.str.casefold() tm.assert_series_equal(result, expected) + + +def test_string_array(any_string_method): + data = ["a", "bb", np.nan, "ccc"] + a = Series(data, dtype=object) + b = Series(data, dtype="string") + method_name, args, kwargs = any_string_method + + expected = getattr(a.str, method_name)(*args, **kwargs) + result = getattr(b.str, method_name)(*args, **kwargs) + + if isinstance(expected, Series): + if expected.dtype == "object" and lib.is_string_array( + expected.values, skipna=True + ): + assert result.dtype == "string" + result = result.astype(object) + tm.assert_series_equal(result, expected) + elif isinstance(expected, DataFrame): + columns = expected.select_dtypes(include="object").columns + assert all(result[columns].dtypes == "string") + result[columns] = result[columns].astype(object) + tm.assert_equal(result, expected) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index cf8452cdd0c59..73f07a83dd4fa 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1434,6 +1434,9 @@ def assert_equal(left, right, **kwargs): assert_extension_array_equal(left, right, **kwargs) elif isinstance(left, np.ndarray): assert_numpy_array_equal(left, right, **kwargs) + elif isinstance(left, str): + assert kwargs == {} + return left == right else: raise NotImplementedError(type(left)) From 3ecb5cc9610500bd588d9b2134120b4562d58642 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Aug 2019 13:16:46 -0500 Subject: [PATCH 02/49] test fixups --- pandas/core/arrays/string_.py | 2 +- pandas/core/strings.py | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 9a7cfb45d68ea..2f641c9fcd53c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -41,7 +41,7 @@ def name(self) -> str: @classmethod def construct_from_string(cls, string: 
str): - if string in {"string", "str"}: + if string == "string": return cls() return super().construct_from_string(string) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 7edf3abc409e0..aa3f72969d366 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1024,8 +1024,16 @@ def str_extractall(arr, pat, flags=0): index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) + # workaround #27953 + from pandas import StringDtype + + if isinstance(arr.dtype, StringDtype): + dtype = arr.dtype + else: + dtype = None + result = arr._constructor_expanddim( - match_list, index=index, columns=columns, dtype=arr.dtype + match_list, index=index, columns=columns, dtype=dtype ) return result @@ -1079,7 +1087,7 @@ def str_get_dummies(arr, sep="|"): for i, t in enumerate(tags): pat = sep + t + sep - dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x) + dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) return dummies, tags @@ -2370,9 +2378,12 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): # add dtype for case that result is all-NA result = Index(result, dtype=object, name=self._orig.name) else: # Series - result = Series( - result, dtype=self._orig.dtype, index=data.index, name=self._orig.name - ) + if is_categorical_dtype(self._orig.dtype): + # We need to infer the new categories. 
+ dtype = None + else: + dtype = self._orig.dtype + result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) return result _shared_docs[ From 59a7d398ecdb1f85c3edc2249e7b02ecea0d3110 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Aug 2019 14:20:30 -0500 Subject: [PATCH 03/49] string dtype --- pandas/core/strings.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index aa3f72969d366..64953878d978f 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -15,7 +15,6 @@ ensure_object, is_bool_dtype, is_categorical_dtype, - is_extension_array_dtype, is_integer, is_list_like, is_re, @@ -1984,6 +1983,8 @@ def _validate(data): ------- dtype : inferred dtype of data """ + from pandas import StringDtype + if isinstance(data, ABCMultiIndex): raise AttributeError( "Can only use .str accessor with Index, not MultiIndex" @@ -1995,8 +1996,9 @@ def _validate(data): values = getattr(data, "values", data) # Series / Index values = getattr(values, "categories", values) # categorical / normal - if is_extension_array_dtype(values.dtype): - return str(values.dtype) + # explicitly allow StringDtype + if isinstance(values.dtype, StringDtype): + return "string" try: inferred_dtype = lib.infer_dtype(values, skipna=True) From 7c07070053eba9149979458d3ae084b8ea035190 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Aug 2019 14:52:27 -0500 Subject: [PATCH 04/49] 35 compat --- pandas/tests/extension/test_string.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index a09b3b424269a..91dbd8d801c3e 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -16,7 +16,8 @@ def dtype(): @pytest.fixture def data(): - strings = random.choices(string.ascii_letters, k=100) + # strings = random.choices(string.ascii_letters, k=100) + strings = 
np.random.choice(list(string.ascii_letters), size=100) while strings[0] == strings[1]: strings = random.choices(string.ascii_letters, k=100) From 9e1a73b803d598ee6ac7699508dd8046ef5f2d0a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Aug 2019 14:53:08 -0500 Subject: [PATCH 05/49] doc --- doc/source/user_guide/text.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 762c16e9a84eb..6e76945f19bed 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -232,7 +232,8 @@ positional argument (a regex object) and return a string. def repl(m): return m.group(0)[::-1] - pd.Series(['foo 123', 'bar baz', np.nan], dtype="string").str.replace(pat, repl) + pd.Series(['foo 123', 'bar baz', np.nan], + dtype="string").str.replace(pat, repl) # Using regex groups pat = r"(?P\w+) (?P\w+) (?P\w+)" @@ -240,7 +241,8 @@ positional argument (a regex object) and return a string. def repl(m): return m.group('two').swapcase() - pd.Series(['Foo Bar Baz', np.nan], dtype="string").str.replace(pat, repl) + pd.Series(['Foo Bar Baz', np.nan], + dtype="string").str.replace(pat, repl) .. versionadded:: 0.20.0 From 16ccad817bfccefe76f8fc42106cae7e5edaed19 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Aug 2019 15:46:38 -0500 Subject: [PATCH 06/49] fixups --- doc/source/user_guide/text.rst | 4 ++-- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/tests/extension/test_string.py | 4 +--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 6e76945f19bed..2125e68840da3 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -160,8 +160,8 @@ It is easy to expand this to return a DataFrame using ``expand``. s2.str.split('_', expand=True) -When original ``Series`` has :ref:`StringDtype`, the output columns will all -be :ref:`StringDtype` as well. 
+When original ``Series`` has :class:`StringDtype`, the output columns will all +be :class:`StringDtype` as well. It is also possible to limit the number of splits: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 4e34d0e5efa02..6eed40c69c80c 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -24,7 +24,7 @@ Enhancements Dedicated string data type ^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added :ref:`StringDtype`, an extension type dedicated to string data. +We've added :class:`StringDtype`, an extension type dedicated to string data. Previously, strings were typically stored in object-dtype NumPy arrays. .. ipython:: python diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 91dbd8d801c3e..cba10b1b7f88b 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -1,4 +1,3 @@ -import random import string import numpy as np @@ -16,10 +15,9 @@ def dtype(): @pytest.fixture def data(): - # strings = random.choices(string.ascii_letters, k=100) strings = np.random.choice(list(string.ascii_letters), size=100) while strings[0] == strings[1]: - strings = random.choices(string.ascii_letters, k=100) + strings = np.random.choice(list(string.ascii_letters), size=100) return StringArray._from_sequence(strings) From 1027463a6a051ded62df8724836d49ea2b86d578 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Aug 2019 16:21:36 -0500 Subject: [PATCH 07/49] doc --- pandas/core/arrays/string_.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2f641c9fcd53c..bde590ab94a86 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -21,6 +21,11 @@ class StringDtype(ExtensionDtype): .. 
versionadded:: 1.0.0 + Attributes + ---------- + na_value + name + Examples -------- >>> pd.StringDtype() @@ -29,6 +34,9 @@ class StringDtype(ExtensionDtype): @property def na_value(self): + """ + StringDtype uses :attr:`numpy.nan` as the missing NA value. + """ return np.nan @property @@ -37,6 +45,9 @@ def type(self) -> Type: @property def name(self) -> str: + """ + The alias for StringDtype is ``'string'``. + """ return "string" @classmethod From aafb53bc9bbf476c2eb15942a71450bde152efa4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 18 Aug 2019 21:26:58 -0500 Subject: [PATCH 08/49] doc --- pandas/core/arrays/string_.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index bde590ab94a86..bddfea77f4795 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -23,7 +23,6 @@ class StringDtype(ExtensionDtype): Attributes ---------- - na_value name Examples From ab49169be613ba811a1e617c59d1d6d299592ba0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 19 Aug 2019 10:03:58 -0500 Subject: [PATCH 09/49] fix doc warnings --- doc/source/reference/arrays.rst | 4 ++-- pandas/core/arrays/string_.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index f1a155ca85cbf..db620e73301cb 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -467,7 +467,7 @@ Text data --------- When working with text data, where each valid element is a string, we recommend using -:ref:`StringDtype` (with the alias ``"string"``). +:class:`StringDtype` (with the alias ``"string"``). .. autosummary:: :toctree: api/ @@ -481,7 +481,7 @@ When working with text data, where each valid element is a string, we recommend StringDtype -The ``Series.str`` accessor is available for ``Series`` backed by a :ref:`arrays.StringArray`. 
+The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.StringArray`. See :ref:`api.series.str` for more. diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index bddfea77f4795..8966612312312 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -21,6 +21,10 @@ class StringDtype(ExtensionDtype): .. versionadded:: 1.0.0 + Parameters + ---------- + None + Attributes ---------- name @@ -74,6 +78,14 @@ class StringArray(PandasArray): values : ndarray copy : bool, default False + Attributes + ---------- + None + + Methods + ------- + None + Examples -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") From 978fb55dedeb8e9aa998a1c00572ab906b4a0668 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 19 Aug 2019 11:11:03 -0500 Subject: [PATCH 10/49] fixup docstrings --- pandas/core/arrays/string_.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8966612312312..0efb7ca97ab3c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -21,14 +21,14 @@ class StringDtype(ExtensionDtype): .. versionadded:: 1.0.0 - Parameters - ---------- - None - Attributes ---------- name + Methods + ------- + None + Examples -------- >>> pd.StringDtype() @@ -75,8 +75,16 @@ class StringArray(PandasArray): Parameters ---------- - values : ndarray + values : array-like + The array of data. + + .. warning:: + + Currently, this expects an object-dtype ndarray + where the elements are Python strings. This may + change without warning in the future. copy : bool, default False + Whether to copy the array of data. Attributes ---------- @@ -86,10 +94,16 @@ class StringArray(PandasArray): ------- None + See Also + -------- + Series.str + The string methods are available on Series backed by + a StringArray. 
+ Examples -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") - + ['This is', 'some text', nan, 'data.'] Length: 4, dtype: string From aebc68870acc4ca2d7b3940d6e0e3e7f1bd456cc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 19 Aug 2019 11:43:10 -0500 Subject: [PATCH 11/49] fixup docstrings --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0efb7ca97ab3c..5e20f3d2e3c91 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -23,7 +23,7 @@ class StringDtype(ExtensionDtype): Attributes ---------- - name + None Methods ------- From 41dc0f96cd93e133e329c6c08129e4da8d111133 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Sep 2019 09:40:18 -0500 Subject: [PATCH 12/49] lint --- pandas/core/api.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index ea6cfa7210bb9..04f2f84c92a15 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -10,6 +10,7 @@ ) from pandas.core.dtypes.missing import isna, isnull, notna, notnull +# TODO: Remove get_dummies import when statsmodels updates #18264 from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.arrays import Categorical from pandas.core.arrays.integer import ( @@ -44,9 +45,7 @@ from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range from pandas.core.indexing import IndexSlice -from pandas.core.reshape.reshape import ( # TODO: Remove get_dummies import when statsmodels updates #18264 - get_dummies, -) +from pandas.core.reshape.reshape import get_dummies from pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime from pandas.core.tools.numeric import to_numeric From 13cdddd9bbf59fb01f2665706a5e35c1b5ca64a1 Mon Sep 17 00:00:00 2001 From: Tom 
Augspurger Date: Mon, 16 Sep 2019 13:12:25 -0500 Subject: [PATCH 13/49] typing --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5e20f3d2e3c91..4fbcbd43a2595 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -54,7 +54,7 @@ def name(self) -> str: return "string" @classmethod - def construct_from_string(cls, string: str): + def construct_from_string(cls, string: str) -> ExtensionDtype: if string == "string": return cls() return super().construct_from_string(string) From 78c2eaa44269c2b29aa8849bd7f464e264bb0aff Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 18 Sep 2019 16:40:14 -0500 Subject: [PATCH 14/49] removed double assert --- pandas/tests/test_strings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 0b51fd8682913..b50f1a0fd2f2a 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3288,7 +3288,6 @@ def test_string_array(any_string_method): ): assert result.dtype == "string" result = result.astype(object) - tm.assert_series_equal(result, expected) elif isinstance(expected, DataFrame): columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == "string") From 726d0afcfbe896caf8bd5010f362fdbd55fd3236 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 19 Sep 2019 06:12:42 -0500 Subject: [PATCH 15/49] experimental --- doc/source/user_guide/text.rst | 5 +++++ pandas/core/arrays/string_.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 2125e68840da3..4405497d7c376 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -31,6 +31,11 @@ for many reasons: 3. When reading code, the contents of an ``object`` dtype array is less clear than ``string``. + +.. 
warning:: + + StringArray is currently considered experimental. + For backwards-compatibility, ``object`` dtype remains the default type we infer a list of strings to diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 4fbcbd43a2595..81d135401eda0 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -73,6 +73,11 @@ class StringArray(PandasArray): .. versionadded:: 1.0.0 + .. warning:: + + StringArray is considered experimental. The implementation and + parts of the API may change without warning. + Parameters ---------- values : array-like From 9cd99459889d5aa4b7e8e662cd91ff90af94250e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 19 Sep 2019 07:50:45 -0500 Subject: [PATCH 16/49] failing --- pandas/core/arrays/numpy_.py | 10 +++++---- pandas/core/arrays/string_.py | 26 +++++++++++++--------- pandas/tests/arrays/string_/test_string.py | 19 ++++++++++++++++ 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 32da0199e28f8..a979135f1b9d7 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -229,13 +229,15 @@ def __getitem__(self, item): def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) - if not lib.is_scalar(key) and is_list_like(key): + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + + if not scalar_key and is_list_like(key): key = np.asarray(key) - if not lib.is_scalar(value): - value = np.asarray(value) + if not scalar_value: + value = np.asarray(value, dtype=self._ndarray.dtype) - value = np.asarray(value, dtype=self._ndarray.dtype) self._ndarray[key] = value def __len__(self) -> int: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 81d135401eda0..432a125b8ec5e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,3 +1,4 @@ +import operator from typing import Type import 
numpy as np @@ -7,9 +8,10 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like +from pandas.core import ops from pandas.core.arrays import PandasArray from pandas.core.construction import extract_array @@ -132,7 +134,12 @@ def __init__(self, values, copy=False): def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("Must provide strings") + raise ValueError("StringArray requires an object-dtype ndarray of strings.") + if self._ndarray.dtype != "object": + raise ValueError( + "StringArray requires an object-dtype ndarray. Got " + "'{}' instead.".format(self._ndarray.dtype) + ) @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): @@ -152,7 +159,9 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): + # extract_array doesn't extract PandasArray subclasses value = value._ndarray + scalar_key = lib.is_scalar(key) scalar_value = lib.is_scalar(value) if scalar_key and not scalar_value: @@ -187,10 +196,10 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy) def __add__(self, other): - return _add(self, other) + return _add(self, other, operator.add) def __radd__(self, other): - return _add(self, other, reversed=True) + return _add(self, other, ops.radd) def _reduce(self, name, skipna=True, **kwargs): raise TypeError("Cannot perform reduction '{}' with string dtype".format(name)) @@ -201,8 +210,8 @@ def value_counts(self, dropna=False): return value_counts(self._ndarray, 
dropna=dropna) -def _add(array, other, reversed=False): - if isinstance(other, (ABCIndexClass, ABCSeries)): +def _add(array, other, op): + if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)): return NotImplemented mask = array.isna() @@ -214,9 +223,6 @@ def _add(array, other, reversed=False): out = np.empty_like(array._ndarray, dtype="object") out[mask] = np.nan - if reversed: - out[valid] = other + array._ndarray[valid] - else: - out[valid] = array._ndarray[valid] + other + out[valid] = op(array._ndarray[valid], other) return type(array)(out) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 2ab9488461e16..4846ea1860ad3 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -58,3 +58,22 @@ def test_add(): result = a.add(b, fill_value="-") expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string") tm.assert_series_equal(result, expected) + + +@pytest.mark.xfail(reason="TODO") # failing when adding np.nan to ndarray(['y']) +def test_add_frame(): + array = pd.array(["a", "b", np.nan, np.nan], dtype="string") + df = pd.DataFrame([["x", np.nan, "y", np.nan]]) + + assert array.__add__(df) is NotImplemented + result = array + df + expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]) + tm.assert_frame_equal(result, expected) + + +def test_constructor_raises(): + with pytest.raises(ValueError, match="object-dtype ndarray"): + pd.arrays.StringArray(np.array(["a", "b"], dtype="S1")) + + with pytest.raises(ValueError, match="object-dtype ndarray"): + pd.arrays.StringArray(np.array([])) From 070fb76e437d165951050e946f7500a7d344f0d4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 19 Sep 2019 09:29:02 -0500 Subject: [PATCH 17/49] xfails --- pandas/tests/arrays/string_/test_string.py | 24 ++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py 
b/pandas/tests/arrays/string_/test_string.py index 4846ea1860ad3..91414414f18ad 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -60,14 +60,34 @@ def test_add(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(reason="TODO") # failing when adding np.nan to ndarray(['y']) +@pytest.mark.xfail(reason="GH-28527") +def test_add_strings(): + array = pd.array(["a", "b", "c", "d"], dtype="string") + df = pd.DataFrame([["t", "u", "v", "w"]]) + assert array.__add__(df) is NotImplemented + + result = array + df + expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("string") + tm.assert_frame_equal(result, expected) + + result = df + array + expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("string") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="GH-28527") def test_add_frame(): array = pd.array(["a", "b", np.nan, np.nan], dtype="string") df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert array.__add__(df) is NotImplemented + result = array + df - expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]) + expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("string") + tm.assert_frame_equal(result, expected) + + result = df + array + expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("string") tm.assert_frame_equal(result, expected) From 2b90639f57fcf939d2a795ea7d9256d428cd83a0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 19 Sep 2019 09:38:51 -0500 Subject: [PATCH 18/49] Handle non-ndarray in add --- pandas/core/arrays/string_.py | 22 ++++++++++++++++------ pandas/tests/arrays/string_/test_string.py | 13 +++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 432a125b8ec5e..63c1ddad4140b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,5 +1,5 @@ import operator -from typing import Type +from typing 
import TYPE_CHECKING, Type import numpy as np @@ -14,6 +14,10 @@ from pandas.core import ops from pandas.core.arrays import PandasArray from pandas.core.construction import extract_array +from pandas.core.missing import isna + +if TYPE_CHECKING: + from pandas._typing import Scalar, ArrayLike @register_extension_dtype @@ -38,7 +42,7 @@ class StringDtype(ExtensionDtype): """ @property - def na_value(self): + def na_value(self) -> "Scalar": """ StringDtype uses :attr:`numpy.nan` as the missing NA value. """ @@ -210,14 +214,20 @@ def value_counts(self, dropna=False): return value_counts(self._ndarray, dropna=dropna) -def _add(array, other, op): +def _add(array: StringArray, other: "ArrayLike", op): if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)): return NotImplemented - mask = array.isna() if isinstance(other, type(array)): - mask |= other.isna() - other = other._ndarray[~mask] + other = other._ndarray + + other = np.asarray(other) + + mask = array.isna() + if not lib.is_scalar(other): + mask |= isna(other) + + other = other[~mask] valid = ~mask diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 91414414f18ad..75db0ba6c988b 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -60,6 +60,19 @@ def test_add(): tm.assert_series_equal(result, expected) +def test_add_sequence(): + a = pd.array(["a", "b", None, None], dtype="string") + other = ["x", None, "y", None] + + result = a + other + expected = pd.array(["ax", None, None, None], dtype="string") + tm.assert_extension_array_equal(result, expected) + + result = other + a + expected = pd.array(["xa", None, None, None], dtype="string") + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.xfail(reason="GH-28527") def test_add_strings(): array = pd.array(["a", "b", "c", "d"], dtype="string") From 381c8892cec39ee30da04fab5a92f659402c9220 Mon Sep 17 00:00:00 2001 From: Tom Augspurger 
Date: Thu, 19 Sep 2019 10:25:42 -0500 Subject: [PATCH 19/49] fixup --- pandas/core/arrays/string_.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 63c1ddad4140b..198884e70bd88 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -221,10 +221,9 @@ def _add(array: StringArray, other: "ArrayLike", op): if isinstance(other, type(array)): other = other._ndarray - other = np.asarray(other) - mask = array.isna() if not lib.is_scalar(other): + other = np.asarray(other) mask |= isna(other) other = other[~mask] From bf82aad452f107cd959b1e9c808d183f472310c0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 19 Sep 2019 13:26:42 -0500 Subject: [PATCH 20/49] fixup --- pandas/core/arrays/string_.py | 62 ++++++++++++++-------- pandas/tests/arrays/string_/test_string.py | 18 +++++++ 2 files changed, 58 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 198884e70bd88..a1580cbd2c228 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -11,13 +11,14 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like +from pandas import compat from pandas.core import ops from pandas.core.arrays import PandasArray from pandas.core.construction import extract_array from pandas.core.missing import isna if TYPE_CHECKING: - from pandas._typing import Scalar, ArrayLike + from pandas._typing import Scalar @register_extension_dtype @@ -199,12 +200,6 @@ def astype(self, dtype, copy=True): return self return super().astype(dtype, copy) - def __add__(self, other): - return _add(self, other, operator.add) - - def __radd__(self, other): - return _add(self, other, ops.radd) - def _reduce(self, name, skipna=True, **kwargs): raise TypeError("Cannot perform reduction '{}' with string dtype".format(name)) @@ -213,25 +208,48 
@@ def value_counts(self, dropna=False): return value_counts(self._ndarray, dropna=dropna) + # Overrride parent, because we have different return types. + @classmethod + def _create_arithmetic_method(cls, op): + def method(self, other): + if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)): + return NotImplemented + + elif isinstance(other, cls): + other = other._ndarray + + mask = isna(self) | isna(other) + valid = ~mask -def _add(array: StringArray, other: "ArrayLike", op): - if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)): - return NotImplemented + if not lib.is_scalar(other): + other = np.asarray(other) + other = other[valid] - if isinstance(other, type(array)): - other = other._ndarray + result = np.empty_like(self._ndarray, dtype="object") + result[mask] = np.nan + result[valid] = op(self._ndarray[valid], other) - mask = array.isna() - if not lib.is_scalar(other): - other = np.asarray(other) - mask |= isna(other) + if op.__name__ in {"add", "radd", "mul", "rmul"}: + new = StringArray + elif mask.any(): + new = lambda x: np.asarray(x, dtype="object") + else: + new = lambda x: np.asarray(x, dtype="bool") + + return new(result) + + return compat.set_function_name(method, "__{}__".format(op.__name__), cls) + + @classmethod + def _add_arithmetic_ops(cls): + cls.__add__ = cls._create_arithmetic_method(operator.add) + cls.__radd__ = cls._create_arithmetic_method(ops.radd) - other = other[~mask] + cls.__mul__ = cls._create_arithmetic_method(operator.mul) + cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) - valid = ~mask + _create_comparison_method = _create_arithmetic_method - out = np.empty_like(array._ndarray, dtype="object") - out[mask] = np.nan - out[valid] = op(array._ndarray[valid], other) - return type(array)(out) +StringArray._add_arithmetic_ops() +StringArray._add_comparison_ops() diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 75db0ba6c988b..3fa4a08023ebe 100644 --- 
a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -40,6 +40,14 @@ def test_string_methods(input, method): tm.assert_series_equal(result.astype(object), expected) +def test_astype_roundtrip(): + s = pd.Series(pd.date_range("2000", periods=12)) + s[0] = None + + result = s.astype("string").astype("datetime64[ns]") + tm.assert_series_equal(result, s) + + def test_add(): a = pd.Series(["a", "b", "c", None, None], dtype="string") b = pd.Series(["x", "y", None, "z", None], dtype="string") @@ -73,6 +81,16 @@ def test_add_sequence(): tm.assert_extension_array_equal(result, expected) +def test_mul(): + a = pd.array(["a", "b", None], dtype="string") + result = a * 2 + expected = pd.array(["aa", "bb", None], dtype="string") + tm.assert_extension_array_equal(result, expected) + + result = 2 * a + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.xfail(reason="GH-28527") def test_add_strings(): array = pd.array(["a", "b", "c", "d"], dtype="string") From 79bd87a835dd9cdfcd50e6731019edefb49e95c9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 19 Sep 2019 13:28:13 -0500 Subject: [PATCH 21/49] note --- pandas/core/arrays/string_.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a1580cbd2c228..73d718973da72 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -46,6 +46,10 @@ class StringDtype(ExtensionDtype): def na_value(self) -> "Scalar": """ StringDtype uses :attr:`numpy.nan` as the missing NA value. + + .. warning:: + + `na_value` may change in a future release. """ return np.nan @@ -85,6 +89,9 @@ class StringArray(PandasArray): StringArray is considered experimental. The implementation and parts of the API may change without warning. + In particular, the NA value used may change to no longer be + ``numpy.nan``. 
+ Parameters ---------- values : array-like From fd242749cbaec3d87f05a33aeda692a023dd7978 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 08:12:23 -0500 Subject: [PATCH 22/49] spacing --- doc/source/user_guide/text.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 4405497d7c376..ce68966995a8b 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -187,7 +187,7 @@ i.e., from the end of the string to the beginning of the string: .. ipython:: python s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', - '', np.nan, 'CABA', 'dog', 'cat'], + '', np.nan, 'CABA', 'dog', 'cat'], dtype="string") s3 s3.str.replace('^.a|dog', 'XX-XX ', case=False) From 0635ede35a8b5db03e7f1406be55ea485d4b2314 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 08:14:30 -0500 Subject: [PATCH 23/49] warning note --- pandas/core/arrays/string_.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 73d718973da72..7a57caed8585b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -28,6 +28,14 @@ class StringDtype(ExtensionDtype): .. versionadded:: 1.0.0 + .. warning:: + + StringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + In particular, StringDtype.na_value may change to no longer be + ``numpy.nan``. 
+ Attributes ---------- None From d3311ee9e7d3a7a9c5f7e67d597b98c299fcbe0a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 08:16:12 -0500 Subject: [PATCH 24/49] update doc --- doc/source/getting_started/basics.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index bfe88dc7df1c3..b913a70983e9c 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1932,6 +1932,7 @@ period (time spans) :class:`PeriodDtype` :class:`Period` :class:`arrays. sparse :class:`SparseDtype` (none) :class:`arrays.SparseArray` :ref:`sparse` intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` nullable integer :class:`Int64Dtype`, ... (none) :class:`arrays.IntegerArray` :ref:`integer_na` +Text :class:`StringDtype` :class:`str` :class:`arrays.StringArray` :ref:`text` =================== ========================= ================== ============================= ============================= Pandas uses the ``object`` dtype for storing strings. From dce9258d7c457ea1b9086edad0766ba22c691639 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 08:20:05 -0500 Subject: [PATCH 25/49] doc updates --- doc/source/getting_started/basics.rst | 7 ++++++- doc/source/user_guide/text.rst | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index b913a70983e9c..7f2a60b51cae2 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1935,7 +1935,12 @@ nullable integer :class:`Int64Dtype`, ... (none) :class:`arrays. 
Text :class:`StringDtype` :class:`str` :class:`arrays.StringArray` :ref:`text` =================== ========================= ================== ============================= ============================= -Pandas uses the ``object`` dtype for storing strings. +Pandas has two ways to store strings. + +1. ``object`` dtype, which can hold any Python object, including strings. +2. :class:`arrays.StringArray`, which is dedicated to strings. + +Generally, we recommend using :class:`arrays.StringArray`. See :ref:`text.types` fore more. Finally, arbitrary objects may be stored using the ``object`` dtype, but should be avoided to the extent possible (for performance and interoperability with diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index ce68966995a8b..e71dd540a45c0 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -31,6 +31,10 @@ for many reasons: 3. When reading code, the contents of an ``object`` dtype array is less clear than ``string``. +Currently, the performance of ``object`` dtype arrays of strings and +:class:`arrays.StringArray` are about the same. We expect future enhancements +to significantly increase the performance and lower the memory overhead of +:class:`arrays.StringArray`. .. warning:: From 0524f7ebf356f8b591f334d38741f075b4f63a90 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 10:16:09 -0500 Subject: [PATCH 26/49] update ctor --- doc/source/user_guide/text.rst | 2 +- pandas/core/strings.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index e71dd540a45c0..f757f3894c716 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -25,7 +25,7 @@ for many reasons: 1. You can accidentally store a *mixture* of strings and non-strings in an ``object`` dtype array. It's better to have a dedicated dtype. -2. 
``object`` dtype breaks dtype-specific operations like ``select_dtypes``. +2. ``object`` dtype breaks dtype-specific operations like :meth:`DataFrame.select_dtypes`. There isn't a clear way to select *just* text while excluding non-text but still object-dtype columns. 3. When reading code, the contents of an ``object`` dtype array is less clear diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 64953878d978f..e1bb4b7a500c1 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -817,15 +817,16 @@ def _str_extract_frame(arr, pat, flags=0): result_index = arr.index except AttributeError: result_index = None - result = DataFrame( + if arr.dtype.name == "string": + dtype = "string" + else: + dtype = object + return DataFrame( [groups_or_na(val) for val in arr], columns=columns, index=result_index, - dtype=object, + dtype=dtype, ) - if arr.dtype.name == "string": - result = result.astype("string") - return result def str_extract(arr, pat, flags=0, expand=True): From 292a8f34c10fa47542ea4718111e2ba1fd04ddc9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 10:24:21 -0500 Subject: [PATCH 27/49] clean up wrapping --- pandas/core/strings.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index e1bb4b7a500c1..f807b26ab01e9 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -916,7 +916,7 @@ def str_extract(arr, pat, flags=0, expand=True): return _str_extract_frame(arr._orig, pat, flags=flags) else: result, name = _str_extract_noexpand(arr._parent, pat, flags=flags) - return arr._wrap_result(result, name=name, expand=expand, returns_string=True) + return arr._wrap_result(result, name=name, expand=expand) def str_extractall(arr, pat, flags=0): @@ -2052,6 +2052,15 @@ def _wrap_result( return result assert result.ndim < 3 + # We can be wrapping a string / object / categorical result, in which + # case we'll want to return the same 
dtype as the input. + # Or we can be wrapping a numeric output, in which case we don't want + # to return a StringArray. + if self._is_string and returns_string: + dtype = "string" + else: + dtype = None + if expand is None: # infer from ndim if expand is not specified expand = result.ndim != 1 @@ -2109,13 +2118,11 @@ def cons_row(x): index = self._orig.index if expand: cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index) + result = cons(result, columns=name, index=index, dtype=dtype) else: # Must be a Series cons = self._orig._constructor - result = cons(result, name=name, index=index) - if self._is_string and returns_string: - result = result.astype("string") + result = cons(result, name=name, index=index, dtype=dtype) return result def _get_series_list(self, others): From 2c88e3b26fee576b62c3a83c3cc2c2358333834c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 10:30:27 -0500 Subject: [PATCH 28/49] clarify --- pandas/core/strings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index f807b26ab01e9..2fb09182cc6cf 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1025,9 +1025,9 @@ def str_extractall(arr, pat, flags=0): index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) # workaround #27953 - from pandas import StringDtype - - if isinstance(arr.dtype, StringDtype): + # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails + # when the list of values is empty. 
+ if arr.dtype.name == "string": dtype = arr.dtype else: dtype = None From 1b8c83afd49a9d168a750241f29fda2c9a2a982d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 10:48:34 -0500 Subject: [PATCH 29/49] reduce sum --- pandas/core/arrays/string_.py | 9 +++++++++ pandas/tests/arrays/string_/test_string.py | 17 +++++++++++++++++ pandas/tests/extension/test_string.py | 2 +- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7a57caed8585b..9d1549e6c5299 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -216,6 +216,15 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy) def _reduce(self, name, skipna=True, **kwargs): + if name == "sum": + vals = self._ndarray + missing = self.isna() + if skipna: + vals = vals[~missing] + elif missing.any(): + return np.nan + return vals.sum() + raise TypeError("Cannot perform reduction '{}' with string dtype".format(name)) def value_counts(self, dropna=False): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3fa4a08023ebe..854fe0b4fc0cf 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -128,3 +128,20 @@ def test_constructor_raises(): with pytest.raises(ValueError, match="object-dtype ndarray"): pd.arrays.StringArray(np.array([])) + + +@pytest.mark.parametrize("skipna", [True, False]) +def test_reduce(skipna): + arr = pd.Series(["a", "b", "c"], dtype="string") + result = arr.sum(skipna=skipna) + assert result == "abc" + + +@pytest.mark.parametrize("skipna", [True, False]) +def test_reduce_missing(skipna): + arr = pd.Series([None, "a", None, "b", "c", None], dtype="string") + result = arr.sum(skipna=skipna) + if skipna: + assert result == "abc" + else: + assert pd.isna(result) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 
cba10b1b7f88b..d2596e75d5583 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -76,7 +76,7 @@ class TestMissing(base.BaseMissingTests): pass -class TestReduce(base.BaseNoReduceTests): +class TestNoReduce(base.BaseNoReduceTests): pass From f1dad2a0972f57c79f9e2b1fccb21f4d2be8eaaa Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 11:42:00 -0500 Subject: [PATCH 30/49] skip reduce sum --- pandas/tests/extension/test_string.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index d2596e75d5583..ae1008dce81d1 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -77,7 +77,14 @@ class TestMissing(base.BaseMissingTests): class TestNoReduce(base.BaseNoReduceTests): - pass + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + if all_numeric_reductions == "sum": + pytest.skip("StringArray implements sum") + else: + return super().test_reduce_series_numeric( + data, all_numeric_reductions, skipna + ) class TestMethods(base.BaseMethodsTests): From be95ecb23c6b40963043172de6151b71a0b772bd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 16:44:22 -0500 Subject: [PATCH 31/49] rename --- doc/source/getting_started/basics.rst | 10 ++-- doc/source/reference/arrays.rst | 10 ++-- doc/source/user_guide/text.rst | 70 +++++++++++----------- doc/source/whatsnew/v1.0.0.rst | 4 +- pandas/__init__.py | 2 +- pandas/arrays/__init__.py | 4 +- pandas/core/api.py | 2 +- pandas/core/arrays/__init__.py | 2 +- pandas/core/arrays/string_.py | 56 ++++++++--------- pandas/core/strings.py | 18 +++--- pandas/tests/api/test_api.py | 2 +- pandas/tests/arrays/string_/test_string.py | 50 ++++++++-------- pandas/tests/extension/test_string.py | 16 ++--- pandas/tests/test_strings.py | 11 ++-- 14 files 
changed, 130 insertions(+), 127 deletions(-) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 7f2a60b51cae2..37ced806df406 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1705,7 +1705,7 @@ built-in string methods. For example: .. ipython:: python s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + dtype="text") s.str.lower() Powerful pattern-matching methods are provided as well, but note that @@ -1716,7 +1716,7 @@ always uses them). .. note:: Prior to pandas 1.0, string methods were only available on ``object`` -dtype - ``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated + ``Series``. Pandas 1.0 added the :class:`TextDtype` which is dedicated to strings. See :ref:`text.types` for more. Please see :ref:`Vectorized String Methods ` for a complete @@ -1932,15 +1932,15 @@ period (time spans) :class:`PeriodDtype` :class:`Period` :class:`arrays. sparse :class:`SparseDtype` (none) :class:`arrays.SparseArray` :ref:`sparse` intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` nullable integer :class:`Int64Dtype`, ... (none) :class:`arrays.IntegerArray` :ref:`integer_na` -Text :class:`StringDtype` :class:`str` :class:`arrays.StringArray` :ref:`text` +Text :class:`TextDtype` :class:`str` :class:`arrays.TextArray` :ref:`text` =================== ========================= ================== ============================= ============================= Pandas has two ways to store strings. 1. ``object`` dtype, which can hold any Python object, including strings. -2. :class:`arrays.StringArray`, which is dedicated to strings. +2. :class:`arrays.TextArray`, which is dedicated to strings. -Generally, we recommend using :class:`arrays.StringArray`. See :ref:`text.types` fore more. +Generally, we recommend using :class:`arrays.TextArray`. 
See :ref:`text.types` fore more. Finally, arbitrary objects may be stored using the ``object`` dtype, but should be avoided to the extent possible (for performance and interoperability with diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index db620e73301cb..8f8e1e1c1ea2e 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -24,7 +24,7 @@ Intervals :class:`IntervalDtype` :class:`Interval` :ref:`api.array Nullable Integer :class:`Int64Dtype`, ... (none) :ref:`api.arrays.integer_na` Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical` Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse` -Text :class:`StringDtype` :class:`str` :ref:`api.arrays.string` +Text :class:`TextDtype` :class:`str` :ref:`api.arrays.string` =================== ========================= ================== ============================= Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). @@ -467,21 +467,21 @@ Text data --------- When working with text data, where each valid element is a string, we recommend using -:class:`StringDtype` (with the alias ``"string"``). +:class:`TextDtype` (with the alias ``"string"``). .. autosummary:: :toctree: api/ :template: autosummary/class_without_autosummary.rst - arrays.StringArray + arrays.TextArray .. autosummary:: :toctree: api/ :template: autosummary/class_without_autosummary.rst - StringDtype + TextDtype -The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.StringArray`. +The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.TextArray`. See :ref:`api.series.str` for more. diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index f757f3894c716..646dac579fe54 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -16,9 +16,9 @@ Text Data Types There are two main ways to store text data 1. 
``object`` -dtype NumPy array. -2. As an :class:`arrays.StringArray` extension type. +2. As an :class:`arrays.TextArray` extension type. -We recommend using :class:`arrays.StringArray` to store text data. +We recommend using :class:`arrays.TextArray` to store text data. Prior to pandas 1.0, ``object`` dtype was the only option. This was unfortunate for many reasons: @@ -32,13 +32,13 @@ for many reasons: than ``string``. Currently, the performance of ``object`` dtype arrays of strings and -:class:`arrays.StringArray` are about the same. We expect future enhancements +:class:`arrays.TextArray` are about the same. We expect future enhancements to significantly increase the performance and lower the memory overhead of -:class:`arrays.StringArray`. +:class:`arrays.TextArray`. .. warning:: - StringArray is currently considered experimental. + TextArray is currently considered experimental. For backwards-compatibility, ``object`` dtype remains the default type we infer a list of strings to @@ -47,12 +47,12 @@ infer a list of strings to pd.Series(['a', 'b', 'c']) -To explicitly request ``string`` dtype, specify the ``dtype`` +To explicitly request ``text`` dtype, specify the ``dtype`` .. ipython:: python - pd.Series(['a', 'b', 'c'], dtype="string") - pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype()) + pd.Series(['a', 'b', 'c'], dtype="text") + pd.Series(['a', 'b', 'c'], dtype=pd.TextDtype()) Or ``astype`` after the ``Series`` or ``DataFrame`` is created @@ -60,7 +60,7 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created s = pd.Series(['a', 'b', 'c']) s - s.astype("string") + s.astype("text") Everything that follows in the rest of this document applies equally to ``string`` and ``object`` dtype. @@ -79,7 +79,7 @@ the equivalent (scalar) built-in string methods: .. 
ipython:: python s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + dtype="text") s.str.lower() s.str.upper() s.str.len() @@ -153,7 +153,7 @@ Methods like ``split`` return a Series of lists: .. ipython:: python - s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string") + s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="text") s2.str.split('_') Elements in the split lists can be accessed using ``get`` or ``[]`` notation: @@ -169,8 +169,8 @@ It is easy to expand this to return a DataFrame using ``expand``. s2.str.split('_', expand=True) -When original ``Series`` has :class:`StringDtype`, the output columns will all -be :class:`StringDtype` as well. +When original ``Series`` has :class:`TextDtype`, the output columns will all +be :class:`TextDtype` as well. It is also possible to limit the number of splits: @@ -192,7 +192,7 @@ i.e., from the end of the string to the beginning of the string: s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', '', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + dtype="text") s3 s3.str.replace('^.a|dog', 'XX-XX ', case=False) @@ -203,7 +203,7 @@ following code will cause trouble because of the regular expression meaning of .. ipython:: python # Consider the following badly formatted financial data - dollars = pd.Series(['12', '-$10', '$10,000'], dtype="string") + dollars = pd.Series(['12', '-$10', '$10,000'], dtype="text") # This does what you'd naively expect: dollars.str.replace('$', '') @@ -242,7 +242,7 @@ positional argument (a regex object) and return a string. return m.group(0)[::-1] pd.Series(['foo 123', 'bar baz', np.nan], - dtype="string").str.replace(pat, repl) + dtype="text").str.replace(pat, repl) # Using regex groups pat = r"(?P\w+) (?P\w+) (?P\w+)" @@ -251,7 +251,7 @@ positional argument (a regex object) and return a string. 
return m.group('two').swapcase() pd.Series(['Foo Bar Baz', np.nan], - dtype="string").str.replace(pat, repl) + dtype="text").str.replace(pat, repl) .. versionadded:: 0.20.0 @@ -290,7 +290,7 @@ The content of a ``Series`` (or ``Index``) can be concatenated: .. ipython:: python - s = pd.Series(['a', 'b', 'c', 'd'], dtype="string") + s = pd.Series(['a', 'b', 'c', 'd'], dtype="text") s.str.cat(sep=',') If not specified, the keyword ``sep`` for the separator defaults to the empty string, ``sep=''``: @@ -303,7 +303,7 @@ By default, missing values are ignored. Using ``na_rep``, they can be given a re .. ipython:: python - t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string") + t = pd.Series(['a', 'b', np.nan, 'd'], dtype="text") t.str.cat(sep=',') t.str.cat(sep=',', na_rep='-') @@ -349,7 +349,7 @@ the ``join``-keyword. :okwarning: u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2], - dtype="string") + dtype="text") s u s.str.cat(u) @@ -366,7 +366,7 @@ In particular, alignment also means that the different lengths do not need to co .. ipython:: python v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4], - dtype="string") + dtype="text") s v s.str.cat(v, join='left', na_rep='-') @@ -423,7 +423,7 @@ of the string, the result will be a ``NaN``. s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + dtype="text") s.str[0] s.str[1] @@ -455,7 +455,7 @@ DataFrame with one column per group. .. ipython:: python pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'([ab])(\d)', expand=False) + dtype="text").str.extract(r'([ab])(\d)', expand=False) Elements that do not match return a row filled with ``NaN``. Thus, a Series of messy strings can be "converted" into a like-indexed Series @@ -469,7 +469,7 @@ Named groups like .. 
ipython:: python pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'(?P[ab])(?P\d)', + dtype="text").str.extract(r'(?P[ab])(?P\d)', expand=False) and optional groups like @@ -477,7 +477,7 @@ and optional groups like .. ipython:: python pd.Series(['a1', 'b2', '3'], - dtype="string").str.extract(r'([ab])?(\d)', expand=False) + dtype="text").str.extract(r'([ab])?(\d)', expand=False) can also be used. Note that any capture group names in the regular expression will be used for column names; otherwise capture group @@ -489,14 +489,14 @@ with one column if ``expand=True``. .. ipython:: python pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'[ab](\d)', expand=True) + dtype="text").str.extract(r'[ab](\d)', expand=True) It returns a Series if ``expand=False``. .. ipython:: python pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'[ab](\d)', expand=False) + dtype="text").str.extract(r'[ab](\d)', expand=False) Calling on an ``Index`` with a regex with exactly one capture group returns a ``DataFrame`` with one column if ``expand=True``. @@ -504,7 +504,7 @@ returns a ``DataFrame`` with one column if ``expand=True``. .. ipython:: python s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"], - dtype="string") + dtype="text") s s.index.str.extract("(?P[a-zA-Z])", expand=True) @@ -550,7 +550,7 @@ Unlike ``extract`` (which returns only the first match), .. ipython:: python s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], - dtype="string") + dtype="text") s two_groups = '(?P[a-z])(?P[0-9])' s.str.extract(two_groups, expand=True) @@ -568,7 +568,7 @@ When each subject string in the Series has exactly one match, .. ipython:: python - s = pd.Series(['a3', 'b3', 'c2'], dtype="string") + s = pd.Series(['a3', 'b3', 'c2'], dtype="text") s then ``extractall(pat).xs(0, level='match')`` gives the same result as @@ -589,7 +589,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0). 
pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups) - pd.Series(["a1a2", "b1", "c1"], dtype="string").str.extractall(two_groups) + pd.Series(["a1a2", "b1", "c1"], dtype="text").str.extractall(two_groups) Testing for Strings that match or contain a pattern @@ -601,14 +601,14 @@ You can check whether elements contain a pattern: pattern = r'[0-9][a-z]' pd.Series(['1', '2', '3a', '3b', '03c'], - dtype="string").str.contains(pattern) + dtype="text").str.contains(pattern) Or whether elements match a pattern: .. ipython:: python pd.Series(['1', '2', '3a', '3b', '03c'], - dtype="string").str.match(pattern) + dtype="text").str.match(pattern) The distinction between ``match`` and ``contains`` is strictness: ``match`` relies on strict ``re.match``, while ``contains`` relies on ``re.search``. @@ -619,7 +619,7 @@ an extra ``na`` argument so missing values can be considered True or False: .. ipython:: python s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + dtype="text") s4.str.contains('A', na=False) .. _text.indicator: @@ -632,7 +632,7 @@ For example if they are separated by a ``'|'``: .. ipython:: python - s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="string") + s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="text") s.str.get_dummies(sep='|') String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``. diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 925ef1d16ea2f..3f26e661660a8 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -55,12 +55,12 @@ Enhancements Dedicated string data type ^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added :class:`StringDtype`, an extension type dedicated to string data. +We've added :class:`TextDtype`, an extension type dedicated to string data. Previously, strings were typically stored in object-dtype NumPy arrays. .. 
ipython:: python - pd.Series(['abc', None, 'def'], dtype=pd.StringDtype()) + pd.Series(['abc', None, 'def'], dtype=pd.TextDtype()) You can use the alias ``'string'`` as well. diff --git a/pandas/__init__.py b/pandas/__init__.py index 2cd891c696203..f987f1e2f5273 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -66,7 +66,7 @@ PeriodDtype, IntervalDtype, DatetimeTZDtype, - StringDtype, + TextDtype, # missing isna, isnull, diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 9870b5bed076d..24cb6a9c5acf7 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -11,7 +11,7 @@ PandasArray, PeriodArray, SparseArray, - StringArray, + TextArray, TimedeltaArray, ) @@ -23,6 +23,6 @@ "PandasArray", "PeriodArray", "SparseArray", - "StringArray", + "TextArray", "TimedeltaArray", ] diff --git a/pandas/core/api.py b/pandas/core/api.py index 04f2f84c92a15..e17602b8943f5 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -23,7 +23,7 @@ UInt32Dtype, UInt64Dtype, ) -from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_ import TextDtype from pandas.core.construction import array from pandas.core.groupby import Grouper, NamedAgg from pandas.core.index import ( diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 868118bac6a7b..a7989cd154d04 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -10,5 +10,5 @@ from .numpy_ import PandasArray, PandasDtype # noqa: F401 from .period import PeriodArray, period_array # noqa: F401 from .sparse import SparseArray # noqa: F401 -from .string_ import StringArray # noqa: F401 +from .string_ import TextArray # noqa: F401 from .timedeltas import TimedeltaArray # noqa: F401 diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 9d1549e6c5299..6de2fee0050ca 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -22,7 +22,7 @@ @register_extension_dtype 
-class StringDtype(ExtensionDtype): +class TextDtype(ExtensionDtype): """ Extension dtype for text data. @@ -30,10 +30,10 @@ class StringDtype(ExtensionDtype): .. warning:: - StringDtype is considered experimental. The implementation and + TextDtype is considered experimental. The implementation and parts of the API may change without warning. - In particular, StringDtype.na_value may change to no longer be + In particular, TextDtype.na_value may change to no longer be ``numpy.nan``. Attributes @@ -46,14 +46,14 @@ class StringDtype(ExtensionDtype): Examples -------- - >>> pd.StringDtype() - StringDtype + >>> pd.TextDtype() + TextDtype """ @property def na_value(self) -> "Scalar": """ - StringDtype uses :attr:`numpy.nan` as the missing NA value. + TextDtype uses :attr:`numpy.nan` as the missing NA value. .. warning:: @@ -68,25 +68,25 @@ def type(self) -> Type: @property def name(self) -> str: """ - The alias for StringDtype is ``'string'``. + The alias for TextDtype is ``'text'``. """ - return "string" + return "text" @classmethod def construct_from_string(cls, string: str) -> ExtensionDtype: - if string == "string": + if string == "text": return cls() return super().construct_from_string(string) @classmethod - def construct_array_type(cls) -> "Type[StringArray]": - return StringArray + def construct_array_type(cls) -> "Type[TextArray]": + return TextArray def __repr__(self) -> str: - return "StringDtype" + return "TextDtype" -class StringArray(PandasArray): +class TextArray(PandasArray): """ Extension array for text data. @@ -94,7 +94,7 @@ class StringArray(PandasArray): .. warning:: - StringArray is considered experimental. The implementation and + TextArray is considered experimental. The implementation and parts of the API may change without warning. In particular, the NA value used may change to no longer be @@ -125,19 +125,19 @@ class StringArray(PandasArray): -------- Series.str The string methods are available on Series backed by - a StringArray. 
+ a TextArray. Examples -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") - + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="text") + ['This is', 'some text', nan, 'data.'] Length: 4, dtype: string - Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string + Unlike ``object`` dtype arrays, ``TextArray`` doesn't allow non-string values. - >>> pd.array(['1', 1], dtype="string") + >>> pd.array(['1', 1], dtype="text") Traceback (most recent call last): ... ValueError: Must provide strings @@ -148,23 +148,23 @@ class StringArray(PandasArray): def __init__(self, values, copy=False): super().__init__(values, copy=copy) - self._dtype = StringDtype() + self._dtype = TextDtype() self._validate() def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires an object-dtype ndarray of strings.") + raise ValueError("TextArray requires an object-dtype ndarray of strings.") if self._ndarray.dtype != "object": raise ValueError( - "StringArray requires an object-dtype ndarray. Got " + "TextArray requires an object-dtype ndarray. 
Got " "'{}' instead.".format(self._ndarray.dtype) ) @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: - assert dtype == "string" + assert dtype == "text" result = super()._from_sequence(scalars, dtype=object, copy=copy) # convert None to np.nan # TODO: it would be nice to do this in _validate / lib.is_string_array @@ -193,7 +193,7 @@ def __setitem__(self, key, value): value = np.nan elif not (isinstance(value, str) or np.isnan(value)): raise ValueError( - "Cannot set value '{}' into a StringArray.".format(value) + "Cannot set value '{}' into a TextArray.".format(value) ) else: if not is_array_like(value): @@ -209,7 +209,7 @@ def fillna(self, value=None, method=None, limit=None): def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) - if isinstance(dtype, StringDtype): + if isinstance(dtype, TextDtype): if copy: return self.copy() return self @@ -254,7 +254,7 @@ def method(self, other): result[valid] = op(self._ndarray[valid], other) if op.__name__ in {"add", "radd", "mul", "rmul"}: - new = StringArray + new = TextArray elif mask.any(): new = lambda x: np.asarray(x, dtype="object") else: @@ -275,5 +275,5 @@ def _add_arithmetic_ops(cls): _create_comparison_method = _create_arithmetic_method -StringArray._add_arithmetic_ops() -StringArray._add_comparison_ops() +TextArray._add_arithmetic_ops() +TextArray._add_comparison_ops() diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2fb09182cc6cf..2ee4438024c32 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -817,8 +817,8 @@ def _str_extract_frame(arr, pat, flags=0): result_index = arr.index except AttributeError: result_index = None - if arr.dtype.name == "string": - dtype = "string" + if arr.dtype.name == "text": + dtype = "text" else: dtype = object return DataFrame( @@ -1027,7 +1027,7 @@ def str_extractall(arr, pat, flags=0): # workaround #27953 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is 
empty. - if arr.dtype.name == "string": + if arr.dtype.name == "text": dtype = arr.dtype else: dtype = None @@ -1953,7 +1953,7 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data) - self._is_string = data.dtype.name == "string" + self._is_string = data.dtype.name == "text" # .values.categories works for both Series/Index self._parent = data.values.categories if self._is_categorical else data @@ -1984,7 +1984,7 @@ def _validate(data): ------- dtype : inferred dtype of data """ - from pandas import StringDtype + from pandas import TextDtype if isinstance(data, ABCMultiIndex): raise AttributeError( @@ -1997,8 +1997,8 @@ def _validate(data): values = getattr(data, "values", data) # Series / Index values = getattr(values, "categories", values) # categorical / normal - # explicitly allow StringDtype - if isinstance(values.dtype, StringDtype): + # explicitly allow TextDtype + if isinstance(values.dtype, TextDtype): return "string" try: @@ -2055,9 +2055,9 @@ def _wrap_result( # We can be wrapping a string / object / categorical result, in which # case we'll want to return the same dtype as the input. # Or we can be wrapping a numeric output, in which case we don't want - # to return a StringArray. + # to return a TextArray. 
if self._is_string and returns_string: - dtype = "string" + dtype = "text" else: dtype = None diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 6c50159663574..056a23ab81acf 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -68,7 +68,7 @@ class TestPDApi(Base): "Series", "SparseArray", "SparseDtype", - "StringDtype", + "TextDtype", "Timedelta", "TimedeltaIndex", "Timestamp", diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 854fe0b4fc0cf..8ef1533a73e19 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -8,13 +8,13 @@ def test_none_to_nan(): - a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) + a = pd.arrays.TextArray._from_sequence(["a", None, "b"]) assert a[1] is not None assert np.isnan(a[1]) def test_setitem_validates(): - a = pd.arrays.StringArray._from_sequence(["a", "b"]) + a = pd.arrays.TextArray._from_sequence(["a", "b"]) with pytest.raises(ValueError, match="10"): a[0] = 10 @@ -31,12 +31,12 @@ def test_setitem_validates(): ], ) def test_string_methods(input, method): - a = pd.Series(input, dtype="string") + a = pd.Series(input, dtype="text") b = pd.Series(input, dtype="object") result = method(a.str) expected = method(b.str) - assert result.dtype.name == "string" + assert result.dtype.name == "text" tm.assert_series_equal(result.astype(object), expected) @@ -44,47 +44,47 @@ def test_astype_roundtrip(): s = pd.Series(pd.date_range("2000", periods=12)) s[0] = None - result = s.astype("string").astype("datetime64[ns]") + result = s.astype("text").astype("datetime64[ns]") tm.assert_series_equal(result, s) def test_add(): - a = pd.Series(["a", "b", "c", None, None], dtype="string") - b = pd.Series(["x", "y", None, "z", None], dtype="string") + a = pd.Series(["a", "b", "c", None, None], dtype="text") + b = pd.Series(["x", "y", None, "z", None], dtype="text") result = a + b - expected 
= pd.Series(["ax", "by", None, None, None], dtype="string") + expected = pd.Series(["ax", "by", None, None, None], dtype="text") tm.assert_series_equal(result, expected) result = a.add(b) tm.assert_series_equal(result, expected) result = a.radd(b) - expected = pd.Series(["xa", "yb", None, None, None], dtype="string") + expected = pd.Series(["xa", "yb", None, None, None], dtype="text") tm.assert_series_equal(result, expected) result = a.add(b, fill_value="-") - expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string") + expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="text") tm.assert_series_equal(result, expected) def test_add_sequence(): - a = pd.array(["a", "b", None, None], dtype="string") + a = pd.array(["a", "b", None, None], dtype="text") other = ["x", None, "y", None] result = a + other - expected = pd.array(["ax", None, None, None], dtype="string") + expected = pd.array(["ax", None, None, None], dtype="text") tm.assert_extension_array_equal(result, expected) result = other + a - expected = pd.array(["xa", None, None, None], dtype="string") + expected = pd.array(["xa", None, None, None], dtype="text") tm.assert_extension_array_equal(result, expected) def test_mul(): - a = pd.array(["a", "b", None], dtype="string") + a = pd.array(["a", "b", None], dtype="text") result = a * 2 - expected = pd.array(["aa", "bb", None], dtype="string") + expected = pd.array(["aa", "bb", None], dtype="text") tm.assert_extension_array_equal(result, expected) result = 2 * a @@ -93,53 +93,53 @@ def test_mul(): @pytest.mark.xfail(reason="GH-28527") def test_add_strings(): - array = pd.array(["a", "b", "c", "d"], dtype="string") + array = pd.array(["a", "b", "c", "d"], dtype="text") df = pd.DataFrame([["t", "u", "v", "w"]]) assert array.__add__(df) is NotImplemented result = array + df - expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("string") + expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("text") tm.assert_frame_equal(result, expected) 
result = df + array - expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("string") + expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("text") tm.assert_frame_equal(result, expected) @pytest.mark.xfail(reason="GH-28527") def test_add_frame(): - array = pd.array(["a", "b", np.nan, np.nan], dtype="string") + array = pd.array(["a", "b", np.nan, np.nan], dtype="text") df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert array.__add__(df) is NotImplemented result = array + df - expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("string") + expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("text") tm.assert_frame_equal(result, expected) result = df + array - expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("string") + expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("text") tm.assert_frame_equal(result, expected) def test_constructor_raises(): with pytest.raises(ValueError, match="object-dtype ndarray"): - pd.arrays.StringArray(np.array(["a", "b"], dtype="S1")) + pd.arrays.TextArray(np.array(["a", "b"], dtype="S1")) with pytest.raises(ValueError, match="object-dtype ndarray"): - pd.arrays.StringArray(np.array([])) + pd.arrays.TextArray(np.array([])) @pytest.mark.parametrize("skipna", [True, False]) def test_reduce(skipna): - arr = pd.Series(["a", "b", "c"], dtype="string") + arr = pd.Series(["a", "b", "c"], dtype="text") result = arr.sum(skipna=skipna) assert result == "abc" @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_missing(skipna): - arr = pd.Series([None, "a", None, "b", "c", None], dtype="string") + arr = pd.Series([None, "a", None, "b", "c", None], dtype="text") result = arr.sum(skipna=skipna) if skipna: assert result == "abc" diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index ae1008dce81d1..3489722b4538e 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -4,13 +4,13 @@ 
import pytest import pandas as pd -from pandas.core.arrays.string_ import StringArray, StringDtype +from pandas.core.arrays.string_ import TextArray, TextDtype from pandas.tests.extension import base @pytest.fixture def dtype(): - return StringDtype() + return TextDtype() @pytest.fixture @@ -19,23 +19,23 @@ def data(): while strings[0] == strings[1]: strings = np.random.choice(list(string.ascii_letters), size=100) - return StringArray._from_sequence(strings) + return TextArray._from_sequence(strings) @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return StringArray._from_sequence([np.nan, "A"]) + return TextArray._from_sequence([np.nan, "A"]) @pytest.fixture def data_for_sorting(): - return StringArray._from_sequence(["B", "C", "A"]) + return TextArray._from_sequence(["B", "C", "A"]) @pytest.fixture def data_missing_for_sorting(): - return StringArray._from_sequence(["B", np.nan, "A"]) + return TextArray._from_sequence(["B", np.nan, "A"]) @pytest.fixture @@ -45,7 +45,7 @@ def na_value(): @pytest.fixture def data_for_grouping(): - return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"]) + return TextArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"]) class TestDtype(base.BaseDtypeTests): @@ -80,7 +80,7 @@ class TestNoReduce(base.BaseNoReduceTests): @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): if all_numeric_reductions == "sum": - pytest.skip("StringArray implements sum") + pytest.skip("TextArray implements sum") else: return super().test_reduce_series_numeric( data, all_numeric_reductions, skipna diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index b50f1a0fd2f2a..d0e18bd53b1b8 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -141,6 +141,7 @@ def any_string_method(request): # subset of the full set from pandas/conftest.py _any_allowed_skipna_inferred_dtype = [ 
("string", ["a", np.nan, "c"]), + ("text", ["a", np.nan, "c"]), ("bytes", [b"a", np.nan, b"c"]), ("empty", [np.nan, np.nan, np.nan]), ("empty", []), @@ -156,6 +157,7 @@ def any_allowed_skipna_inferred_dtype(request): The covered (inferred) types are: * 'string' + * 'text' * 'empty' * 'bytes' * 'mixed' @@ -221,6 +223,7 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): types_passing_constructor = [ "string", + "text", "unicode", "empty", "bytes", @@ -283,7 +286,7 @@ def test_api_per_method( mixed_allowed = method_name not in ["cat"] allowed_types = ( - ["string", "unicode", "empty"] + ["string", "unicode", "empty", "text"] + ["bytes"] * bytes_allowed + ["mixed", "mixed-integer"] * mixed_allowed ) @@ -3276,7 +3279,7 @@ def test_casefold(self): def test_string_array(any_string_method): data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) - b = Series(data, dtype="string") + b = Series(data, dtype="text") method_name, args, kwargs = any_string_method expected = getattr(a.str, method_name)(*args, **kwargs) @@ -3286,10 +3289,10 @@ def test_string_array(any_string_method): if expected.dtype == "object" and lib.is_string_array( expected.values, skipna=True ): - assert result.dtype == "string" + assert result.dtype == "text" result = result.astype(object) elif isinstance(expected, DataFrame): columns = expected.select_dtypes(include="object").columns - assert all(result[columns].dtypes == "string") + assert all(result[columns].dtypes == "text") result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) From 903ea2f8bf02acd515f7b0b729047f10f12709ff Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 16:47:16 -0500 Subject: [PATCH 32/49] move --- pandas/core/api.py | 2 +- pandas/core/arrays/__init__.py | 2 +- pandas/core/arrays/{string_.py => text.py} | 0 pandas/tests/arrays/text/__init__.py | 0 .../tests/arrays/{string_/test_string.py => text/test_text.py} | 0 pandas/tests/extension/{test_string.py => 
test_text.py} | 2 +- 6 files changed, 3 insertions(+), 3 deletions(-) rename pandas/core/arrays/{string_.py => text.py} (100%) create mode 100644 pandas/tests/arrays/text/__init__.py rename pandas/tests/arrays/{string_/test_string.py => text/test_text.py} (100%) rename pandas/tests/extension/{test_string.py => test_text.py} (97%) diff --git a/pandas/core/api.py b/pandas/core/api.py index e17602b8943f5..83ecf675d0389 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -23,7 +23,7 @@ UInt32Dtype, UInt64Dtype, ) -from pandas.core.arrays.string_ import TextDtype +from pandas.core.arrays.text import TextDtype from pandas.core.construction import array from pandas.core.groupby import Grouper, NamedAgg from pandas.core.index import ( diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index a7989cd154d04..0df154dc07322 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -10,5 +10,5 @@ from .numpy_ import PandasArray, PandasDtype # noqa: F401 from .period import PeriodArray, period_array # noqa: F401 from .sparse import SparseArray # noqa: F401 -from .string_ import TextArray # noqa: F401 +from .text import TextArray # noqa: F401 from .timedeltas import TimedeltaArray # noqa: F401 diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/text.py similarity index 100% rename from pandas/core/arrays/string_.py rename to pandas/core/arrays/text.py diff --git a/pandas/tests/arrays/text/__init__.py b/pandas/tests/arrays/text/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/text/test_text.py similarity index 100% rename from pandas/tests/arrays/string_/test_string.py rename to pandas/tests/arrays/text/test_text.py diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_text.py similarity index 97% rename from pandas/tests/extension/test_string.py rename to 
pandas/tests/extension/test_text.py index 3489722b4538e..24337b86d5e3a 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_text.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas.core.arrays.string_ import TextArray, TextDtype +from pandas.core.arrays.text import TextArray, TextDtype from pandas.tests.extension import base From 0e1f479bdda2b7f390084f8398345c6b408057b6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 16:51:56 -0500 Subject: [PATCH 33/49] missed --- doc/source/reference/arrays.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 8 ++++---- pandas/core/strings.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 8f8e1e1c1ea2e..5f3be7830d027 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -467,7 +467,7 @@ Text data --------- When working with text data, where each valid element is a string, we recommend using -:class:`TextDtype` (with the alias ``"string"``). +:class:`TextDtype` (with the alias ``"text"``). .. autosummary:: :toctree: api/ diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3f26e661660a8..0abfee0d1e904 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -52,8 +52,8 @@ Enhancements .. _whatsnew_1000.string: -Dedicated string data type -^^^^^^^^^^^^^^^^^^^^^^^^^^ +Dedicated text data type +^^^^^^^^^^^^^^^^^^^^^^^^ We've added :class:`TextDtype`, an extension type dedicated to string data. Previously, strings were typically stored in object-dtype NumPy arrays. @@ -62,11 +62,11 @@ Previously, strings were typically stored in object-dtype NumPy arrays. pd.Series(['abc', None, 'def'], dtype=pd.TextDtype()) -You can use the alias ``'string'`` as well. +You can use the alias ``"text"`` as well. .. 
ipython:: python - s = pd.Series(['abc', None, 'def'], dtype="string") + s = pd.Series(['abc', None, 'def'], dtype="text") s The usual string accessor methods work. Where appropriate, the return type diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2ee4438024c32..f50ef530bde86 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1999,7 +1999,7 @@ def _validate(data): # explicitly allow TextDtype if isinstance(values.dtype, TextDtype): - return "string" + return "text" try: inferred_dtype = lib.infer_dtype(values, skipna=True) From c168ecf26d91dca519e586e553dfc2ee1fc1d6db Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2019 16:53:16 -0500 Subject: [PATCH 34/49] missed --- doc/source/user_guide/text.rst | 4 ++-- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/strings.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 646dac579fe54..502a40fd15144 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -29,7 +29,7 @@ for many reasons: There isn't a clear way to select *just* text while excluding non-text but still object-dtype columns. 3. When reading code, the contents of an ``object`` dtype array is less clear - than ``string``. + than ``text``. Currently, the performance of ``object`` dtype arrays of strings and :class:`arrays.TextArray` are about the same. We expect future enhancements @@ -63,7 +63,7 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created s.astype("text") Everything that follows in the rest of this document applies equally to -``string`` and ``object`` dtype. +``text`` and ``object`` dtype. .. 
_text.string_methods: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0abfee0d1e904..1e49e913ef52e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -75,7 +75,7 @@ of the Series or columns of a DataFrame will also have string dtype. s.str.upper() s.str.split('b', expand=True).dtypes -We recommend explicitly using the ``string`` data type when working with strings. +We recommend explicitly using the ``text`` data type when working with strings. See :ref:`text.types` for more. .. _whatsnew_1000.enhancements.other: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index f50ef530bde86..2ee4438024c32 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1999,7 +1999,7 @@ def _validate(data): # explicitly allow TextDtype if isinstance(values.dtype, TextDtype): - return "text" + return "string" try: inferred_dtype = lib.infer_dtype(values, skipna=True) From d06ba7348471affffc7dd62953ee6485219892f0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2019 06:50:44 -0500 Subject: [PATCH 35/49] fixup rename --- doc/source/reference/arrays.rst | 2 +- doc/source/user_guide/text.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 5f3be7830d027..d13f8c21c5f8e 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -24,7 +24,7 @@ Intervals :class:`IntervalDtype` :class:`Interval` :ref:`api.array Nullable Integer :class:`Int64Dtype`, ... 
(none) :ref:`api.arrays.integer_na` Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical` Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse` -Text :class:`TextDtype` :class:`str` :ref:`api.arrays.string` +Text :class:`TextDtype` :class:`str` :ref:`api.arrays.string` =================== ========================= ================== ============================= Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 502a40fd15144..e380884604801 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -470,7 +470,7 @@ Named groups like pd.Series(['a1', 'b2', 'c3'], dtype="text").str.extract(r'(?P<letter>[ab])(?P<digit>\d)', - expand=False) + expand=False) and optional groups like From 3ba27c33829e5f042c29d59efd96dea773abc97d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2019 07:58:01 -0500 Subject: [PATCH 36/49] fixup --- pandas/core/arrays/numpy_.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index a979135f1b9d7..2ef4f774d2985 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -10,7 +10,7 @@ from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries -from pandas.core.dtypes.inference import is_array_like, is_list_like +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna from pandas import compat @@ -230,12 +230,11 @@ def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) scalar_key = lib.is_scalar(key) - scalar_value = lib.is_scalar(value) - if not scalar_key and is_list_like(key): + if not scalar_key: key = np.asarray(key) - if not scalar_value: + if not lib.is_scalar(value): value = np.asarray(value, dtype=self._ndarray.dtype)
self._ndarray[key] = value From fe8ee77bcf56db959a406bf763c051887754d1cf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2019 08:02:31 -0500 Subject: [PATCH 37/49] doctest --- ci/code_checks.sh | 4 ++++ pandas/core/arrays/text.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b03c4f2238445..3fc95efad1905 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -262,6 +262,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range" RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Doctests arrays/text.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/arrays/text.py + RET=$(($RET + $?)) ; echo $MSG "DONE" + fi ### DOCSTRINGS ### diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py index 6de2fee0050ca..8ad63dc13e893 100644 --- a/pandas/core/arrays/text.py +++ b/pandas/core/arrays/text.py @@ -132,7 +132,7 @@ class TextArray(PandasArray): >>> pd.array(['This is', 'some text', None, 'data.'], dtype="text") ['This is', 'some text', nan, 'data.'] - Length: 4, dtype: string + Length: 4, dtype: text Unlike ``object`` dtype arrays, ``TextArray`` doesn't allow non-string values. @@ -140,7 +140,7 @@ class TextArray(PandasArray): >>> pd.array(['1', 1], dtype="text") Traceback (most recent call last): ... - ValueError: Must provide strings + ValueError: TextArray requires an object-dtype ndarray of strings. 
""" # undo the PandasArray hack From d9f63aadc20c3bb53112790e7a1227bc90d13d3b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2019 08:50:03 -0500 Subject: [PATCH 38/49] updates --- doc/source/getting_started/basics.rst | 4 ++-- doc/source/reference/arrays.rst | 4 ++-- doc/source/user_guide/text.rst | 9 +++++---- doc/source/whatsnew/v1.0.0.rst | 16 ++++++++++++++++ pandas/core/arrays/text.py | 6 +++--- pandas/tests/extension/test_text.py | 8 ++++++++ 6 files changed, 36 insertions(+), 11 deletions(-) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 37ced806df406..cc1cbeec9b788 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1938,9 +1938,9 @@ Text :class:`TextDtype` :class:`str` :class:`arrays. Pandas has two ways to store strings. 1. ``object`` dtype, which can hold any Python object, including strings. -2. :class:`arrays.TextArray`, which is dedicated to strings. +2. :class:`TextDtype`, which is dedicated to strings. -Generally, we recommend using :class:`arrays.TextArray`. See :ref:`text.types` fore more. +Generally, we recommend using :class:`TextDtype`. See :ref:`text.types` fore more. Finally, arbitrary objects may be stored using the ``object`` dtype, but should be avoided to the extent possible (for performance and interoperability with diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index d13f8c21c5f8e..81d6cb2e54c2c 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -466,8 +466,8 @@ and methods if the :class:`Series` contains sparse values. See Text data --------- -When working with text data, where each valid element is a string, we recommend using -:class:`TextDtype` (with the alias ``"text"``). +When working with text data, where each valid element is a string or missing, +we recommend using :class:`TextDtype` (with the alias ``"text"``). .. 
autosummary:: :toctree: api/ diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index e380884604801..ca00be77761a7 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -16,9 +16,9 @@ Text Data Types There are two main ways to store text data 1. ``object`` -dtype NumPy array. -2. As an :class:`arrays.TextArray` extension type. +2. :class:`TextDtype` extension type. -We recommend using :class:`arrays.TextArray` to store text data. +We recommend using :class:`TextDtype` to store text data. Prior to pandas 1.0, ``object`` dtype was the only option. This was unfortunate for many reasons: @@ -34,11 +34,12 @@ for many reasons: Currently, the performance of ``object`` dtype arrays of strings and :class:`arrays.TextArray` are about the same. We expect future enhancements to significantly increase the performance and lower the memory overhead of -:class:`arrays.TextArray`. +:class:`~arrays.TextArray`. .. warning:: - TextArray is currently considered experimental. + ``TextArray`` is currently considered experimental. The implementation + and parts of the API may change without warning. For backwards-compatibility, ``object`` dtype remains the default type we infer a list of strings to diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 1e49e913ef52e..1daa3c95a99c3 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -58,6 +58,22 @@ Dedicated text data type We've added :class:`TextDtype`, an extension type dedicated to string data. Previously, strings were typically stored in object-dtype NumPy arrays. +.. warning:: + + ``TextDtype`` and is currently considered experimental. The implementation + and parts of the API may change without warning. + +The text extension type solves several issues with object-dtype NumPy arrays: + +1. You can accidentally store a *mixture* of strings and non-strings in an + ``object`` dtype array. 
A ``TextArray`` can only store strings. +2. ``object`` dtype breaks dtype-specific operations like :meth:`DataFrame.select_dtypes`. + There isn't a clear way to select *just* text while excluding non-text, + but still object-dtype columns. +3. When reading code, the contents of an ``object`` dtype array is less clear + than ``text``. + + .. ipython:: python pd.Series(['abc', None, 'def'], dtype=pd.TextDtype()) diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py index 8ad63dc13e893..b064b5971fdb3 100644 --- a/pandas/core/arrays/text.py +++ b/pandas/core/arrays/text.py @@ -157,8 +157,8 @@ def _validate(self): raise ValueError("TextArray requires an object-dtype ndarray of strings.") if self._ndarray.dtype != "object": raise ValueError( - "TextArray requires an object-dtype ndarray. Got " - "'{}' instead.".format(self._ndarray.dtype) + "TextArray requires a sequence of strings. Got " + "'{}' dtype instead.".format(self._ndarray.dtype) ) @classmethod @@ -193,7 +193,7 @@ def __setitem__(self, key, value): value = np.nan elif not (isinstance(value, str) or np.isnan(value)): raise ValueError( - "Cannot set value '{}' into a TextArray.".format(value) + "Cannot set non-string value '{}' into a TextArray.".format(value) ) else: if not is_array_like(value): diff --git a/pandas/tests/extension/test_text.py b/pandas/tests/extension/test_text.py index 24337b86d5e3a..f0363d7ede470 100644 --- a/pandas/tests/extension/test_text.py +++ b/pandas/tests/extension/test_text.py @@ -109,3 +109,11 @@ def test_compare_scalar(self, data, all_compare_operators): class TestParsing(base.BaseParsingTests): pass + + +class TestPrinting(base.BasePrintingTests): + pass + + +class TestGroupBy(base.BaseGroupbyTests): + pass From d3c49e2394a6695ffbf731535b80705242455eef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2019 08:57:03 -0500 Subject: [PATCH 39/49] fixups --- pandas/core/arrays/numpy_.py | 5 +++-- pandas/core/arrays/text.py | 4 +++- 
pandas/tests/arrays/text/test_text.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 2ef4f774d2985..bf7404e8997c6 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -230,11 +230,12 @@ def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) - if not scalar_key: + if not scalar_key and scalar_value: key = np.asarray(key) - if not lib.is_scalar(value): + if not scalar_value: value = np.asarray(value, dtype=self._ndarray.dtype) self._ndarray[key] = value diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py index b064b5971fdb3..7f067e53e1883 100644 --- a/pandas/core/arrays/text.py +++ b/pandas/core/arrays/text.py @@ -154,7 +154,9 @@ def __init__(self, values, copy=False): def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("TextArray requires an object-dtype ndarray of strings.") + raise ValueError( + "TextArray requires a sequence of strings or missing values." + ) if self._ndarray.dtype != "object": raise ValueError( "TextArray requires a sequence of strings. 
Got " diff --git a/pandas/tests/arrays/text/test_text.py b/pandas/tests/arrays/text/test_text.py index 8ef1533a73e19..4302f51043638 100644 --- a/pandas/tests/arrays/text/test_text.py +++ b/pandas/tests/arrays/text/test_text.py @@ -123,10 +123,10 @@ def test_add_frame(): def test_constructor_raises(): - with pytest.raises(ValueError, match="object-dtype ndarray"): + with pytest.raises(ValueError, match="sequence of strings"): pd.arrays.TextArray(np.array(["a", "b"], dtype="S1")) - with pytest.raises(ValueError, match="object-dtype ndarray"): + with pytest.raises(ValueError, match="sequence of strings"): pd.arrays.TextArray(np.array([])) From 43b51cdafb46f827834de72b09bade6adcf7baef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2019 13:44:38 -0500 Subject: [PATCH 40/49] length check --- pandas/core/arrays/text.py | 19 ++++++++++++------- pandas/tests/arrays/text/test_text.py | 11 +++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py index 7f067e53e1883..c0e9a2128e3f5 100644 --- a/pandas/core/arrays/text.py +++ b/pandas/core/arrays/text.py @@ -234,7 +234,7 @@ def value_counts(self, dropna=False): return value_counts(self._ndarray, dropna=dropna) - # Overrride parent, because we have different return types. + # Overrride parent because we have different return types. 
@classmethod def _create_arithmetic_method(cls, op): def method(self, other): @@ -248,6 +248,14 @@ def method(self, other): valid = ~mask if not lib.is_scalar(other): + if len(other) != len(self): + # prevent improper broadcasting when other is 2D + raise ValueError( + "Lengths of operands do not match: {} != {}".format( + len(self), len(other) + ) + ) + other = np.asarray(other) other = other[valid] @@ -256,13 +264,10 @@ def method(self, other): result[valid] = op(self._ndarray[valid], other) if op.__name__ in {"add", "radd", "mul", "rmul"}: - new = TextArray - elif mask.any(): - new = lambda x: np.asarray(x, dtype="object") + return TextArray(result) else: - new = lambda x: np.asarray(x, dtype="bool") - - return new(result) + dtype = "object" if mask.any() else "bool" + return np.asarray(result, dtype=dtype) return compat.set_function_name(method, "__{}__".format(op.__name__), cls) diff --git a/pandas/tests/arrays/text/test_text.py b/pandas/tests/arrays/text/test_text.py index 4302f51043638..cd18c58e249bc 100644 --- a/pandas/tests/arrays/text/test_text.py +++ b/pandas/tests/arrays/text/test_text.py @@ -68,6 +68,17 @@ def test_add(): tm.assert_series_equal(result, expected) +def test_add_2d(): + a = pd.array(["a", "b", "c"], dtype="text") + b = np.array([["a", "b", "c"]], dtype=object) + with pytest.raises(ValueError, match="3 != 1"): + a + b + + s = pd.Series(a) + with pytest.raises(ValueError, match="3 != 1"): + s + b + + def test_add_sequence(): a = pd.array(["a", "b", None, None], dtype="text") other = ["x", None, "y", None] From 4fd2d11f4eca70828045ce77a671dcf6b325b0fe Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Sep 2019 13:46:19 -0500 Subject: [PATCH 41/49] unimplement sum --- pandas/core/arrays/text.py | 11 +---------- pandas/tests/arrays/text/test_text.py | 2 ++ pandas/tests/extension/test_text.py | 9 +-------- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py index 
c0e9a2128e3f5..00458fc4c31e5 100644 --- a/pandas/core/arrays/text.py +++ b/pandas/core/arrays/text.py @@ -218,16 +218,7 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy) def _reduce(self, name, skipna=True, **kwargs): - if name == "sum": - vals = self._ndarray - missing = self.isna() - if skipna: - vals = vals[~missing] - elif missing.any(): - return np.nan - return vals.sum() - - raise TypeError("Cannot perform reduction '{}' with string dtype".format(name)) + raise TypeError("Cannot perform reduction '{}' with text dtype".format(name)) def value_counts(self, dropna=False): from pandas import value_counts diff --git a/pandas/tests/arrays/text/test_text.py b/pandas/tests/arrays/text/test_text.py index cd18c58e249bc..50208d655a547 100644 --- a/pandas/tests/arrays/text/test_text.py +++ b/pandas/tests/arrays/text/test_text.py @@ -142,6 +142,7 @@ def test_constructor_raises(): @pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.xfail(reason="Not implemented TextArray.sum") def test_reduce(skipna): arr = pd.Series(["a", "b", "c"], dtype="text") result = arr.sum(skipna=skipna) @@ -149,6 +150,7 @@ def test_reduce(skipna): @pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.xfail(reason="Not implemented TextArray.sum") def test_reduce_missing(skipna): arr = pd.Series([None, "a", None, "b", "c", None], dtype="text") result = arr.sum(skipna=skipna) diff --git a/pandas/tests/extension/test_text.py b/pandas/tests/extension/test_text.py index f0363d7ede470..dc0d0cac06489 100644 --- a/pandas/tests/extension/test_text.py +++ b/pandas/tests/extension/test_text.py @@ -77,14 +77,7 @@ class TestMissing(base.BaseMissingTests): class TestNoReduce(base.BaseNoReduceTests): - @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): - if all_numeric_reductions == "sum": - pytest.skip("TextArray implements sum") - else: - return super().test_reduce_series_numeric( - data, 
all_numeric_reductions, skipna - ) + pass class TestMethods(base.BaseMethodsTests): From 8714a53510229a93443e83053e8262b633475519 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Sep 2019 15:25:40 -0500 Subject: [PATCH 42/49] fixup --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 055a1d3269520..1fe44e75c49a3 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -50,7 +50,7 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_1000.string: +.. _whatsnew_100.text: Dedicated text data type ^^^^^^^^^^^^^^^^^^^^^^^^ From dc9ef3cffb5caca17eaa23245db67581c72b6c6f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2019 07:11:39 -0500 Subject: [PATCH 43/49] rename --- doc/source/getting_started/basics.rst | 10 ++-- doc/source/reference/arrays.rst | 10 ++-- doc/source/user_guide/text.rst | 16 +++---- doc/source/whatsnew/v1.0.0.rst | 18 ++++---- pandas/__init__.py | 2 +- pandas/arrays/__init__.py | 4 +- pandas/core/api.py | 2 +- pandas/core/arrays/__init__.py | 2 +- pandas/core/arrays/text.py | 66 +++++++++++++-------------- pandas/core/strings.py | 18 ++++---- pandas/tests/api/test_api.py | 2 +- pandas/tests/arrays/text/test_text.py | 56 +++++++++++------------ pandas/tests/extension/test_text.py | 14 +++--- pandas/tests/test_strings.py | 11 ++--- 14 files changed, 114 insertions(+), 117 deletions(-) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index cc1cbeec9b788..2818011eb02ca 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -986,7 +986,7 @@ not noted for a particular column will be ``NaN``: tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'}) -.. _basics.aggregation.mixed_dtypes: +.. _basics.aggregation.mixed_string: Mixed dtypes ++++++++++++ @@ -1716,7 +1716,7 @@ always uses them). .. 
note:: Prior to pandas 1.0, string methods were only available on ``object`` -dtype - ``Series``. Pandas 1.0 added the :class:`TextDtype` which is dedicated + ``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated to strings. See :ref:`text.types` for more. Please see :ref:`Vectorized String Methods ` for a complete @@ -1932,15 +1932,15 @@ period (time spans) :class:`PeriodDtype` :class:`Period` :class:`arrays. sparse :class:`SparseDtype` (none) :class:`arrays.SparseArray` :ref:`sparse` intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` nullable integer :class:`Int64Dtype`, ... (none) :class:`arrays.IntegerArray` :ref:`integer_na` -Text :class:`TextDtype` :class:`str` :class:`arrays.TextArray` :ref:`text` +Strings :class:`StringDtype` :class:`str` :class:`arrays.StringArray` :ref:`text` =================== ========================= ================== ============================= ============================= Pandas has two ways to store strings. 1. ``object`` dtype, which can hold any Python object, including strings. -2. :class:`TextDtype`, which is dedicated to strings. +2. :class:`StringDtype`, which is dedicated to strings. -Generally, we recommend using :class:`TextDtype`. See :ref:`text.types` fore more. +Generally, we recommend using :class:`StringDtype`. See :ref:`text.types` fore more. Finally, arbitrary objects may be stored using the ``object`` dtype, but should be avoided to the extent possible (for performance and interoperability with diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 81d6cb2e54c2c..0c435e06ac57f 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -24,7 +24,7 @@ Intervals :class:`IntervalDtype` :class:`Interval` :ref:`api.array Nullable Integer :class:`Int64Dtype`, ... 
(none) :ref:`api.arrays.integer_na` Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical` Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse` -Text :class:`TextDtype` :class:`str` :ref:`api.arrays.string` +Strings :class:`StringDtype` :class:`str` :ref:`api.arrays.string` =================== ========================= ================== ============================= Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). @@ -467,21 +467,21 @@ Text data --------- When working with text data, where each valid element is a string or missing, -we recommend using :class:`TextDtype` (with the alias ``"text"``). +we recommend using :class:`StringDtype` (with the alias ``"string"``). .. autosummary:: :toctree: api/ :template: autosummary/class_without_autosummary.rst - arrays.TextArray + arrays.StringArray .. autosummary:: :toctree: api/ :template: autosummary/class_without_autosummary.rst - TextDtype + StringDtype -The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.TextArray`. +The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.StringArray`. See :ref:`api.series.str` for more. diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index ca00be77761a7..c8c8a6d57eb7f 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -16,9 +16,9 @@ Text Data Types There are two main ways to store text data 1. ``object`` -dtype NumPy array. -2. :class:`TextDtype` extension type. +2. :class:`StringDtype` extension type. -We recommend using :class:`TextDtype` to store text data. +We recommend using :class:`StringDtype` to store text data. Prior to pandas 1.0, ``object`` dtype was the only option. This was unfortunate for many reasons: @@ -32,13 +32,13 @@ for many reasons: than ``text``. Currently, the performance of ``object`` dtype arrays of strings and -:class:`arrays.TextArray` are about the same. 
We expect future enhancements +:class:`arrays.StringArray` are about the same. We expect future enhancements to significantly increase the performance and lower the memory overhead of -:class:`~arrays.TextArray`. +:class:`~arrays.StringArray`. .. warning:: - ``TextArray`` is currently considered experimental. The implementation + ``StringArray`` is currently considered experimental. The implementation and parts of the API may change without warning. For backwards-compatibility, ``object`` dtype remains the default type we @@ -53,7 +53,7 @@ To explicitly request ``text`` dtype, specify the ``dtype`` .. ipython:: python pd.Series(['a', 'b', 'c'], dtype="text") - pd.Series(['a', 'b', 'c'], dtype=pd.TextDtype()) + pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype()) Or ``astype`` after the ``Series`` or ``DataFrame`` is created @@ -170,8 +170,8 @@ It is easy to expand this to return a DataFrame using ``expand``. s2.str.split('_', expand=True) -When original ``Series`` has :class:`TextDtype`, the output columns will all -be :class:`TextDtype` as well. +When original ``Series`` has :class:`StringDtype`, the output columns will all +be :class:`StringDtype` as well. It is also possible to limit the number of splits: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 8beaaf27d8cd2..22f9617d6dcc0 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -50,39 +50,39 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_100.text: +.. _whatsnew_100.string: Dedicated text data type ^^^^^^^^^^^^^^^^^^^^^^^^ -We've added :class:`TextDtype`, an extension type dedicated to string data. +We've added :class:`StringDtype`, an extension type dedicated to string data. Previously, strings were typically stored in object-dtype NumPy arrays. .. warning:: - ``TextDtype`` and is currently considered experimental. The implementation + ``StringDtype`` and is currently considered experimental. 
The implementation and parts of the API may change without warning. The text extension type solves several issues with object-dtype NumPy arrays: 1. You can accidentally store a *mixture* of strings and non-strings in an - ``object`` dtype array. A ``TextArray`` can only store strings. + ``object`` dtype array. A ``StringArray`` can only store strings. 2. ``object`` dtype breaks dtype-specific operations like :meth:`DataFrame.select_dtypes`. There isn't a clear way to select *just* text while excluding non-text, but still object-dtype columns. 3. When reading code, the contents of an ``object`` dtype array is less clear - than ``text``. + than ``string``. .. ipython:: python - pd.Series(['abc', None, 'def'], dtype=pd.TextDtype()) + pd.Series(['abc', None, 'def'], dtype=pd.StringDtype()) -You can use the alias ``"text"`` as well. +You can use the alias ``"string"`` as well. .. ipython:: python - s = pd.Series(['abc', None, 'def'], dtype="text") + s = pd.Series(['abc', None, 'def'], dtype="string") s The usual string accessor methods work. Where appropriate, the return type @@ -91,7 +91,7 @@ of the Series or columns of a DataFrame will also have string dtype. s.str.upper() s.str.split('b', expand=True).dtypes -We recommend explicitly using the ``text`` data type when working with strings. +We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more. .. 
_whatsnew_1000.enhancements.other: diff --git a/pandas/__init__.py b/pandas/__init__.py index da4d5106f7aa7..5d163e411c0ac 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -66,7 +66,7 @@ PeriodDtype, IntervalDtype, DatetimeTZDtype, - TextDtype, + StringDtype, # missing isna, isnull, diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 24cb6a9c5acf7..9870b5bed076d 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -11,7 +11,7 @@ PandasArray, PeriodArray, SparseArray, - TextArray, + StringArray, TimedeltaArray, ) @@ -23,6 +23,6 @@ "PandasArray", "PeriodArray", "SparseArray", - "TextArray", + "StringArray", "TimedeltaArray", ] diff --git a/pandas/core/api.py b/pandas/core/api.py index 83ecf675d0389..9d4be9c075122 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -23,7 +23,7 @@ UInt32Dtype, UInt64Dtype, ) -from pandas.core.arrays.text import TextDtype +from pandas.core.arrays.text import StringDtype from pandas.core.construction import array from pandas.core.groupby import Grouper, NamedAgg from pandas.core.index import ( diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 0df154dc07322..384bbc2318c82 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -10,5 +10,5 @@ from .numpy_ import PandasArray, PandasDtype # noqa: F401 from .period import PeriodArray, period_array # noqa: F401 from .sparse import SparseArray # noqa: F401 -from .text import TextArray # noqa: F401 +from .text import StringArray # noqa: F401 from .timedeltas import TimedeltaArray # noqa: F401 diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py index 00458fc4c31e5..41c7d81bd55da 100644 --- a/pandas/core/arrays/text.py +++ b/pandas/core/arrays/text.py @@ -22,18 +22,18 @@ @register_extension_dtype -class TextDtype(ExtensionDtype): +class StringDtype(ExtensionDtype): """ - Extension dtype for text data. + Extension dtype for string data. .. 
versionadded:: 1.0.0 .. warning:: - TextDtype is considered experimental. The implementation and + StringDtype is considered experimental. The implementation and parts of the API may change without warning. - In particular, TextDtype.na_value may change to no longer be + In particular, StringDtype.na_value may change to no longer be ``numpy.nan``. Attributes @@ -46,14 +46,14 @@ class TextDtype(ExtensionDtype): Examples -------- - >>> pd.TextDtype() - TextDtype + >>> pd.StringDtype() + StringDtype """ @property def na_value(self) -> "Scalar": """ - TextDtype uses :attr:`numpy.nan` as the missing NA value. + StringDtype uses :attr:`numpy.nan` as the missing NA value. .. warning:: @@ -68,33 +68,33 @@ def type(self) -> Type: @property def name(self) -> str: """ - The alias for TextDtype is ``'text'``. + The alias for StringDtype is ``'string'``. """ - return "text" + return "string" @classmethod def construct_from_string(cls, string: str) -> ExtensionDtype: - if string == "text": + if string == "string": return cls() return super().construct_from_string(string) @classmethod - def construct_array_type(cls) -> "Type[TextArray]": - return TextArray + def construct_array_type(cls) -> "Type[StringArray]": + return StringArray def __repr__(self) -> str: - return "TextDtype" + return "StringDtype" -class TextArray(PandasArray): +class StringArray(PandasArray): """ - Extension array for text data. + Extension array for string data. .. versionadded:: 1.0.0 .. warning:: - TextArray is considered experimental. The implementation and + StringArray is considered experimental. The implementation and parts of the API may change without warning. In particular, the NA value used may change to no longer be @@ -125,22 +125,22 @@ class TextArray(PandasArray): -------- Series.str The string methods are available on Series backed by - a TextArray. + a StringArray. 
Examples -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="text") - + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") + ['This is', 'some text', nan, 'data.'] - Length: 4, dtype: text + Length: 4, dtype: string - Unlike ``object`` dtype arrays, ``TextArray`` doesn't allow non-string + Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string values. - >>> pd.array(['1', 1], dtype="text") + >>> pd.array(['1', 1], dtype="string") Traceback (most recent call last): ... - ValueError: TextArray requires an object-dtype ndarray of strings. + ValueError: StringArray requires an object-dtype ndarray of strings. """ # undo the PandasArray hack @@ -148,25 +148,25 @@ class TextArray(PandasArray): def __init__(self, values, copy=False): super().__init__(values, copy=copy) - self._dtype = TextDtype() + self._dtype = StringDtype() self._validate() def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError( - "TextArray requires a sequence of strings or missing values." + "StringArray requires a sequence of strings or missing values." ) if self._ndarray.dtype != "object": raise ValueError( - "TextArray requires a sequence of strings. Got " + "StringArray requires a sequence of strings. 
Got " "'{}' dtype instead.".format(self._ndarray.dtype) ) @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: - assert dtype == "text" + assert dtype == "string" result = super()._from_sequence(scalars, dtype=object, copy=copy) # convert None to np.nan # TODO: it would be nice to do this in _validate / lib.is_string_array @@ -195,7 +195,7 @@ def __setitem__(self, key, value): value = np.nan elif not (isinstance(value, str) or np.isnan(value)): raise ValueError( - "Cannot set non-string value '{}' into a TextArray.".format(value) + "Cannot set non-string value '{}' into a StringArray.".format(value) ) else: if not is_array_like(value): @@ -211,14 +211,14 @@ def fillna(self, value=None, method=None, limit=None): def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) - if isinstance(dtype, TextDtype): + if isinstance(dtype, StringDtype): if copy: return self.copy() return self return super().astype(dtype, copy) def _reduce(self, name, skipna=True, **kwargs): - raise TypeError("Cannot perform reduction '{}' with text dtype".format(name)) + raise TypeError("Cannot perform reduction '{}' with string dtype".format(name)) def value_counts(self, dropna=False): from pandas import value_counts @@ -255,7 +255,7 @@ def method(self, other): result[valid] = op(self._ndarray[valid], other) if op.__name__ in {"add", "radd", "mul", "rmul"}: - return TextArray(result) + return StringArray(result) else: dtype = "object" if mask.any() else "bool" return np.asarray(result, dtype=dtype) @@ -273,5 +273,5 @@ def _add_arithmetic_ops(cls): _create_comparison_method = _create_arithmetic_method -TextArray._add_arithmetic_ops() -TextArray._add_comparison_ops() +StringArray._add_arithmetic_ops() +StringArray._add_comparison_ops() diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2ee4438024c32..2fb09182cc6cf 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -817,8 +817,8 @@ def _str_extract_frame(arr, pat, flags=0): 
result_index = arr.index except AttributeError: result_index = None - if arr.dtype.name == "text": - dtype = "text" + if arr.dtype.name == "string": + dtype = "string" else: dtype = object return DataFrame( @@ -1027,7 +1027,7 @@ def str_extractall(arr, pat, flags=0): # workaround #27953 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. - if arr.dtype.name == "text": + if arr.dtype.name == "string": dtype = arr.dtype else: dtype = None @@ -1953,7 +1953,7 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data) - self._is_string = data.dtype.name == "text" + self._is_string = data.dtype.name == "string" # .values.categories works for both Series/Index self._parent = data.values.categories if self._is_categorical else data @@ -1984,7 +1984,7 @@ def _validate(data): ------- dtype : inferred dtype of data """ - from pandas import TextDtype + from pandas import StringDtype if isinstance(data, ABCMultiIndex): raise AttributeError( @@ -1997,8 +1997,8 @@ def _validate(data): values = getattr(data, "values", data) # Series / Index values = getattr(values, "categories", values) # categorical / normal - # explicitly allow TextDtype - if isinstance(values.dtype, TextDtype): + # explicitly allow StringDtype + if isinstance(values.dtype, StringDtype): return "string" try: @@ -2055,9 +2055,9 @@ def _wrap_result( # We can be wrapping a string / object / categorical result, in which # case we'll want to return the same dtype as the input. # Or we can be wrapping a numeric output, in which case we don't want - # to return a TextArray. + # to return a StringArray. 
if self._is_string and returns_string: - dtype = "text" + dtype = "string" else: dtype = None diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 056a23ab81acf..6c50159663574 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -68,7 +68,7 @@ class TestPDApi(Base): "Series", "SparseArray", "SparseDtype", - "TextDtype", + "StringDtype", "Timedelta", "TimedeltaIndex", "Timestamp", diff --git a/pandas/tests/arrays/text/test_text.py b/pandas/tests/arrays/text/test_text.py index 50208d655a547..40221c34116ae 100644 --- a/pandas/tests/arrays/text/test_text.py +++ b/pandas/tests/arrays/text/test_text.py @@ -8,13 +8,13 @@ def test_none_to_nan(): - a = pd.arrays.TextArray._from_sequence(["a", None, "b"]) + a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) assert a[1] is not None assert np.isnan(a[1]) def test_setitem_validates(): - a = pd.arrays.TextArray._from_sequence(["a", "b"]) + a = pd.arrays.StringArray._from_sequence(["a", "b"]) with pytest.raises(ValueError, match="10"): a[0] = 10 @@ -31,12 +31,12 @@ def test_setitem_validates(): ], ) def test_string_methods(input, method): - a = pd.Series(input, dtype="text") + a = pd.Series(input, dtype="string") b = pd.Series(input, dtype="object") result = method(a.str) expected = method(b.str) - assert result.dtype.name == "text" + assert result.dtype.name == "string" tm.assert_series_equal(result.astype(object), expected) @@ -44,32 +44,32 @@ def test_astype_roundtrip(): s = pd.Series(pd.date_range("2000", periods=12)) s[0] = None - result = s.astype("text").astype("datetime64[ns]") + result = s.astype("string").astype("datetime64[ns]") tm.assert_series_equal(result, s) def test_add(): - a = pd.Series(["a", "b", "c", None, None], dtype="text") - b = pd.Series(["x", "y", None, "z", None], dtype="text") + a = pd.Series(["a", "b", "c", None, None], dtype="string") + b = pd.Series(["x", "y", None, "z", None], dtype="string") result = a + b - expected = pd.Series(["ax", 
"by", None, None, None], dtype="text") + expected = pd.Series(["ax", "by", None, None, None], dtype="string") tm.assert_series_equal(result, expected) result = a.add(b) tm.assert_series_equal(result, expected) result = a.radd(b) - expected = pd.Series(["xa", "yb", None, None, None], dtype="text") + expected = pd.Series(["xa", "yb", None, None, None], dtype="string") tm.assert_series_equal(result, expected) result = a.add(b, fill_value="-") - expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="text") + expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string") tm.assert_series_equal(result, expected) def test_add_2d(): - a = pd.array(["a", "b", "c"], dtype="text") + a = pd.array(["a", "b", "c"], dtype="string") b = np.array([["a", "b", "c"]], dtype=object) with pytest.raises(ValueError, match="3 != 1"): a + b @@ -80,22 +80,22 @@ def test_add_2d(): def test_add_sequence(): - a = pd.array(["a", "b", None, None], dtype="text") + a = pd.array(["a", "b", None, None], dtype="string") other = ["x", None, "y", None] result = a + other - expected = pd.array(["ax", None, None, None], dtype="text") + expected = pd.array(["ax", None, None, None], dtype="string") tm.assert_extension_array_equal(result, expected) result = other + a - expected = pd.array(["xa", None, None, None], dtype="text") + expected = pd.array(["xa", None, None, None], dtype="string") tm.assert_extension_array_equal(result, expected) def test_mul(): - a = pd.array(["a", "b", None], dtype="text") + a = pd.array(["a", "b", None], dtype="string") result = a * 2 - expected = pd.array(["aa", "bb", None], dtype="text") + expected = pd.array(["aa", "bb", None], dtype="string") tm.assert_extension_array_equal(result, expected) result = 2 * a @@ -104,55 +104,55 @@ def test_mul(): @pytest.mark.xfail(reason="GH-28527") def test_add_strings(): - array = pd.array(["a", "b", "c", "d"], dtype="text") + array = pd.array(["a", "b", "c", "d"], dtype="string") df = pd.DataFrame([["t", "u", "v", "w"]]) assert 
array.__add__(df) is NotImplemented result = array + df - expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("text") + expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("string") tm.assert_frame_equal(result, expected) result = df + array - expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("text") + expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("string") tm.assert_frame_equal(result, expected) @pytest.mark.xfail(reason="GH-28527") def test_add_frame(): - array = pd.array(["a", "b", np.nan, np.nan], dtype="text") + array = pd.array(["a", "b", np.nan, np.nan], dtype="string") df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert array.__add__(df) is NotImplemented result = array + df - expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("text") + expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("string") tm.assert_frame_equal(result, expected) result = df + array - expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("text") + expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("string") tm.assert_frame_equal(result, expected) def test_constructor_raises(): with pytest.raises(ValueError, match="sequence of strings"): - pd.arrays.TextArray(np.array(["a", "b"], dtype="S1")) + pd.arrays.StringArray(np.array(["a", "b"], dtype="S1")) with pytest.raises(ValueError, match="sequence of strings"): - pd.arrays.TextArray(np.array([])) + pd.arrays.StringArray(np.array([])) @pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented TextArray.sum") +@pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna): - arr = pd.Series(["a", "b", "c"], dtype="text") + arr = pd.Series(["a", "b", "c"], dtype="string") result = arr.sum(skipna=skipna) assert result == "abc" @pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.xfail(reason="Not implemented TextArray.sum") +@pytest.mark.xfail(reason="Not implemented StringArray.sum") 
def test_reduce_missing(skipna): - arr = pd.Series([None, "a", None, "b", "c", None], dtype="text") + arr = pd.Series([None, "a", None, "b", "c", None], dtype="string") result = arr.sum(skipna=skipna) if skipna: assert result == "abc" diff --git a/pandas/tests/extension/test_text.py b/pandas/tests/extension/test_text.py index dc0d0cac06489..908a0dbb67718 100644 --- a/pandas/tests/extension/test_text.py +++ b/pandas/tests/extension/test_text.py @@ -4,13 +4,13 @@ import pytest import pandas as pd -from pandas.core.arrays.text import TextArray, TextDtype +from pandas.core.arrays.text import StringArray, StringDtype from pandas.tests.extension import base @pytest.fixture def dtype(): - return TextDtype() + return StringDtype() @pytest.fixture @@ -19,23 +19,23 @@ def data(): while strings[0] == strings[1]: strings = np.random.choice(list(string.ascii_letters), size=100) - return TextArray._from_sequence(strings) + return StringArray._from_sequence(strings) @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return TextArray._from_sequence([np.nan, "A"]) + return StringArray._from_sequence([np.nan, "A"]) @pytest.fixture def data_for_sorting(): - return TextArray._from_sequence(["B", "C", "A"]) + return StringArray._from_sequence(["B", "C", "A"]) @pytest.fixture def data_missing_for_sorting(): - return TextArray._from_sequence(["B", np.nan, "A"]) + return StringArray._from_sequence(["B", np.nan, "A"]) @pytest.fixture @@ -45,7 +45,7 @@ def na_value(): @pytest.fixture def data_for_grouping(): - return TextArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"]) + return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"]) class TestDtype(base.BaseDtypeTests): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d0e18bd53b1b8..b50f1a0fd2f2a 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -141,7 +141,6 @@ def any_string_method(request): # subset of the full set 
from pandas/conftest.py _any_allowed_skipna_inferred_dtype = [ ("string", ["a", np.nan, "c"]), - ("text", ["a", np.nan, "c"]), ("bytes", [b"a", np.nan, b"c"]), ("empty", [np.nan, np.nan, np.nan]), ("empty", []), @@ -157,7 +156,6 @@ def any_allowed_skipna_inferred_dtype(request): The covered (inferred) types are: * 'string' - * 'text' * 'empty' * 'bytes' * 'mixed' @@ -223,7 +221,6 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): types_passing_constructor = [ "string", - "text", "unicode", "empty", "bytes", @@ -286,7 +283,7 @@ def test_api_per_method( mixed_allowed = method_name not in ["cat"] allowed_types = ( - ["string", "unicode", "empty", "text"] + ["string", "unicode", "empty"] + ["bytes"] * bytes_allowed + ["mixed", "mixed-integer"] * mixed_allowed ) @@ -3279,7 +3276,7 @@ def test_casefold(self): def test_string_array(any_string_method): data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) - b = Series(data, dtype="text") + b = Series(data, dtype="string") method_name, args, kwargs = any_string_method expected = getattr(a.str, method_name)(*args, **kwargs) @@ -3289,10 +3286,10 @@ def test_string_array(any_string_method): if expected.dtype == "object" and lib.is_string_array( expected.values, skipna=True ): - assert result.dtype == "text" + assert result.dtype == "string" result = result.astype(object) elif isinstance(expected, DataFrame): columns = expected.select_dtypes(include="object").columns - assert all(result[columns].dtypes == "text") + assert all(result[columns].dtypes == "string") result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) From 9419af241776ba98fe57e76d98b4c49ab447bcd0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2019 07:16:00 -0500 Subject: [PATCH 44/49] rename --- pandas/core/api.py | 2 +- pandas/core/arrays/__init__.py | 2 +- pandas/core/arrays/{text.py => string_.py} | 0 pandas/tests/arrays/{text => string_}/__init__.py | 0 pandas/tests/arrays/{text => 
string_}/test_text.py | 0 pandas/tests/extension/{test_text.py => test_string.py} | 2 +- 6 files changed, 3 insertions(+), 3 deletions(-) rename pandas/core/arrays/{text.py => string_.py} (100%) rename pandas/tests/arrays/{text => string_}/__init__.py (100%) rename pandas/tests/arrays/{text => string_}/test_text.py (100%) rename pandas/tests/extension/{test_text.py => test_string.py} (97%) diff --git a/pandas/core/api.py b/pandas/core/api.py index 9d4be9c075122..04f2f84c92a15 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -23,7 +23,7 @@ UInt32Dtype, UInt64Dtype, ) -from pandas.core.arrays.text import StringDtype +from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array from pandas.core.groupby import Grouper, NamedAgg from pandas.core.index import ( diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 384bbc2318c82..868118bac6a7b 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -10,5 +10,5 @@ from .numpy_ import PandasArray, PandasDtype # noqa: F401 from .period import PeriodArray, period_array # noqa: F401 from .sparse import SparseArray # noqa: F401 -from .text import StringArray # noqa: F401 +from .string_ import StringArray # noqa: F401 from .timedeltas import TimedeltaArray # noqa: F401 diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/string_.py similarity index 100% rename from pandas/core/arrays/text.py rename to pandas/core/arrays/string_.py diff --git a/pandas/tests/arrays/text/__init__.py b/pandas/tests/arrays/string_/__init__.py similarity index 100% rename from pandas/tests/arrays/text/__init__.py rename to pandas/tests/arrays/string_/__init__.py diff --git a/pandas/tests/arrays/text/test_text.py b/pandas/tests/arrays/string_/test_text.py similarity index 100% rename from pandas/tests/arrays/text/test_text.py rename to pandas/tests/arrays/string_/test_text.py diff --git a/pandas/tests/extension/test_text.py 
b/pandas/tests/extension/test_string.py similarity index 97% rename from pandas/tests/extension/test_text.py rename to pandas/tests/extension/test_string.py index 908a0dbb67718..5b872d5b72227 100644 --- a/pandas/tests/extension/test_text.py +++ b/pandas/tests/extension/test_string.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas.core.arrays.text import StringArray, StringDtype +from pandas.core.arrays.string_ import StringArray, StringDtype from pandas.tests.extension import base From 462b29d426d88db4bc87b7207ef080a8334250bf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2019 07:19:16 -0500 Subject: [PATCH 45/49] doc updates --- doc/source/user_guide/text.rst | 60 +++++++++++++++++----------------- doc/source/whatsnew/v1.0.0.rst | 4 +-- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index c8c8a6d57eb7f..789ff2a65355b 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -29,7 +29,7 @@ for many reasons: There isn't a clear way to select *just* text while excluding non-text but still object-dtype columns. 3. When reading code, the contents of an ``object`` dtype array is less clear - than ``text``. + than ``'string'``. Currently, the performance of ``object`` dtype arrays of strings and :class:`arrays.StringArray` are about the same. We expect future enhancements @@ -48,11 +48,11 @@ infer a list of strings to pd.Series(['a', 'b', 'c']) -To explicitly request ``text`` dtype, specify the ``dtype`` +To explicitly request ``string`` dtype, specify the ``dtype`` .. 
ipython:: python - pd.Series(['a', 'b', 'c'], dtype="text") + pd.Series(['a', 'b', 'c'], dtype="string") pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype()) Or ``astype`` after the ``Series`` or ``DataFrame`` is created @@ -61,10 +61,10 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created s = pd.Series(['a', 'b', 'c']) s - s.astype("text") + s.astype("string") Everything that follows in the rest of this document applies equally to -``text`` and ``object`` dtype. +``string`` and ``object`` dtype. .. _text.string_methods: @@ -80,7 +80,7 @@ the equivalent (scalar) built-in string methods: .. ipython:: python s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="text") + dtype="string") s.str.lower() s.str.upper() s.str.len() @@ -154,7 +154,7 @@ Methods like ``split`` return a Series of lists: .. ipython:: python - s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="text") + s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string") s2.str.split('_') Elements in the split lists can be accessed using ``get`` or ``[]`` notation: @@ -193,7 +193,7 @@ i.e., from the end of the string to the beginning of the string: s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', '', np.nan, 'CABA', 'dog', 'cat'], - dtype="text") + dtype="string") s3 s3.str.replace('^.a|dog', 'XX-XX ', case=False) @@ -204,7 +204,7 @@ following code will cause trouble because of the regular expression meaning of .. ipython:: python # Consider the following badly formatted financial data - dollars = pd.Series(['12', '-$10', '$10,000'], dtype="text") + dollars = pd.Series(['12', '-$10', '$10,000'], dtype="string") # This does what you'd naively expect: dollars.str.replace('$', '') @@ -243,7 +243,7 @@ positional argument (a regex object) and return a string. 
return m.group(0)[::-1] pd.Series(['foo 123', 'bar baz', np.nan], - dtype="text").str.replace(pat, repl) + dtype="string").str.replace(pat, repl) # Using regex groups pat = r"(?P\w+) (?P\w+) (?P\w+)" @@ -252,7 +252,7 @@ positional argument (a regex object) and return a string. return m.group('two').swapcase() pd.Series(['Foo Bar Baz', np.nan], - dtype="text").str.replace(pat, repl) + dtype="string").str.replace(pat, repl) .. versionadded:: 0.20.0 @@ -291,7 +291,7 @@ The content of a ``Series`` (or ``Index``) can be concatenated: .. ipython:: python - s = pd.Series(['a', 'b', 'c', 'd'], dtype="text") + s = pd.Series(['a', 'b', 'c', 'd'], dtype="string") s.str.cat(sep=',') If not specified, the keyword ``sep`` for the separator defaults to the empty string, ``sep=''``: @@ -304,7 +304,7 @@ By default, missing values are ignored. Using ``na_rep``, they can be given a re .. ipython:: python - t = pd.Series(['a', 'b', np.nan, 'd'], dtype="text") + t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string") t.str.cat(sep=',') t.str.cat(sep=',', na_rep='-') @@ -350,7 +350,7 @@ the ``join``-keyword. :okwarning: u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2], - dtype="text") + dtype="string") s u s.str.cat(u) @@ -367,7 +367,7 @@ In particular, alignment also means that the different lengths do not need to co .. ipython:: python v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4], - dtype="text") + dtype="string") s v s.str.cat(v, join='left', na_rep='-') @@ -424,7 +424,7 @@ of the string, the result will be a ``NaN``. s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="text") + dtype="string") s.str[0] s.str[1] @@ -456,7 +456,7 @@ DataFrame with one column per group. .. ipython:: python pd.Series(['a1', 'b2', 'c3'], - dtype="text").str.extract(r'([ab])(\d)', expand=False) + dtype="string").str.extract(r'([ab])(\d)', expand=False) Elements that do not match return a row filled with ``NaN``. 
Thus, a Series of messy strings can be "converted" into a like-indexed Series @@ -470,15 +470,15 @@ Named groups like .. ipython:: python pd.Series(['a1', 'b2', 'c3'], - dtype="text").str.extract(r'(?P[ab])(?P\d)', - expand=False) + dtype="string").str.extract(r'(?P[ab])(?P\d)', + expand=False) and optional groups like .. ipython:: python pd.Series(['a1', 'b2', '3'], - dtype="text").str.extract(r'([ab])?(\d)', expand=False) + dtype="string").str.extract(r'([ab])?(\d)', expand=False) can also be used. Note that any capture group names in the regular expression will be used for column names; otherwise capture group @@ -490,14 +490,14 @@ with one column if ``expand=True``. .. ipython:: python pd.Series(['a1', 'b2', 'c3'], - dtype="text").str.extract(r'[ab](\d)', expand=True) + dtype="string").str.extract(r'[ab](\d)', expand=True) It returns a Series if ``expand=False``. .. ipython:: python pd.Series(['a1', 'b2', 'c3'], - dtype="text").str.extract(r'[ab](\d)', expand=False) + dtype="string").str.extract(r'[ab](\d)', expand=False) Calling on an ``Index`` with a regex with exactly one capture group returns a ``DataFrame`` with one column if ``expand=True``. @@ -505,7 +505,7 @@ returns a ``DataFrame`` with one column if ``expand=True``. .. ipython:: python s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"], - dtype="text") + dtype="string") s s.index.str.extract("(?P[a-zA-Z])", expand=True) @@ -551,7 +551,7 @@ Unlike ``extract`` (which returns only the first match), .. ipython:: python s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], - dtype="text") + dtype="string") s two_groups = '(?P[a-z])(?P[0-9])' s.str.extract(two_groups, expand=True) @@ -569,7 +569,7 @@ When each subject string in the Series has exactly one match, .. 
ipython:: python - s = pd.Series(['a3', 'b3', 'c2'], dtype="text") + s = pd.Series(['a3', 'b3', 'c2'], dtype="string") s then ``extractall(pat).xs(0, level='match')`` gives the same result as @@ -590,7 +590,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0). pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups) - pd.Series(["a1a2", "b1", "c1"], dtype="text").str.extractall(two_groups) + pd.Series(["a1a2", "b1", "c1"], dtype="string").str.extractall(two_groups) Testing for Strings that match or contain a pattern @@ -602,14 +602,14 @@ You can check whether elements contain a pattern: pattern = r'[0-9][a-z]' pd.Series(['1', '2', '3a', '3b', '03c'], - dtype="text").str.contains(pattern) + dtype="string").str.contains(pattern) Or whether elements match a pattern: .. ipython:: python pd.Series(['1', '2', '3a', '3b', '03c'], - dtype="text").str.match(pattern) + dtype="string").str.match(pattern) The distinction between ``match`` and ``contains`` is strictness: ``match`` relies on strict ``re.match``, while ``contains`` relies on ``re.search``. @@ -620,7 +620,7 @@ an extra ``na`` argument so missing values can be considered True or False: .. ipython:: python s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="text") + dtype="string") s4.str.contains('A', na=False) .. _text.indicator: @@ -633,7 +633,7 @@ For example if they are separated by a ``'|'``: .. ipython:: python - s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="text") + s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="string") s.str.get_dummies(sep='|') String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``. diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 22f9617d6dcc0..73acc4207d35a 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -52,8 +52,8 @@ Enhancements .. 
_whatsnew_100.string: -Dedicated text data type -^^^^^^^^^^^^^^^^^^^^^^^^ +Dedicated string data type +^^^^^^^^^^^^^^^^^^^^^^^^^^ We've added :class:`StringDtype`, an extension type dedicated to string data. Previously, strings were typically stored in object-dtype NumPy arrays. From 0391563156a131959fcaea7b99d04bfd06c4d18e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2019 09:59:59 -0500 Subject: [PATCH 46/49] fixups --- ci/code_checks.sh | 4 ++-- doc/source/getting_started/basics.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3fc95efad1905..9f420857319ad 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -262,8 +262,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range" RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests arrays/text.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/arrays/text.py + MSG='Doctests arrays/string_.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/arrays/string_.py RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 2818011eb02ca..36a7166f350e5 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1705,7 +1705,7 @@ built-in string methods. For example: .. 
ipython:: python s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="text") + dtype="string") s.str.lower() Powerful pattern-matching methods are provided as well, but note that From 6aebd8c86d5f5a845dbaf03278b8c8bf93acf3f4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 3 Oct 2019 19:49:25 -0500 Subject: [PATCH 47/49] move and perf --- pandas/core/arrays/string_.py | 6 +++++- .../tests/arrays/string_/{test_text.py => test_string.py} | 0 2 files changed, 5 insertions(+), 1 deletion(-) rename pandas/tests/arrays/string_/{test_text.py => test_string.py} (100%) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 41c7d81bd55da..87649ac651127 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -147,9 +147,13 @@ class StringArray(PandasArray): _typ = "extension" def __init__(self, values, copy=False): + values = extract_array(values) + skip_validation = isinstance(values, type(self)) + super().__init__(values, copy=copy) self._dtype = StringDtype() - self._validate() + if not skip_validation: + self._validate() def _validate(self): """Validate that we only store NA or strings.""" diff --git a/pandas/tests/arrays/string_/test_text.py b/pandas/tests/arrays/string_/test_string.py similarity index 100% rename from pandas/tests/arrays/string_/test_text.py rename to pandas/tests/arrays/string_/test_string.py From 2ee5e300828a1abcee1ed333f9342d2a8889679e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 3 Oct 2019 19:59:40 -0500 Subject: [PATCH 48/49] test is_string_dtype --- pandas/tests/dtypes/test_common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 266f7ac50c663..466b724f98770 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -291,6 +291,8 @@ def test_is_string_dtype(): assert com.is_string_dtype(str) assert com.is_string_dtype(object) 
assert com.is_string_dtype(np.array(["a", "b"])) + assert com.is_string_dtype(pd.StringDtype()) + assert com.is_string_dtype(pd.array(["a", "b"], dtype="string")) def test_is_period_arraylike(): From 7e92cded0ca1e3747d02b417d27d4d7d039fe1b4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 4 Oct 2019 09:03:03 -0500 Subject: [PATCH 49/49] helper --- pandas/core/strings.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2fb09182cc6cf..888d2ae6f9473 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -763,6 +763,16 @@ def f(x): return f +def _result_dtype(arr): + # workaround #27953 + # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails + # when the list of values is empty. + if arr.dtype.name == "string": + return "string" + else: + return object + + def _str_extract_noexpand(arr, pat, flags=0): """ Find groups in each string in the Series using passed regular @@ -817,10 +827,7 @@ def _str_extract_frame(arr, pat, flags=0): result_index = arr.index except AttributeError: result_index = None - if arr.dtype.name == "string": - dtype = "string" - else: - dtype = object + dtype = _result_dtype(arr) return DataFrame( [groups_or_na(val) for val in arr], columns=columns, @@ -1023,14 +1030,7 @@ def str_extractall(arr, pat, flags=0): from pandas import MultiIndex index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) - - # workaround #27953 - # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails - # when the list of values is empty. - if arr.dtype.name == "string": - dtype = arr.dtype - else: - dtype = None + dtype = _result_dtype(arr) result = arr._constructor_expanddim( match_list, index=index, columns=columns, dtype=dtype