From c24b5b668bff8e73917c6238455a9f547362b20b Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 31 Jul 2019 15:52:00 -0500
Subject: [PATCH 01/49] API: Add string extension type

This adds a new extension type 'string' for storing string data.

The data model is essentially unchanged from master. Strings are still
stored in an object-dtype ndarray. Scalar elements are still Python
strs, and `np.nan` is still used as the missing value indicator.
---
 doc/source/getting_started/basics.rst      |   9 +-
 doc/source/reference/arrays.rst            |  26 ++-
 doc/source/user_guide/text.rst             | 121 +++++++++++---
 doc/source/whatsnew/v1.0.0.rst             |  27 +++
 pandas/__init__.py                         |   1 +
 pandas/arrays/__init__.py                  |   2 +
 pandas/core/api.py                         |   1 +
 pandas/core/arrays/__init__.py             |   1 +
 pandas/core/arrays/numpy_.py               |   6 +-
 pandas/core/arrays/string_.py              | 181 +++++++++++++++++++++
 pandas/core/dtypes/missing.py              |   1 +
 pandas/core/strings.py                     | 118 ++++++++++----
 pandas/tests/api/test_api.py               |   1 +
 pandas/tests/arrays/string_/test_string.py |  60 +++++++
 pandas/tests/extension/test_string.py      | 105 ++++++++++++
 pandas/tests/test_strings.py               |  25 +++
 pandas/util/testing.py                     |   3 +
 17 files changed, 629 insertions(+), 59 deletions(-)
 create mode 100644 pandas/core/arrays/string_.py
 create mode 100644 pandas/tests/arrays/string_/test_string.py
 create mode 100644 pandas/tests/extension/test_string.py

diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index 3f6f56376861f..bffd2c575e5ba 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -1704,7 +1704,8 @@ built-in string methods. For example:
 
  .. ipython:: python
 
-  s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
+  s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
+                dtype="string")
   s.str.lower()
 
 Powerful pattern-matching methods are provided as well, but note that
@@ -1712,6 +1713,12 @@ pattern-matching generally uses `regular expressions
 <https://docs.python.org/3/library/re.html>`__ by default (and in some cases
 always uses them).
 
+.. note::
+
+   Prior to pandas 1.0, string methods were only available on ``object`` -dtype
+   ``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated
+   to strings. See :ref:`text.types` for more.
+
 Please see :ref:`Vectorized String Methods <text.string_methods>` for a complete
 description.
 
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index 7f464bf952bfb..f1a155ca85cbf 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -24,6 +24,7 @@ Intervals           :class:`IntervalDtype`    :class:`Interval`  :ref:`api.array
 Nullable Integer    :class:`Int64Dtype`, ...  (none)             :ref:`api.arrays.integer_na`
 Categorical         :class:`CategoricalDtype` (none)             :ref:`api.arrays.categorical`
 Sparse              :class:`SparseDtype`      (none)             :ref:`api.arrays.sparse`
+Text                :class:`StringDtype`      :class:`str`       :ref:`api.arrays.string`
 =================== ========================= ================== =============================
 
 Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`).
@@ -460,6 +461,29 @@ and methods if the :class:`Series` contains sparse values. See
 :ref:`api.series.sparse` for more.
 
 
+.. _api.arrays.string:
+
+Text data
+---------
+
+When working with text data, where each valid element is a string, we recommend using
+:class:`StringDtype` (with the alias ``"string"``).
+
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/class_without_autosummary.rst
+
+   arrays.StringArray
+
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/class_without_autosummary.rst
+
+   StringDtype
+
+The ``Series.str`` accessor is available for ``Series`` backed by an :class:`arrays.StringArray`.
+See :ref:`api.series.str` for more.
+
 
 .. Dtype attributes which are manually listed in their docstrings: including
 .. it here to make sure a docstring page is built for them
@@ -471,4 +495,4 @@ and methods if the :class:`Series` contains sparse values. See
       DatetimeTZDtype.unit
       DatetimeTZDtype.tz
       PeriodDtype.freq
-      IntervalDtype.subtype
\ No newline at end of file
+      IntervalDtype.subtype
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index acb5810e5252a..762c16e9a84eb 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -6,8 +6,61 @@
 Working with text data
 ======================
 
+.. _text.types:
+
+Text Data Types
+---------------
+
+.. versionadded:: 1.0.0
+
+There are two main ways to store text data
+
+1. ``object`` -dtype NumPy array.
+2. As an :class:`arrays.StringArray` extension type.
+
+We recommend using :class:`arrays.StringArray` to store text data.
+
+Prior to pandas 1.0, ``object`` dtype was the only option. This was unfortunate
+for many reasons:
+
+1. You can accidentally store a *mixture* of strings and non-strings in an
+   ``object`` dtype array. It's better to have a dedicated dtype.
+2. ``object`` dtype breaks dtype-specific operations like ``select_dtypes``.
+   There isn't a clear way to select *just* text while excluding non-text
+   but still object-dtype columns.
+3. When reading code, the contents of an ``object`` dtype array are less clear
+   than ``string``.
+
+For backwards-compatibility, ``object`` dtype remains the default type that
+a list of strings is inferred as
+
+.. ipython:: python
+
+   pd.Series(['a', 'b', 'c'])
+
+To explicitly request ``string`` dtype, specify the ``dtype``
+
+.. ipython:: python
+
+   pd.Series(['a', 'b', 'c'], dtype="string")
+   pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype())
+
+Or ``astype`` after the ``Series`` or ``DataFrame`` is created
+
+.. ipython:: python
+
+   s = pd.Series(['a', 'b', 'c'])
+   s
+   s.astype("string")
+
+Everything that follows in the rest of this document applies equally to
+``string`` and ``object`` dtype.
+
 .. _text.string_methods:
 
+String Methods
+--------------
+
 Series and Index are equipped with a set of string processing methods
 that make it easy to operate on each element of the array. Perhaps most
 importantly, these methods exclude missing/NA values automatically. These are
@@ -16,7 +69,8 @@ the equivalent (scalar) built-in string methods:
 
 .. ipython:: python
 
-   s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
+   s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
+                 dtype="string")
    s.str.lower()
    s.str.upper()
    s.str.len()
@@ -90,7 +144,7 @@ Methods like ``split`` return a Series of lists:
 
 .. ipython:: python
 
-   s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
+   s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string")
    s2.str.split('_')
 
 Elements in the split lists can be accessed using ``get`` or ``[]`` notation:
@@ -106,6 +160,9 @@ It is easy to expand this to return a DataFrame using ``expand``.
 
    s2.str.split('_', expand=True)
 
+When the original ``Series`` has :class:`StringDtype`, the output columns will
+all be :class:`StringDtype` as well.
+
 It is also possible to limit the number of splits:
 
 .. ipython:: python
@@ -125,7 +182,8 @@ i.e., from the end of the string to the beginning of the string:
 .. ipython:: python
 
    s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca',
-                  '', np.nan, 'CABA', 'dog', 'cat'])
+                  '', np.nan, 'CABA', 'dog', 'cat'],
+                  dtype="string")
    s3
    s3.str.replace('^.a|dog', 'XX-XX ', case=False)
 
@@ -136,7 +194,7 @@ following code will cause trouble because of the regular expression meaning of
 .. ipython:: python
 
    # Consider the following badly formatted financial data
-   dollars = pd.Series(['12', '-$10', '$10,000'])
+   dollars = pd.Series(['12', '-$10', '$10,000'], dtype="string")
 
    # This does what you'd naively expect:
    dollars.str.replace('$', '')
@@ -174,7 +232,7 @@ positional argument (a regex object) and return a string.
    def repl(m):
        return m.group(0)[::-1]
 
-   pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(pat, repl)
+   pd.Series(['foo 123', 'bar baz', np.nan], dtype="string").str.replace(pat, repl)
 
    # Using regex groups
    pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
@@ -182,7 +240,7 @@ positional argument (a regex object) and return a string.
    def repl(m):
        return m.group('two').swapcase()
 
-   pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl)
+   pd.Series(['Foo Bar Baz', np.nan], dtype="string").str.replace(pat, repl)
 
 .. versionadded:: 0.20.0
 
@@ -221,7 +279,7 @@ The content of a ``Series`` (or ``Index``) can be concatenated:
 
 .. ipython:: python
 
-    s = pd.Series(['a', 'b', 'c', 'd'])
+    s = pd.Series(['a', 'b', 'c', 'd'], dtype="string")
     s.str.cat(sep=',')
 
 If not specified, the keyword ``sep`` for the separator defaults to the empty string, ``sep=''``:
@@ -234,7 +292,7 @@ By default, missing values are ignored. Using ``na_rep``, they can be given a re
 
 .. ipython:: python
 
-    t = pd.Series(['a', 'b', np.nan, 'd'])
+    t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string")
     t.str.cat(sep=',')
     t.str.cat(sep=',', na_rep='-')
 
@@ -279,7 +337,8 @@ the ``join``-keyword.
 .. ipython:: python
    :okwarning:
 
-   u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2])
+   u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2],
+                 dtype="string")
    s
    u
    s.str.cat(u)
@@ -295,7 +354,8 @@ In particular, alignment also means that the different lengths do not need to co
 
 .. ipython:: python
 
-    v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4])
+    v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4],
+                  dtype="string")
     s
     v
     s.str.cat(v, join='left', na_rep='-')
@@ -351,7 +411,8 @@ of the string, the result will be a ``NaN``.
 .. ipython:: python
 
    s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan,
-                  'CABA', 'dog', 'cat'])
+                  'CABA', 'dog', 'cat'],
+                 dtype="string")
 
    s.str[0]
    s.str[1]
@@ -382,7 +443,8 @@ DataFrame with one column per group.
 
 .. ipython:: python
 
-   pd.Series(['a1', 'b2', 'c3']).str.extract(r'([ab])(\d)', expand=False)
+   pd.Series(['a1', 'b2', 'c3'],
+             dtype="string").str.extract(r'([ab])(\d)', expand=False)
 
 Elements that do not match return a row filled with ``NaN``. Thus, a
 Series of messy strings can be "converted" into a like-indexed Series
@@ -395,14 +457,16 @@ Named groups like
 
 .. ipython:: python
 
-   pd.Series(['a1', 'b2', 'c3']).str.extract(r'(?P<letter>[ab])(?P<digit>\d)',
-                                             expand=False)
+   pd.Series(['a1', 'b2', 'c3'],
+             dtype="string").str.extract(r'(?P<letter>[ab])(?P<digit>\d)',
+                                         expand=False)
 
 and optional groups like
 
 .. ipython:: python
 
-   pd.Series(['a1', 'b2', '3']).str.extract(r'([ab])?(\d)', expand=False)
+   pd.Series(['a1', 'b2', '3'],
+             dtype="string").str.extract(r'([ab])?(\d)', expand=False)
 
 can also be used. Note that any capture group names in the regular
 expression will be used for column names; otherwise capture group
@@ -413,20 +477,23 @@ with one column if ``expand=True``.
 
 .. ipython:: python
 
-   pd.Series(['a1', 'b2', 'c3']).str.extract(r'[ab](\d)', expand=True)
+   pd.Series(['a1', 'b2', 'c3'],
+             dtype="string").str.extract(r'[ab](\d)', expand=True)
 
 It returns a Series if ``expand=False``.
 
 .. ipython:: python
 
-   pd.Series(['a1', 'b2', 'c3']).str.extract(r'[ab](\d)', expand=False)
+   pd.Series(['a1', 'b2', 'c3'],
+             dtype="string").str.extract(r'[ab](\d)', expand=False)
 
 Calling on an ``Index`` with a regex with exactly one capture group
 returns a ``DataFrame`` with one column if ``expand=True``.
 
 .. ipython:: python
 
-   s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"])
+   s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"],
+                 dtype="string")
    s
    s.index.str.extract("(?P<letter>[a-zA-Z])", expand=True)
 
@@ -471,7 +538,8 @@ Unlike ``extract`` (which returns only the first match),
 
 .. ipython:: python
 
-   s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
+   s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"],
+                 dtype="string")
    s
    two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])'
    s.str.extract(two_groups, expand=True)
@@ -489,7 +557,7 @@ When each subject string in the Series has exactly one match,
 
 .. ipython:: python
 
-   s = pd.Series(['a3', 'b3', 'c2'])
+   s = pd.Series(['a3', 'b3', 'c2'], dtype="string")
    s
 
 then ``extractall(pat).xs(0, level='match')`` gives the same result as
@@ -510,7 +578,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0).
 
    pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups)
 
-   pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups)
+   pd.Series(["a1a2", "b1", "c1"], dtype="string").str.extractall(two_groups)
 
 
 Testing for Strings that match or contain a pattern
@@ -521,13 +589,15 @@ You can check whether elements contain a pattern:
 .. ipython:: python
 
    pattern = r'[0-9][a-z]'
-   pd.Series(['1', '2', '3a', '3b', '03c']).str.contains(pattern)
+   pd.Series(['1', '2', '3a', '3b', '03c'],
+             dtype="string").str.contains(pattern)
 
 Or whether elements match a pattern:
 
 .. ipython:: python
 
-   pd.Series(['1', '2', '3a', '3b', '03c']).str.match(pattern)
+   pd.Series(['1', '2', '3a', '3b', '03c'],
+             dtype="string").str.match(pattern)
 
 The distinction between ``match`` and ``contains`` is strictness: ``match``
 relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
@@ -537,7 +607,8 @@ an extra ``na`` argument so missing values can be considered True or False:
 
 .. ipython:: python
 
-   s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
+   s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
+                  dtype="string")
    s4.str.contains('A', na=False)
 
 .. _text.indicator:
@@ -550,7 +621,7 @@ For example if they are separated by a ``'|'``:
 
 .. ipython:: python
 
-    s = pd.Series(['a', 'a|b', np.nan, 'a|c'])
+    s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="string")
     s.str.get_dummies(sep='|')
 
 String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``.
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 0be4ebc627b30..4e34d0e5efa02 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -21,6 +21,33 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+Dedicated string data type
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We've added :class:`StringDtype`, an extension type dedicated to string data.
+Previously, strings were typically stored in object-dtype NumPy arrays.
+
+.. ipython:: python
+
+   pd.Series(['abc', None, 'def'], dtype=pd.StringDtype())
+
+You can use the alias ``'string'`` as well.
+
+.. ipython:: python
+
+   s = pd.Series(['abc', None, 'def'], dtype="string")
+   s
+
+The usual string accessor methods work. Where appropriate, the return type
+of the Series or columns of a DataFrame will also have string dtype.
+
+.. ipython:: python
+
+   s.str.upper()
+   s.str.split('b', expand=True).dtypes
+
+We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more.
+
 -
 -
 
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 6351b508fb0e5..5db1814943cf9 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -66,6 +66,7 @@
     PeriodDtype,
     IntervalDtype,
     DatetimeTZDtype,
+    StringDtype,
     # missing
     isna,
     isnull,
diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py
index db01f2a0c674f..9870b5bed076d 100644
--- a/pandas/arrays/__init__.py
+++ b/pandas/arrays/__init__.py
@@ -11,6 +11,7 @@
     PandasArray,
     PeriodArray,
     SparseArray,
+    StringArray,
     TimedeltaArray,
 )
 
@@ -22,5 +23,6 @@
     "PandasArray",
     "PeriodArray",
     "SparseArray",
+    "StringArray",
     "TimedeltaArray",
 ]
diff --git a/pandas/core/api.py b/pandas/core/api.py
index 73323d93b8215..fabd4ee3414ea 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -21,6 +21,7 @@
     DatetimeTZDtype,
 )
 from pandas.core.arrays import Categorical
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import array
 from pandas.core.groupby import Grouper, NamedAgg
 from pandas.io.formats.format import set_eng_float_format
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
index 5c83ed8cf5e24..868118bac6a7b 100644
--- a/pandas/core/arrays/__init__.py
+++ b/pandas/core/arrays/__init__.py
@@ -10,4 +10,5 @@
 from .numpy_ import PandasArray, PandasDtype  # noqa: F401
 from .period import PeriodArray, period_array  # noqa: F401
 from .sparse import SparseArray  # noqa: F401
+from .string_ import StringArray  # noqa: F401
 from .timedeltas import TimedeltaArray  # noqa: F401
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 4e2e37d88eb9a..e3a0ff0eceb1a 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -236,7 +236,11 @@ def __setitem__(self, key, value):
             value = np.asarray(value)
 
         values = self._ndarray
-        t = np.result_type(value, values)
+        if isinstance(value, str):
+            # Avoid issues with result_type and typecodes.
+            t = object
+        else:
+            t = np.result_type(value, values)
         if t != self._ndarray.dtype:
             values = values.astype(t, casting="safe")
             values[key] = value
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
new file mode 100644
index 0000000000000..9a7cfb45d68ea
--- /dev/null
+++ b/pandas/core/arrays/string_.py
@@ -0,0 +1,181 @@
+from typing import Type
+
+import numpy as np
+
+from pandas._libs import lib
+
+from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.common import pandas_dtype
+from pandas.core.dtypes.dtypes import register_extension_dtype
+from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+from pandas.core.dtypes.inference import is_array_like
+
+from pandas.core.arrays import PandasArray
+from pandas.core.construction import extract_array
+
+
+@register_extension_dtype
+class StringDtype(ExtensionDtype):
+    """
+    Extension dtype for text data, named ``"string"``.
+
+    .. versionadded:: 1.0.0
+
+    Examples
+    --------
+    >>> pd.StringDtype()
+    StringDtype
+    """
+
+    @property
+    def na_value(self):
+        return np.nan  # NaN is the missing-value marker for this dtype
+
+    @property
+    def type(self) -> Type:
+        return str  # scalar type of the elements
+
+    @property
+    def name(self) -> str:
+        return "string"  # string name of the dtype, e.g. ``dtype="string"``
+
+    @classmethod
+    def construct_from_string(cls, string: str):
+        if string in {"string", "str"}:  # both spellings map to this dtype
+            return cls()
+        return super().construct_from_string(string)  # defer to the base class
+
+    @classmethod
+    def construct_array_type(cls) -> "Type[StringArray]":
+        return StringArray  # array class associated with this dtype
+
+    def __repr__(self) -> str:
+        return "StringDtype"
+
+
+class StringArray(PandasArray):
+    """
+    Extension array for text data.
+
+    .. versionadded:: 1.0.0
+
+    Parameters
+    ----------
+    values : ndarray
+        Strings and/or missing values; anything else raises ``ValueError``.
+    copy : bool, default False
+        Passed through to the ``PandasArray`` constructor.
+
+    Examples
+    --------
+    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
+    <StringArray>
+    ['This is', 'some text', nan, 'data.']
+    Length: 4, dtype: string
+
+    Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string
+    values.
+
+    >>> pd.array(['1', 1], dtype="string")
+    Traceback (most recent call last):
+    ...
+    ValueError: Must provide strings
+    """
+
+    # undo the PandasArray hack
+    _typ = "extension"
+
+    def __init__(self, values, copy=False):
+        super().__init__(values, copy=copy)
+        self._dtype = StringDtype()
+        self._validate()
+
+    def _validate(self):
+        """Validate that we only store NA or strings."""
+        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
+            raise ValueError("Must provide strings")
+
+    @classmethod
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
+        if dtype:
+            assert dtype == "string"
+        result = super()._from_sequence(scalars, dtype=object, copy=copy)
+        # Convert None to np.nan. TODO: it would be nice to do this in
+        # _validate / lib.is_string_array, which already scans the values.
+        result[result.isna()] = np.nan
+        return result
+
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        return cls._from_sequence(strings, dtype=dtype, copy=copy)
+
+    def __setitem__(self, key, value):
+        """Set ``self[key] = value``; only strings and missing values are allowed."""
+        value = extract_array(value, extract_numpy=True)
+        if isinstance(value, type(self)):
+            value = value._ndarray
+        scalar_key = lib.is_scalar(key)
+        scalar_value = lib.is_scalar(value)
+        if scalar_key and not scalar_value:
+            raise ValueError("setting an array element with a sequence.")
+
+        # validate new items
+        if scalar_value:
+            if value is None:  # None is stored as NaN
+                value = np.nan
+            elif not (isinstance(value, str) or np.isnan(value)):
+                raise ValueError(
+                    "Cannot set value '{}' into a StringArray.".format(value)
+                )
+        else:
+            if not is_array_like(value):
+                value = np.asarray(value, dtype=object)
+            if len(value) and not lib.is_string_array(value, skipna=True):
+                raise ValueError("Must provide strings.")
+
+        super().__setitem__(key, value)
+
+    def fillna(self, value=None, method=None, limit=None):
+        # TODO: validate dtype
+        return super().fillna(value, method, limit)
+
+    def astype(self, dtype, copy=True):
+        """Cast to ``dtype``; for a string dtype return ``self`` or a copy of it."""
+        dtype = pandas_dtype(dtype)
+        if isinstance(dtype, StringDtype):
+            return self.copy() if copy else self
+        return super().astype(dtype, copy)
+
+    def __add__(self, other):
+        return _add(self, other)
+
+    def __radd__(self, other):
+        return _add(self, other, reversed=True)
+
+    def _reduce(self, name, skipna=True, **kwargs):
+        raise TypeError("Cannot perform reduction '{}' with string dtype".format(name))
+
+    def value_counts(self, dropna=False):
+        from pandas import value_counts
+        return value_counts(self._ndarray, dropna=dropna)
+
+
+def _add(array, other, reversed=False):
+    """Element-wise ``array + other``, or ``other + array`` if ``reversed``."""
+    if isinstance(other, (ABCIndexClass, ABCSeries)):
+        return NotImplemented
+
+    na_mask = array.isna()
+    if isinstance(other, type(array)):
+        # Missing in either operand means missing in the result.
+        na_mask |= other.isna()
+        other = other._ndarray[~na_mask]
+    keep = ~na_mask
+
+    left, right = array._ndarray[keep], other
+    if reversed:
+        left, right = right, left
+    result = np.empty_like(array._ndarray, dtype="object")
+    result[na_mask] = np.nan
+    result[keep] = left + right
+    return type(array)(result)
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 6f599a6be6021..c4d791c2be20e 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -128,6 +128,7 @@ def isna(obj):
 
 
 def _isna_new(obj):
+
     if is_scalar(obj):
         return libmissing.checknull(obj)
     # hack (for now) because MI registers as ndarray
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 25350119f9df5..7edf3abc409e0 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -15,6 +15,7 @@
     ensure_object,
     is_bool_dtype,
     is_categorical_dtype,
+    is_extension_array_dtype,
     is_integer,
     is_list_like,
     is_re,
@@ -817,12 +818,15 @@ def _str_extract_frame(arr, pat, flags=0):
         result_index = arr.index
     except AttributeError:
         result_index = None
-    return DataFrame(
+    result = DataFrame(
         [groups_or_na(val) for val in arr],
         columns=columns,
         index=result_index,
         dtype=object,
     )
+    if arr.dtype.name == "string":
+        result = result.astype("string")
+    return result
 
 
 def str_extract(arr, pat, flags=0, expand=True):
@@ -912,7 +916,7 @@ def str_extract(arr, pat, flags=0, expand=True):
         return _str_extract_frame(arr._orig, pat, flags=flags)
     else:
         result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
-        return arr._wrap_result(result, name=name, expand=expand)
+        return arr._wrap_result(result, name=name, expand=expand, returns_string=True)
 
 
 def str_extractall(arr, pat, flags=0):
@@ -1020,7 +1024,9 @@ def str_extractall(arr, pat, flags=0):
 
     index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
 
-    result = arr._constructor_expanddim(match_list, index=index, columns=columns)
+    result = arr._constructor_expanddim(
+        match_list, index=index, columns=columns, dtype=arr.dtype
+    )
     return result
 
 
@@ -1858,11 +1864,18 @@ def wrapper(self, *args, **kwargs):
     return _forbid_nonstring_types
 
 
-def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=["bytes"], **kargs):
+def _noarg_wrapper(
+    f,
+    name=None,
+    docstring=None,
+    forbidden_types=["bytes"],
+    returns_string=True,
+    **kargs
+):
     @forbid_nonstring_types(forbidden_types, name=name)
     def wrapper(self):
         result = _na_map(f, self._parent, **kargs)
-        return self._wrap_result(result)
+        return self._wrap_result(result, returns_string=returns_string)
 
     wrapper.__name__ = f.__name__ if name is None else name
     if docstring is not None:
@@ -1874,22 +1887,28 @@ def wrapper(self):
 
 
 def _pat_wrapper(
-    f, flags=False, na=False, name=None, forbidden_types=["bytes"], **kwargs
+    f,
+    flags=False,
+    na=False,
+    name=None,
+    forbidden_types=["bytes"],
+    returns_string=True,
+    **kwargs
 ):
     @forbid_nonstring_types(forbidden_types, name=name)
     def wrapper1(self, pat):
         result = f(self._parent, pat)
-        return self._wrap_result(result)
+        return self._wrap_result(result, returns_string=returns_string)
 
     @forbid_nonstring_types(forbidden_types, name=name)
     def wrapper2(self, pat, flags=0, **kwargs):
         result = f(self._parent, pat, flags=flags, **kwargs)
-        return self._wrap_result(result)
+        return self._wrap_result(result, returns_string=returns_string)
 
     @forbid_nonstring_types(forbidden_types, name=name)
     def wrapper3(self, pat, na=np.nan):
         result = f(self._parent, pat, na=na)
-        return self._wrap_result(result)
+        return self._wrap_result(result, returns_string=returns_string)
 
     wrapper = wrapper3 if na else wrapper2 if flags else wrapper1
 
@@ -1926,6 +1945,7 @@ class StringMethods(NoNewAttributesMixin):
     def __init__(self, data):
         self._inferred_dtype = self._validate(data)
         self._is_categorical = is_categorical_dtype(data)
+        self._is_string = data.dtype.name == "string"
 
         # .values.categories works for both Series/Index
         self._parent = data.values.categories if self._is_categorical else data
@@ -1967,6 +1987,9 @@ def _validate(data):
         values = getattr(data, "values", data)  # Series / Index
         values = getattr(values, "categories", values)  # categorical / normal
 
+        if is_extension_array_dtype(values.dtype):
+            return str(values.dtype)
+
         try:
             inferred_dtype = lib.infer_dtype(values, skipna=True)
         except ValueError:
@@ -1992,7 +2015,13 @@ def __iter__(self):
             g = self.get(i)
 
     def _wrap_result(
-        self, result, use_codes=True, name=None, expand=None, fill_value=np.nan
+        self,
+        result,
+        use_codes=True,
+        name=None,
+        expand=None,
+        fill_value=np.nan,
+        returns_string=True,
     ):
 
         from pandas import Index, Series, MultiIndex
@@ -2069,11 +2098,14 @@ def cons_row(x):
             index = self._orig.index
             if expand:
                 cons = self._orig._constructor_expanddim
-                return cons(result, columns=name, index=index)
+                result = cons(result, columns=name, index=index)
             else:
                 # Must be a Series
                 cons = self._orig._constructor
-                return cons(result, name=name, index=index)
+                result = cons(result, name=name, index=index)
+            if self._is_string and returns_string:
+                result = result.astype("string")
+            return result
 
     def _get_series_list(self, others):
         """
@@ -2339,7 +2371,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
             result = Index(result, dtype=object, name=self._orig.name)
         else:  # Series
             result = Series(
-                result, dtype=object, index=data.index, name=self._orig.name
+                result, dtype=self._orig.dtype, index=data.index, name=self._orig.name
             )
         return result
 
@@ -2479,13 +2511,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
     @forbid_nonstring_types(["bytes"])
     def split(self, pat=None, n=-1, expand=False):
         result = str_split(self._parent, pat, n=n)
-        return self._wrap_result(result, expand=expand)
+        return self._wrap_result(result, expand=expand, returns_string=expand)
 
     @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
     @forbid_nonstring_types(["bytes"])
     def rsplit(self, pat=None, n=-1, expand=False):
         result = str_rsplit(self._parent, pat, n=n)
-        return self._wrap_result(result, expand=expand)
+        return self._wrap_result(result, expand=expand, returns_string=expand)
 
     _shared_docs[
         "str_partition"
@@ -2586,7 +2618,7 @@ def rsplit(self, pat=None, n=-1, expand=False):
     def partition(self, sep=" ", expand=True):
         f = lambda x: x.partition(sep)
         result = _na_map(f, self._parent)
-        return self._wrap_result(result, expand=expand)
+        return self._wrap_result(result, expand=expand, returns_string=expand)
 
     @Appender(
         _shared_docs["str_partition"]
@@ -2602,7 +2634,7 @@ def partition(self, sep=" ", expand=True):
     def rpartition(self, sep=" ", expand=True):
         f = lambda x: x.rpartition(sep)
         result = _na_map(f, self._parent)
-        return self._wrap_result(result, expand=expand)
+        return self._wrap_result(result, expand=expand, returns_string=expand)
 
     @copy(str_get)
     def get(self, i):
@@ -2621,13 +2653,13 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
         result = str_contains(
             self._parent, pat, case=case, flags=flags, na=na, regex=regex
         )
-        return self._wrap_result(result, fill_value=na)
+        return self._wrap_result(result, fill_value=na, returns_string=False)
 
     @copy(str_match)
     @forbid_nonstring_types(["bytes"])
     def match(self, pat, case=True, flags=0, na=np.nan):
         result = str_match(self._parent, pat, case=case, flags=flags, na=na)
-        return self._wrap_result(result, fill_value=na)
+        return self._wrap_result(result, fill_value=na, returns_string=False)
 
     @copy(str_replace)
     @forbid_nonstring_types(["bytes"])
@@ -2762,13 +2794,14 @@ def slice_replace(self, start=None, stop=None, repl=None):
     def decode(self, encoding, errors="strict"):
         # need to allow bytes here
         result = str_decode(self._parent, encoding, errors)
-        return self._wrap_result(result)
+        # TODO: Not sure how to handle this.
+        return self._wrap_result(result, returns_string=False)
 
     @copy(str_encode)
     @forbid_nonstring_types(["bytes"])
     def encode(self, encoding, errors="strict"):
         result = str_encode(self._parent, encoding, errors)
-        return self._wrap_result(result)
+        return self._wrap_result(result, returns_string=False)
 
     _shared_docs[
         "str_strip"
@@ -2869,7 +2902,11 @@ def get_dummies(self, sep="|"):
         data = self._orig.astype(str) if self._is_categorical else self._parent
         result, name = str_get_dummies(data, sep)
         return self._wrap_result(
-            result, use_codes=(not self._is_categorical), name=name, expand=True
+            result,
+            use_codes=(not self._is_categorical),
+            name=name,
+            expand=True,
+            returns_string=False,
         )
 
     @copy(str_translate)
@@ -2878,10 +2915,16 @@ def translate(self, table):
         result = str_translate(self._parent, table)
         return self._wrap_result(result)
 
-    count = _pat_wrapper(str_count, flags=True, name="count")
-    startswith = _pat_wrapper(str_startswith, na=True, name="startswith")
-    endswith = _pat_wrapper(str_endswith, na=True, name="endswith")
-    findall = _pat_wrapper(str_findall, flags=True, name="findall")
+    count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False)
+    startswith = _pat_wrapper(
+        str_startswith, na=True, name="startswith", returns_string=False
+    )
+    endswith = _pat_wrapper(
+        str_endswith, na=True, name="endswith", returns_string=False
+    )
+    findall = _pat_wrapper(
+        str_findall, flags=True, name="findall", returns_string=False
+    )
 
     @copy(str_extract)
     @forbid_nonstring_types(["bytes"])
@@ -2929,7 +2972,7 @@ def extractall(self, pat, flags=0):
     @forbid_nonstring_types(["bytes"])
     def find(self, sub, start=0, end=None):
         result = str_find(self._parent, sub, start=start, end=end, side="left")
-        return self._wrap_result(result)
+        return self._wrap_result(result, returns_string=False)
 
     @Appender(
         _shared_docs["find"]
@@ -2942,7 +2985,7 @@ def find(self, sub, start=0, end=None):
     @forbid_nonstring_types(["bytes"])
     def rfind(self, sub, start=0, end=None):
         result = str_find(self._parent, sub, start=start, end=end, side="right")
-        return self._wrap_result(result)
+        return self._wrap_result(result, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
     def normalize(self, form):
@@ -3004,7 +3047,7 @@ def normalize(self, form):
     @forbid_nonstring_types(["bytes"])
     def index(self, sub, start=0, end=None):
         result = str_index(self._parent, sub, start=start, end=end, side="left")
-        return self._wrap_result(result)
+        return self._wrap_result(result, returns_string=False)
 
     @Appender(
         _shared_docs["index"]
@@ -3018,7 +3061,7 @@ def index(self, sub, start=0, end=None):
     @forbid_nonstring_types(["bytes"])
     def rindex(self, sub, start=0, end=None):
         result = str_index(self._parent, sub, start=start, end=end, side="right")
-        return self._wrap_result(result)
+        return self._wrap_result(result, returns_string=False)
 
     _shared_docs[
         "len"
@@ -3067,7 +3110,11 @@ def rindex(self, sub, start=0, end=None):
     dtype: float64
     """
     len = _noarg_wrapper(
-        len, docstring=_shared_docs["len"], forbidden_types=None, dtype=int
+        len,
+        docstring=_shared_docs["len"],
+        forbidden_types=None,
+        dtype=int,
+        returns_string=False,
     )
 
     _shared_docs[
@@ -3339,46 +3386,55 @@ def rindex(self, sub, start=0, end=None):
         lambda x: x.isalnum(),
         name="isalnum",
         docstring=_shared_docs["ismethods"] % _doc_args["isalnum"],
+        returns_string=False,
     )
     isalpha = _noarg_wrapper(
         lambda x: x.isalpha(),
         name="isalpha",
         docstring=_shared_docs["ismethods"] % _doc_args["isalpha"],
+        returns_string=False,
     )
     isdigit = _noarg_wrapper(
         lambda x: x.isdigit(),
         name="isdigit",
         docstring=_shared_docs["ismethods"] % _doc_args["isdigit"],
+        returns_string=False,
     )
     isspace = _noarg_wrapper(
         lambda x: x.isspace(),
         name="isspace",
         docstring=_shared_docs["ismethods"] % _doc_args["isspace"],
+        returns_string=False,
     )
     islower = _noarg_wrapper(
         lambda x: x.islower(),
         name="islower",
         docstring=_shared_docs["ismethods"] % _doc_args["islower"],
+        returns_string=False,
     )
     isupper = _noarg_wrapper(
         lambda x: x.isupper(),
         name="isupper",
         docstring=_shared_docs["ismethods"] % _doc_args["isupper"],
+        returns_string=False,
     )
     istitle = _noarg_wrapper(
         lambda x: x.istitle(),
         name="istitle",
         docstring=_shared_docs["ismethods"] % _doc_args["istitle"],
+        returns_string=False,
     )
     isnumeric = _noarg_wrapper(
         lambda x: x.isnumeric(),
         name="isnumeric",
         docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"],
+        returns_string=False,
     )
     isdecimal = _noarg_wrapper(
         lambda x: x.isdecimal(),
         name="isdecimal",
         docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"],
+        returns_string=False,
     )
 
     @classmethod
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 326bef7f4b480..db7888f4b7dca 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -70,6 +70,7 @@ class TestPDApi(Base):
         "SparseDataFrame",
         "SparseDtype",
         "SparseSeries",
+        "StringDtype",
         "Timedelta",
         "TimedeltaIndex",
         "Timestamp",
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
new file mode 100644
index 0000000000000..2ab9488461e16
--- /dev/null
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -0,0 +1,60 @@
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+def test_none_to_nan():
+    a = pd.arrays.StringArray._from_sequence(["a", None, "b"])
+    assert a[1] is not None
+    assert np.isnan(a[1])
+
+
+def test_setitem_validates():
+    a = pd.arrays.StringArray._from_sequence(["a", "b"])
+    with pytest.raises(ValueError, match="10"):
+        a[0] = 10
+
+    with pytest.raises(ValueError, match="strings"):
+        a[:] = np.array([1, 2])
+
+
+@pytest.mark.parametrize(
+    "input, method",
+    [
+        (["a", "b", "c"], operator.methodcaller("capitalize")),
+        (["a", "b", "c"], operator.methodcaller("upper")),
+        (["a b", "a bc. de"], operator.methodcaller("capitalize")),
+    ],
+)
+def test_string_methods(input, method):
+    a = pd.Series(input, dtype="string")
+    b = pd.Series(input, dtype="object")
+    result = method(a.str)  # .str method on the "string" extension dtype
+    expected = method(b.str)  # same method on object dtype is the reference
+
+    assert result.dtype.name == "string"  # dtype must be preserved
+    tm.assert_series_equal(result.astype(object), expected)
+
+
+def test_add():
+    a = pd.Series(["a", "b", "c", None, None], dtype="string")
+    b = pd.Series(["x", "y", None, "z", None], dtype="string")
+
+    result = a + b
+    expected = pd.Series(["ax", "by", None, None, None], dtype="string")
+    tm.assert_series_equal(result, expected)
+
+    result = a.add(b)
+    tm.assert_series_equal(result, expected)
+
+    result = a.radd(b)
+    expected = pd.Series(["xa", "yb", None, None, None], dtype="string")
+    tm.assert_series_equal(result, expected)
+
+    result = a.add(b, fill_value="-")
+    expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string")
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
new file mode 100644
index 0000000000000..a09b3b424269a
--- /dev/null
+++ b/pandas/tests/extension/test_string.py
@@ -0,0 +1,105 @@
+import random
+import string
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.core.arrays.string_ import StringArray, StringDtype
+from pandas.tests.extension import base
+
+
+@pytest.fixture
+def dtype():
+    return StringDtype()
+
+
+@pytest.fixture
+def data():
+    strings = random.choices(string.ascii_letters, k=100)
+    while strings[0] == strings[1]:
+        strings = random.choices(string.ascii_letters, k=100)
+
+    return StringArray._from_sequence(strings)
+
+
+@pytest.fixture
+def data_missing():
+    """Length 2 array with [NA, Valid]"""
+    return StringArray._from_sequence([np.nan, "A"])
+
+
+@pytest.fixture
+def data_for_sorting():
+    return StringArray._from_sequence(["B", "C", "A"])
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    return StringArray._from_sequence(["B", np.nan, "A"])
+
+
+@pytest.fixture
+def na_value():
+    return np.nan
+
+
+@pytest.fixture
+def data_for_grouping():
+    return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"])
+
+
+class TestDtype(base.BaseDtypeTests):
+    pass
+
+
+class TestInterface(base.BaseInterfaceTests):
+    pass
+
+
+class TestConstructors(base.BaseConstructorsTests):
+    pass
+
+
+class TestReshaping(base.BaseReshapingTests):
+    pass
+
+
+class TestGetitem(base.BaseGetitemTests):
+    pass
+
+
+class TestSetitem(base.BaseSetitemTests):
+    pass
+
+
+class TestMissing(base.BaseMissingTests):
+    pass
+
+
+class TestReduce(base.BaseNoReduceTests):
+    pass
+
+
+class TestMethods(base.BaseMethodsTests):
+    pass
+
+
+class TestCasting(base.BaseCastingTests):
+    pass
+
+
+class TestComparisonOps(base.BaseComparisonOpsTests):
+    def _compare_other(self, s, data, op_name, other):
+        result = getattr(s, op_name)(other)
+        expected = getattr(s.astype(object), op_name)(other)
+        self.assert_series_equal(result, expected)
+
+    def test_compare_scalar(self, data, all_compare_operators):
+        op_name = all_compare_operators
+        s = pd.Series(data)
+        self._compare_other(s, data, op_name, "abc")
+
+
+class TestParsing(base.BaseParsingTests):
+    pass
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index bc8dc7272a83a..0b51fd8682913 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -6,6 +6,8 @@
 from numpy.random import randint
 import pytest
 
+from pandas._libs import lib
+
 from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna
 import pandas.core.strings as strings
 import pandas.util.testing as tm
@@ -3269,3 +3271,26 @@ def test_casefold(self):
         result = s.str.casefold()
 
         tm.assert_series_equal(result, expected)
+
+
+def test_string_array(any_string_method):
+    data = ["a", "bb", np.nan, "ccc"]
+    a = Series(data, dtype=object)
+    b = Series(data, dtype="string")
+    method_name, args, kwargs = any_string_method
+
+    expected = getattr(a.str, method_name)(*args, **kwargs)
+    result = getattr(b.str, method_name)(*args, **kwargs)
+
+    if isinstance(expected, Series):
+        if expected.dtype == "object" and lib.is_string_array(
+            expected.values, skipna=True
+        ):
+            assert result.dtype == "string"
+            result = result.astype(object)
+        tm.assert_series_equal(result, expected)
+    elif isinstance(expected, DataFrame):
+        columns = expected.select_dtypes(include="object").columns
+        assert all(result[columns].dtypes == "string")
+        result[columns] = result[columns].astype(object)
+    tm.assert_equal(result, expected)
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index cf8452cdd0c59..73f07a83dd4fa 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -1434,6 +1434,9 @@ def assert_equal(left, right, **kwargs):
         assert_extension_array_equal(left, right, **kwargs)
     elif isinstance(left, np.ndarray):
         assert_numpy_array_equal(left, right, **kwargs)
+    elif isinstance(left, str):
+        assert kwargs == {}  # no comparison options apply to plain strings
+        assert left == right  # must assert: returning the bool never fails
     else:
         raise NotImplementedError(type(left))
 

From 3ecb5cc9610500bd588d9b2134120b4562d58642 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 16 Aug 2019 13:16:46 -0500
Subject: [PATCH 02/49] test fixups

---
 pandas/core/arrays/string_.py |  2 +-
 pandas/core/strings.py        | 21 ++++++++++++++++-----
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 9a7cfb45d68ea..2f641c9fcd53c 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -41,7 +41,7 @@ def name(self) -> str:
 
     @classmethod
     def construct_from_string(cls, string: str):
-        if string in {"string", "str"}:
+        if string == "string":
             return cls()
         return super().construct_from_string(string)
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 7edf3abc409e0..aa3f72969d366 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -1024,8 +1024,16 @@ def str_extractall(arr, pat, flags=0):
 
     index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
 
+    # workaround #27953
+    from pandas import StringDtype
+
+    if isinstance(arr.dtype, StringDtype):
+        dtype = arr.dtype
+    else:
+        dtype = None
+
     result = arr._constructor_expanddim(
-        match_list, index=index, columns=columns, dtype=arr.dtype
+        match_list, index=index, columns=columns, dtype=dtype
     )
     return result
 
@@ -1079,7 +1087,7 @@ def str_get_dummies(arr, sep="|"):
 
     for i, t in enumerate(tags):
         pat = sep + t + sep
-        dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
+        dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x)
     return dummies, tags
 
 
@@ -2370,9 +2378,12 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
             # add dtype for case that result is all-NA
             result = Index(result, dtype=object, name=self._orig.name)
         else:  # Series
-            result = Series(
-                result, dtype=self._orig.dtype, index=data.index, name=self._orig.name
-            )
+            if is_categorical_dtype(self._orig.dtype):
+                # We need to infer the new categories.
+                dtype = None
+            else:
+                dtype = self._orig.dtype
+            result = Series(result, dtype=dtype, index=data.index, name=self._orig.name)
         return result
 
     _shared_docs[

From 59a7d398ecdb1f85c3edc2249e7b02ecea0d3110 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 16 Aug 2019 14:20:30 -0500
Subject: [PATCH 03/49] string dtype

---
 pandas/core/strings.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index aa3f72969d366..64953878d978f 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -15,7 +15,6 @@
     ensure_object,
     is_bool_dtype,
     is_categorical_dtype,
-    is_extension_array_dtype,
     is_integer,
     is_list_like,
     is_re,
@@ -1984,6 +1983,8 @@ def _validate(data):
         -------
         dtype : inferred dtype of data
         """
+        from pandas import StringDtype
+
         if isinstance(data, ABCMultiIndex):
             raise AttributeError(
                 "Can only use .str accessor with Index, not MultiIndex"
@@ -1995,8 +1996,9 @@ def _validate(data):
         values = getattr(data, "values", data)  # Series / Index
         values = getattr(values, "categories", values)  # categorical / normal
 
-        if is_extension_array_dtype(values.dtype):
-            return str(values.dtype)
+        # explicitly allow StringDtype
+        if isinstance(values.dtype, StringDtype):
+            return "string"
 
         try:
             inferred_dtype = lib.infer_dtype(values, skipna=True)

From 7c07070053eba9149979458d3ae084b8ea035190 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 16 Aug 2019 14:52:27 -0500
Subject: [PATCH 04/49] 35 compat

---
 pandas/tests/extension/test_string.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index a09b3b424269a..91dbd8d801c3e 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -16,7 +16,8 @@ def dtype():
 
 @pytest.fixture
 def data():
-    strings = random.choices(string.ascii_letters, k=100)
+    # strings = random.choices(string.ascii_letters, k=100)
+    strings = np.random.choice(list(string.ascii_letters), size=100)
     while strings[0] == strings[1]:
         strings = random.choices(string.ascii_letters, k=100)
 

From 9e1a73b803d598ee6ac7699508dd8046ef5f2d0a Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 16 Aug 2019 14:53:08 -0500
Subject: [PATCH 05/49] doc

---
 doc/source/user_guide/text.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index 762c16e9a84eb..6e76945f19bed 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -232,7 +232,8 @@ positional argument (a regex object) and return a string.
    def repl(m):
        return m.group(0)[::-1]
 
-   pd.Series(['foo 123', 'bar baz', np.nan], dtype="string").str.replace(pat, repl)
+   pd.Series(['foo 123', 'bar baz', np.nan],
+             dtype="string").str.replace(pat, repl)
 
    # Using regex groups
    pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
@@ -240,7 +241,8 @@ positional argument (a regex object) and return a string.
    def repl(m):
        return m.group('two').swapcase()
 
-   pd.Series(['Foo Bar Baz', np.nan], dtype="string").str.replace(pat, repl)
+   pd.Series(['Foo Bar Baz', np.nan],
+             dtype="string").str.replace(pat, repl)
 
 .. versionadded:: 0.20.0
 

From 16ccad817bfccefe76f8fc42106cae7e5edaed19 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 16 Aug 2019 15:46:38 -0500
Subject: [PATCH 06/49] fixups

---
 doc/source/user_guide/text.rst        | 4 ++--
 doc/source/whatsnew/v1.0.0.rst        | 2 +-
 pandas/tests/extension/test_string.py | 4 +---
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index 6e76945f19bed..2125e68840da3 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -160,8 +160,8 @@ It is easy to expand this to return a DataFrame using ``expand``.
 
    s2.str.split('_', expand=True)
 
-When original ``Series`` has :ref:`StringDtype`, the output columns will all
-be :ref:`StringDtype` as well.
+When the original ``Series`` has :class:`StringDtype`, the output columns will
+all be :class:`StringDtype` as well.
 
 It is also possible to limit the number of splits:
 
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 4e34d0e5efa02..6eed40c69c80c 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -24,7 +24,7 @@ Enhancements
 Dedicated string data type
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-We've added :ref:`StringDtype`, an extension type dedicated to string data.
+We've added :class:`StringDtype`, an extension type dedicated to string data.
 Previously, strings were typically stored in object-dtype NumPy arrays.
 
 .. ipython:: python
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 91dbd8d801c3e..cba10b1b7f88b 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -1,4 +1,3 @@
-import random
 import string
 
 import numpy as np
@@ -16,10 +15,9 @@ def dtype():
 
 @pytest.fixture
 def data():
-    # strings = random.choices(string.ascii_letters, k=100)
     strings = np.random.choice(list(string.ascii_letters), size=100)
     while strings[0] == strings[1]:
-        strings = random.choices(string.ascii_letters, k=100)
+        strings = np.random.choice(list(string.ascii_letters), size=100)
 
     return StringArray._from_sequence(strings)
 

From 1027463a6a051ded62df8724836d49ea2b86d578 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 16 Aug 2019 16:21:36 -0500
Subject: [PATCH 07/49] doc

---
 pandas/core/arrays/string_.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 2f641c9fcd53c..bde590ab94a86 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -21,6 +21,11 @@ class StringDtype(ExtensionDtype):
 
     .. versionadded:: 1.0.0
 
+    Attributes
+    ----------
+    na_value
+    name
+
     Examples
     --------
     >>> pd.StringDtype()
@@ -29,6 +34,9 @@ class StringDtype(ExtensionDtype):
 
     @property
     def na_value(self):
+        """
+        StringDtype uses :attr:`numpy.nan` as its missing value.
+        """
         return np.nan
 
     @property
@@ -37,6 +45,9 @@ def type(self) -> Type:
 
     @property
     def name(self) -> str:
+        """
+        The alias for StringDtype is ``'string'``.
+        """
         return "string"
 
     @classmethod

From aafb53bc9bbf476c2eb15942a71450bde152efa4 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sun, 18 Aug 2019 21:26:58 -0500
Subject: [PATCH 08/49] doc

---
 pandas/core/arrays/string_.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index bde590ab94a86..bddfea77f4795 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -23,7 +23,6 @@ class StringDtype(ExtensionDtype):
 
     Attributes
     ----------
-    na_value
     name
 
     Examples

From ab49169be613ba811a1e617c59d1d6d299592ba0 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 19 Aug 2019 10:03:58 -0500
Subject: [PATCH 09/49] fix doc warnings

---
 doc/source/reference/arrays.rst |  4 ++--
 pandas/core/arrays/string_.py   | 12 ++++++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index f1a155ca85cbf..db620e73301cb 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -467,7 +467,7 @@ Text data
 ---------
 
 When working with text data, where each valid element is a string, we recommend using
-:ref:`StringDtype` (with the alias ``"string"``).
+:class:`StringDtype` (with the alias ``"string"``).
 
 .. autosummary::
    :toctree: api/
@@ -481,7 +481,7 @@ When working with text data, where each valid element is a string, we recommend
 
    StringDtype
 
-The ``Series.str`` accessor is available for ``Series`` backed by a :ref:`arrays.StringArray`.
+The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.StringArray`.
 See :ref:`api.series.str` for more.
 
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index bddfea77f4795..8966612312312 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -21,6 +21,10 @@ class StringDtype(ExtensionDtype):
 
     .. versionadded:: 1.0.0
 
+    Parameters
+    ----------
+    None
+
     Attributes
     ----------
     name
@@ -74,6 +78,14 @@ class StringArray(PandasArray):
     values : ndarray
     copy : bool, default False
 
+    Attributes
+    ----------
+    None
+
+    Methods
+    -------
+    None
+
     Examples
     --------
     >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")

From 978fb55dedeb8e9aa998a1c00572ab906b4a0668 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 19 Aug 2019 11:11:03 -0500
Subject: [PATCH 10/49] fixup docstrings

---
 pandas/core/arrays/string_.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 8966612312312..0efb7ca97ab3c 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -21,14 +21,14 @@ class StringDtype(ExtensionDtype):
 
     .. versionadded:: 1.0.0
 
-    Parameters
-    ----------
-    None
-
     Attributes
     ----------
     name
 
+    Methods
+    -------
+    None
+
     Examples
     --------
     >>> pd.StringDtype()
@@ -75,8 +75,16 @@ class StringArray(PandasArray):
 
     Parameters
     ----------
-    values : ndarray
+    values : array-like
+        The array of data.
+
+        .. warning::
+
+           Currently, this expects an object-dtype ndarray
+           where the elements are Python strings. This may
+           change without warning in the future.
     copy : bool, default False
+        Whether to copy the array of data.
 
     Attributes
     ----------
@@ -86,10 +94,16 @@ class StringArray(PandasArray):
     -------
     None
 
+    See Also
+    --------
+    Series.str
+        The string methods are available on Series backed by
+        a StringArray.
+
     Examples
     --------
     >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
-    <PandasArray>
+    <StringArray>
     ['This is', 'some text', nan, 'data.']
     Length: 4, dtype: string
 

From aebc68870acc4ca2d7b3940d6e0e3e7f1bd456cc Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 19 Aug 2019 11:43:10 -0500
Subject: [PATCH 11/49] fixup docstrings

---
 pandas/core/arrays/string_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 0efb7ca97ab3c..5e20f3d2e3c91 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -23,7 +23,7 @@ class StringDtype(ExtensionDtype):
 
     Attributes
     ----------
-    name
+    None
 
     Methods
     -------

From 41dc0f96cd93e133e329c6c08129e4da8d111133 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 9 Sep 2019 09:40:18 -0500
Subject: [PATCH 12/49] lint

---
 pandas/core/api.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/core/api.py b/pandas/core/api.py
index ea6cfa7210bb9..04f2f84c92a15 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -10,6 +10,7 @@
 )
 from pandas.core.dtypes.missing import isna, isnull, notna, notnull
 
+# TODO: Remove get_dummies import when statsmodels updates #18264
 from pandas.core.algorithms import factorize, unique, value_counts
 from pandas.core.arrays import Categorical
 from pandas.core.arrays.integer import (
@@ -44,9 +45,7 @@
 from pandas.core.indexes.period import Period, period_range
 from pandas.core.indexes.timedeltas import Timedelta, timedelta_range
 from pandas.core.indexing import IndexSlice
-from pandas.core.reshape.reshape import (  # TODO: Remove get_dummies import when statsmodels updates #18264
-    get_dummies,
-)
+from pandas.core.reshape.reshape import get_dummies
 from pandas.core.series import Series
 from pandas.core.tools.datetimes import to_datetime
 from pandas.core.tools.numeric import to_numeric

From 13cdddd9bbf59fb01f2665706a5e35c1b5ca64a1 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 16 Sep 2019 13:12:25 -0500
Subject: [PATCH 13/49] typing

---
 pandas/core/arrays/string_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 5e20f3d2e3c91..4fbcbd43a2595 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -54,7 +54,7 @@ def name(self) -> str:
         return "string"
 
     @classmethod
-    def construct_from_string(cls, string: str):
+    def construct_from_string(cls, string: str) -> ExtensionDtype:
         if string == "string":
             return cls()
         return super().construct_from_string(string)

From 78c2eaa44269c2b29aa8849bd7f464e264bb0aff Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 18 Sep 2019 16:40:14 -0500
Subject: [PATCH 14/49] removed double assert

---
 pandas/tests/test_strings.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 0b51fd8682913..b50f1a0fd2f2a 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -3288,7 +3288,6 @@ def test_string_array(any_string_method):
         ):
             assert result.dtype == "string"
             result = result.astype(object)
-        tm.assert_series_equal(result, expected)
     elif isinstance(expected, DataFrame):
         columns = expected.select_dtypes(include="object").columns
         assert all(result[columns].dtypes == "string")

From 726d0afcfbe896caf8bd5010f362fdbd55fd3236 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 19 Sep 2019 06:12:42 -0500
Subject: [PATCH 15/49] experimental

---
 doc/source/user_guide/text.rst | 5 +++++
 pandas/core/arrays/string_.py  | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index 2125e68840da3..4405497d7c376 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -31,6 +31,11 @@ for many reasons:
 3. When reading code, the contents of an ``object`` dtype array is less clear
    than ``string``.
 
+
+.. warning::
+
+   StringArray is currently considered experimental.
+
 For backwards-compatibility, ``object`` dtype remains the default type we
 infer a list of strings to
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 4fbcbd43a2595..81d135401eda0 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -73,6 +73,11 @@ class StringArray(PandasArray):
 
     .. versionadded:: 1.0.0
 
+    .. warning::
+
+       StringArray is considered experimental. The implementation and
+       parts of the API may change without warning.
+
     Parameters
     ----------
     values : array-like

From 9cd99459889d5aa4b7e8e662cd91ff90af94250e Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 19 Sep 2019 07:50:45 -0500
Subject: [PATCH 16/49] failing

---
 pandas/core/arrays/numpy_.py               | 10 +++++----
 pandas/core/arrays/string_.py              | 26 +++++++++++++---------
 pandas/tests/arrays/string_/test_string.py | 19 ++++++++++++++++
 3 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 32da0199e28f8..a979135f1b9d7 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -229,13 +229,15 @@ def __getitem__(self, item):
     def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)
 
-        if not lib.is_scalar(key) and is_list_like(key):
+        scalar_key = lib.is_scalar(key)
+        scalar_value = lib.is_scalar(value)
+
+        if not scalar_key and is_list_like(key):
             key = np.asarray(key)
 
-        if not lib.is_scalar(value):
-            value = np.asarray(value)
+        if not scalar_value:
+            value = np.asarray(value, dtype=self._ndarray.dtype)
 
-        value = np.asarray(value, dtype=self._ndarray.dtype)
         self._ndarray[key] = value
 
     def __len__(self) -> int:
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 81d135401eda0..432a125b8ec5e 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -1,3 +1,4 @@
+import operator
 from typing import Type
 
 import numpy as np
@@ -7,9 +8,10 @@
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.common import pandas_dtype
 from pandas.core.dtypes.dtypes import register_extension_dtype
-from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
 from pandas.core.dtypes.inference import is_array_like
 
+from pandas.core import ops
 from pandas.core.arrays import PandasArray
 from pandas.core.construction import extract_array
 
@@ -132,7 +134,12 @@ def __init__(self, values, copy=False):
     def _validate(self):
         """Validate that we only store NA or strings."""
         if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError("Must provide strings")
+            raise ValueError("StringArray requires an object-dtype ndarray of strings.")
+        if self._ndarray.dtype != "object":
+            raise ValueError(
+                "StringArray requires an object-dtype ndarray. Got "
+                "'{}' instead.".format(self._ndarray.dtype)
+            )
 
     @classmethod
     def _from_sequence(cls, scalars, dtype=None, copy=False):
@@ -152,7 +159,9 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
     def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)
         if isinstance(value, type(self)):
+            # extract_array doesn't extract PandasArray subclasses
             value = value._ndarray
+
         scalar_key = lib.is_scalar(key)
         scalar_value = lib.is_scalar(value)
         if scalar_key and not scalar_value:
@@ -187,10 +196,10 @@ def astype(self, dtype, copy=True):
         return super().astype(dtype, copy)
 
     def __add__(self, other):
-        return _add(self, other)
+        return _add(self, other, operator.add)
 
     def __radd__(self, other):
-        return _add(self, other, reversed=True)
+        return _add(self, other, ops.radd)
 
     def _reduce(self, name, skipna=True, **kwargs):
         raise TypeError("Cannot perform reduction '{}' with string dtype".format(name))
@@ -201,8 +210,8 @@ def value_counts(self, dropna=False):
         return value_counts(self._ndarray, dropna=dropna)
 
 
-def _add(array, other, reversed=False):
-    if isinstance(other, (ABCIndexClass, ABCSeries)):
+def _add(array, other, op):
+    if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)):
         return NotImplemented
 
     mask = array.isna()
@@ -214,9 +223,6 @@ def _add(array, other, reversed=False):
 
     out = np.empty_like(array._ndarray, dtype="object")
     out[mask] = np.nan
-    if reversed:
-        out[valid] = other + array._ndarray[valid]
-    else:
-        out[valid] = array._ndarray[valid] + other
+    out[valid] = op(array._ndarray[valid], other)
 
     return type(array)(out)
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 2ab9488461e16..4846ea1860ad3 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -58,3 +58,22 @@ def test_add():
     result = a.add(b, fill_value="-")
     expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string")
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.xfail(reason="TODO")  # failing when adding np.nan to ndarray(['y'])
+def test_add_frame():
+    array = pd.array(["a", "b", np.nan, np.nan], dtype="string")
+    df = pd.DataFrame([["x", np.nan, "y", np.nan]])
+
+    assert array.__add__(df) is NotImplemented
+    result = array + df
+    expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_constructor_raises():
+    with pytest.raises(ValueError, match="object-dtype ndarray"):
+        pd.arrays.StringArray(np.array(["a", "b"], dtype="S1"))
+
+    with pytest.raises(ValueError, match="object-dtype ndarray"):
+        pd.arrays.StringArray(np.array([]))

From 070fb76e437d165951050e946f7500a7d344f0d4 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 19 Sep 2019 09:29:02 -0500
Subject: [PATCH 17/49] xfails

---
 pandas/tests/arrays/string_/test_string.py | 24 ++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 4846ea1860ad3..91414414f18ad 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -60,14 +60,34 @@ def test_add():
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(reason="TODO")  # failing when adding np.nan to ndarray(['y'])
+@pytest.mark.xfail(reason="GH-28527")
+def test_add_strings():
+    array = pd.array(["a", "b", "c", "d"], dtype="string")
+    df = pd.DataFrame([["t", "u", "v", "w"]])
+    assert array.__add__(df) is NotImplemented
+
+    result = array + df
+    expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("string")
+    tm.assert_frame_equal(result, expected)
+
+    result = df + array
+    expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("string")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.xfail(reason="GH-28527")
 def test_add_frame():
     array = pd.array(["a", "b", np.nan, np.nan], dtype="string")
     df = pd.DataFrame([["x", np.nan, "y", np.nan]])
 
     assert array.__add__(df) is NotImplemented
+
     result = array + df
-    expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]])
+    expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("string")
+    tm.assert_frame_equal(result, expected)
+
+    result = df + array
+    expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("string")
     tm.assert_frame_equal(result, expected)
 
 

From 2b90639f57fcf939d2a795ea7d9256d428cd83a0 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 19 Sep 2019 09:38:51 -0500
Subject: [PATCH 18/49] Handle non-ndarray in add

---
 pandas/core/arrays/string_.py              | 22 ++++++++++++++++------
 pandas/tests/arrays/string_/test_string.py | 13 +++++++++++++
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 432a125b8ec5e..63c1ddad4140b 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -1,5 +1,5 @@
 import operator
-from typing import Type
+from typing import TYPE_CHECKING, Type
 
 import numpy as np
 
@@ -14,6 +14,10 @@
 from pandas.core import ops
 from pandas.core.arrays import PandasArray
 from pandas.core.construction import extract_array
+from pandas.core.missing import isna
+
+if TYPE_CHECKING:
+    from pandas._typing import Scalar, ArrayLike
 
 
 @register_extension_dtype
@@ -38,7 +42,7 @@ class StringDtype(ExtensionDtype):
     """
 
     @property
-    def na_value(self):
+    def na_value(self) -> "Scalar":
         """
         StringDtype uses :attr:`numpy.nan` as the missing NA value.
         """
@@ -210,14 +214,20 @@ def value_counts(self, dropna=False):
         return value_counts(self._ndarray, dropna=dropna)
 
 
-def _add(array, other, op):
+def _add(array: StringArray, other: "ArrayLike", op):
     if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)):
         return NotImplemented
 
-    mask = array.isna()
     if isinstance(other, type(array)):
-        mask |= other.isna()
-        other = other._ndarray[~mask]
+        other = other._ndarray
+
+    other = np.asarray(other)
+
+    mask = array.isna()
+    if not lib.is_scalar(other):
+        mask |= isna(other)
+
+        other = other[~mask]
 
     valid = ~mask
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 91414414f18ad..75db0ba6c988b 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -60,6 +60,19 @@ def test_add():
     tm.assert_series_equal(result, expected)
 
 
+def test_add_sequence():
+    a = pd.array(["a", "b", None, None], dtype="string")
+    other = ["x", None, "y", None]
+
+    result = a + other
+    expected = pd.array(["ax", None, None, None], dtype="string")
+    tm.assert_extension_array_equal(result, expected)
+
+    result = other + a
+    expected = pd.array(["xa", None, None, None], dtype="string")
+    tm.assert_extension_array_equal(result, expected)
+
+
 @pytest.mark.xfail(reason="GH-28527")
 def test_add_strings():
     array = pd.array(["a", "b", "c", "d"], dtype="string")

From 381c8892cec39ee30da04fab5a92f659402c9220 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 19 Sep 2019 10:25:42 -0500
Subject: [PATCH 19/49] fixup

---
 pandas/core/arrays/string_.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 63c1ddad4140b..198884e70bd88 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -221,10 +221,9 @@ def _add(array: StringArray, other: "ArrayLike", op):
     if isinstance(other, type(array)):
         other = other._ndarray
 
-    other = np.asarray(other)
-
     mask = array.isna()
     if not lib.is_scalar(other):
+        other = np.asarray(other)
         mask |= isna(other)
 
         other = other[~mask]

From bf82aad452f107cd959b1e9c808d183f472310c0 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 19 Sep 2019 13:26:42 -0500
Subject: [PATCH 20/49] fixup

---
 pandas/core/arrays/string_.py              | 62 ++++++++++++++--------
 pandas/tests/arrays/string_/test_string.py | 18 +++++++
 2 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 198884e70bd88..a1580cbd2c228 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -11,13 +11,14 @@
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
 from pandas.core.dtypes.inference import is_array_like
 
+from pandas import compat
 from pandas.core import ops
 from pandas.core.arrays import PandasArray
 from pandas.core.construction import extract_array
 from pandas.core.missing import isna
 
 if TYPE_CHECKING:
-    from pandas._typing import Scalar, ArrayLike
+    from pandas._typing import Scalar
 
 
 @register_extension_dtype
@@ -199,12 +200,6 @@ def astype(self, dtype, copy=True):
             return self
         return super().astype(dtype, copy)
 
-    def __add__(self, other):
-        return _add(self, other, operator.add)
-
-    def __radd__(self, other):
-        return _add(self, other, ops.radd)
-
     def _reduce(self, name, skipna=True, **kwargs):
         raise TypeError("Cannot perform reduction '{}' with string dtype".format(name))
 
@@ -213,25 +208,48 @@ def value_counts(self, dropna=False):
 
         return value_counts(self._ndarray, dropna=dropna)
 
+    # Override parent, because we have different return types.
+    @classmethod
+    def _create_arithmetic_method(cls, op):
+        def method(self, other):
+            if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)):
+                return NotImplemented
+
+            elif isinstance(other, cls):
+                other = other._ndarray
+
+            mask = isna(self) | isna(other)
+            valid = ~mask
 
-def _add(array: StringArray, other: "ArrayLike", op):
-    if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)):
-        return NotImplemented
+            if not lib.is_scalar(other):
+                other = np.asarray(other)
+                other = other[valid]
 
-    if isinstance(other, type(array)):
-        other = other._ndarray
+            result = np.empty_like(self._ndarray, dtype="object")
+            result[mask] = np.nan
+            result[valid] = op(self._ndarray[valid], other)
 
-    mask = array.isna()
-    if not lib.is_scalar(other):
-        other = np.asarray(other)
-        mask |= isna(other)
+            if op.__name__ in {"add", "radd", "mul", "rmul"}:
+                new = StringArray
+            elif mask.any():
+                new = lambda x: np.asarray(x, dtype="object")
+            else:
+                new = lambda x: np.asarray(x, dtype="bool")
+
+            return new(result)
+
+        return compat.set_function_name(method, "__{}__".format(op.__name__), cls)
+
+    @classmethod
+    def _add_arithmetic_ops(cls):
+        cls.__add__ = cls._create_arithmetic_method(operator.add)
+        cls.__radd__ = cls._create_arithmetic_method(ops.radd)
 
-        other = other[~mask]
+        cls.__mul__ = cls._create_arithmetic_method(operator.mul)
+        cls.__rmul__ = cls._create_arithmetic_method(ops.rmul)
 
-    valid = ~mask
+    _create_comparison_method = _create_arithmetic_method
 
-    out = np.empty_like(array._ndarray, dtype="object")
-    out[mask] = np.nan
-    out[valid] = op(array._ndarray[valid], other)
 
-    return type(array)(out)
+StringArray._add_arithmetic_ops()
+StringArray._add_comparison_ops()
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 75db0ba6c988b..3fa4a08023ebe 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -40,6 +40,14 @@ def test_string_methods(input, method):
     tm.assert_series_equal(result.astype(object), expected)
 
 
+def test_astype_roundtrip():
+    s = pd.Series(pd.date_range("2000", periods=12))
+    s[0] = None
+
+    result = s.astype("string").astype("datetime64[ns]")
+    tm.assert_series_equal(result, s)
+
+
 def test_add():
     a = pd.Series(["a", "b", "c", None, None], dtype="string")
     b = pd.Series(["x", "y", None, "z", None], dtype="string")
@@ -73,6 +81,16 @@ def test_add_sequence():
     tm.assert_extension_array_equal(result, expected)
 
 
+def test_mul():
+    a = pd.array(["a", "b", None], dtype="string")
+    result = a * 2
+    expected = pd.array(["aa", "bb", None], dtype="string")
+    tm.assert_extension_array_equal(result, expected)
+
+    result = 2 * a
+    tm.assert_extension_array_equal(result, expected)
+
+
 @pytest.mark.xfail(reason="GH-28527")
 def test_add_strings():
     array = pd.array(["a", "b", "c", "d"], dtype="string")

From 79bd87a835dd9cdfcd50e6731019edefb49e95c9 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 19 Sep 2019 13:28:13 -0500
Subject: [PATCH 21/49] note

---
 pandas/core/arrays/string_.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index a1580cbd2c228..73d718973da72 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -46,6 +46,10 @@ class StringDtype(ExtensionDtype):
     def na_value(self) -> "Scalar":
         """
         StringDtype uses :attr:`numpy.nan` as the missing NA value.
+
+        .. warning::
+
+           `na_value` may change in a future release.
         """
         return np.nan
 
@@ -85,6 +89,9 @@ class StringArray(PandasArray):
        StringArray is considered experimental. The implementation and
        parts of the API may change without warning.
 
+       In particular, the NA value used may change to no longer be
+       ``numpy.nan``.
+
     Parameters
     ----------
     values : array-like

From fd242749cbaec3d87f05a33aeda692a023dd7978 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 08:12:23 -0500
Subject: [PATCH 22/49] spacing

---
 doc/source/user_guide/text.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index 4405497d7c376..ce68966995a8b 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -187,7 +187,7 @@ i.e., from the end of the string to the beginning of the string:
 .. ipython:: python
 
    s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca',
-                  '', np.nan, 'CABA', 'dog', 'cat'],
+                   '', np.nan, 'CABA', 'dog', 'cat'],
                   dtype="string")
    s3
    s3.str.replace('^.a|dog', 'XX-XX ', case=False)

From 0635ede35a8b5db03e7f1406be55ea485d4b2314 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 08:14:30 -0500
Subject: [PATCH 23/49] warning note

---
 pandas/core/arrays/string_.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 73d718973da72..7a57caed8585b 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -28,6 +28,14 @@ class StringDtype(ExtensionDtype):
 
     .. versionadded:: 1.0.0
 
+    .. warning::
+
+       StringDtype is considered experimental. The implementation and
+       parts of the API may change without warning.
+
+       In particular, StringDtype.na_value may change to no longer be
+       ``numpy.nan``.
+
     Attributes
     ----------
     None

From d3311ee9e7d3a7a9c5f7e67d597b98c299fcbe0a Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 08:16:12 -0500
Subject: [PATCH 24/49] update doc

---
 doc/source/getting_started/basics.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index bfe88dc7df1c3..b913a70983e9c 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -1932,6 +1932,7 @@ period (time spans) :class:`PeriodDtype`      :class:`Period`    :class:`arrays.
 sparse              :class:`SparseDtype`      (none)             :class:`arrays.SparseArray`   :ref:`sparse`
 intervals           :class:`IntervalDtype`    :class:`Interval`  :class:`arrays.IntervalArray` :ref:`advanced.intervalindex`
 nullable integer    :class:`Int64Dtype`, ...  (none)             :class:`arrays.IntegerArray`  :ref:`integer_na`
+Text                :class:`StringDtype`      :class:`str`       :class:`arrays.StringArray`   :ref:`text`
 =================== ========================= ================== ============================= =============================
 
 Pandas uses the ``object`` dtype for storing strings.

From dce9258d7c457ea1b9086edad0766ba22c691639 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 08:20:05 -0500
Subject: [PATCH 25/49] doc updates

---
 doc/source/getting_started/basics.rst | 7 ++++++-
 doc/source/user_guide/text.rst        | 4 ++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index b913a70983e9c..7f2a60b51cae2 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -1935,7 +1935,12 @@ nullable integer    :class:`Int64Dtype`, ...  (none)             :class:`arrays.
 Text                :class:`StringDtype`      :class:`str`       :class:`arrays.StringArray`   :ref:`text`
 =================== ========================= ================== ============================= =============================
 
-Pandas uses the ``object`` dtype for storing strings.
+Pandas has two ways to store strings.
+
+1. ``object`` dtype, which can hold any Python object, including strings.
+2. :class:`arrays.StringArray`, which is dedicated to strings.
+
+Generally, we recommend using :class:`arrays.StringArray`. See :ref:`text.types` fore more.
 
 Finally, arbitrary objects may be stored using the ``object`` dtype, but should
 be avoided to the extent possible (for performance and interoperability with
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index ce68966995a8b..e71dd540a45c0 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -31,6 +31,10 @@ for many reasons:
 3. When reading code, the contents of an ``object`` dtype array is less clear
    than ``string``.
 
+Currently, the performance of ``object`` dtype arrays of strings and
+:class:`arrays.StringArray` are about the same. We expect future enhancements
+to significantly increase the performance and lower the memory overhead of
+:class:`arrays.StringArray`.
 
 .. warning::
 

From 0524f7ebf356f8b591f334d38741f075b4f63a90 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 10:16:09 -0500
Subject: [PATCH 26/49] update ctor

---
 doc/source/user_guide/text.rst |  2 +-
 pandas/core/strings.py         | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index e71dd540a45c0..f757f3894c716 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -25,7 +25,7 @@ for many reasons:
 
 1. You can accidentally store a *mixture* of strings and non-strings in an
    ``object`` dtype array. It's better to have a dedicated dtype.
-2. ``object`` dtype breaks dtype-specific operations like ``select_dtypes``.
+2. ``object`` dtype breaks dtype-specific operations like :meth:`DataFrame.select_dtypes`.
    There isn't a clear way to select *just* text while excluding non-text
    but still object-dtype columns.
 3. When reading code, the contents of an ``object`` dtype array is less clear
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 64953878d978f..e1bb4b7a500c1 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -817,15 +817,16 @@ def _str_extract_frame(arr, pat, flags=0):
         result_index = arr.index
     except AttributeError:
         result_index = None
-    result = DataFrame(
+    if arr.dtype.name == "string":
+        dtype = "string"
+    else:
+        dtype = object
+    return DataFrame(
         [groups_or_na(val) for val in arr],
         columns=columns,
         index=result_index,
-        dtype=object,
+        dtype=dtype,
     )
-    if arr.dtype.name == "string":
-        result = result.astype("string")
-    return result
 
 
 def str_extract(arr, pat, flags=0, expand=True):

From 292a8f34c10fa47542ea4718111e2ba1fd04ddc9 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 10:24:21 -0500
Subject: [PATCH 27/49] clean up wrapping

---
 pandas/core/strings.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index e1bb4b7a500c1..f807b26ab01e9 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -916,7 +916,7 @@ def str_extract(arr, pat, flags=0, expand=True):
         return _str_extract_frame(arr._orig, pat, flags=flags)
     else:
         result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
-        return arr._wrap_result(result, name=name, expand=expand, returns_string=True)
+        return arr._wrap_result(result, name=name, expand=expand)
 
 
 def str_extractall(arr, pat, flags=0):
@@ -2052,6 +2052,15 @@ def _wrap_result(
             return result
         assert result.ndim < 3
 
+        # We can be wrapping a string / object / categorical result, in which
+        # case we'll want to return the same dtype as the input.
+        # Or we can be wrapping a numeric output, in which case we don't want
+        # to return a StringArray.
+        if self._is_string and returns_string:
+            dtype = "string"
+        else:
+            dtype = None
+
         if expand is None:
             # infer from ndim if expand is not specified
             expand = result.ndim != 1
@@ -2109,13 +2118,11 @@ def cons_row(x):
             index = self._orig.index
             if expand:
                 cons = self._orig._constructor_expanddim
-                result = cons(result, columns=name, index=index)
+                result = cons(result, columns=name, index=index, dtype=dtype)
             else:
                 # Must be a Series
                 cons = self._orig._constructor
-                result = cons(result, name=name, index=index)
-            if self._is_string and returns_string:
-                result = result.astype("string")
+                result = cons(result, name=name, index=index, dtype=dtype)
             return result
 
     def _get_series_list(self, others):

From 2c88e3b26fee576b62c3a83c3cc2c2358333834c Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 10:30:27 -0500
Subject: [PATCH 28/49] clarify

---
 pandas/core/strings.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index f807b26ab01e9..2fb09182cc6cf 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -1025,9 +1025,9 @@ def str_extractall(arr, pat, flags=0):
     index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
 
     # workaround #27953
-    from pandas import StringDtype
-
-    if isinstance(arr.dtype, StringDtype):
+    # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
+    # when the list of values is empty.
+    if arr.dtype.name == "string":
         dtype = arr.dtype
     else:
         dtype = None

From 1b8c83afd49a9d168a750241f29fda2c9a2a982d Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 10:48:34 -0500
Subject: [PATCH 29/49] reduce sum

---
 pandas/core/arrays/string_.py              |  9 +++++++++
 pandas/tests/arrays/string_/test_string.py | 17 +++++++++++++++++
 pandas/tests/extension/test_string.py      |  2 +-
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 7a57caed8585b..9d1549e6c5299 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -216,6 +216,15 @@ def astype(self, dtype, copy=True):
         return super().astype(dtype, copy)
 
     def _reduce(self, name, skipna=True, **kwargs):
+        if name == "sum":
+            vals = self._ndarray
+            missing = self.isna()
+            if skipna:
+                vals = vals[~missing]
+            elif missing.any():
+                return np.nan
+            return vals.sum()
+
         raise TypeError("Cannot perform reduction '{}' with string dtype".format(name))
 
     def value_counts(self, dropna=False):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 3fa4a08023ebe..854fe0b4fc0cf 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -128,3 +128,20 @@ def test_constructor_raises():
 
     with pytest.raises(ValueError, match="object-dtype ndarray"):
         pd.arrays.StringArray(np.array([]))
+
+
+@pytest.mark.parametrize("skipna", [True, False])
+def test_reduce(skipna):
+    arr = pd.Series(["a", "b", "c"], dtype="string")
+    result = arr.sum(skipna=skipna)
+    assert result == "abc"
+
+
+@pytest.mark.parametrize("skipna", [True, False])
+def test_reduce_missing(skipna):
+    arr = pd.Series([None, "a", None, "b", "c", None], dtype="string")
+    result = arr.sum(skipna=skipna)
+    if skipna:
+        assert result == "abc"
+    else:
+        assert pd.isna(result)
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index cba10b1b7f88b..d2596e75d5583 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -76,7 +76,7 @@ class TestMissing(base.BaseMissingTests):
     pass
 
 
-class TestReduce(base.BaseNoReduceTests):
+class TestNoReduce(base.BaseNoReduceTests):
     pass
 
 

From f1dad2a0972f57c79f9e2b1fccb21f4d2be8eaaa Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 11:42:00 -0500
Subject: [PATCH 30/49] skip reduce sum

---
 pandas/tests/extension/test_string.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index d2596e75d5583..ae1008dce81d1 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -77,7 +77,14 @@ class TestMissing(base.BaseMissingTests):
 
 
 class TestNoReduce(base.BaseNoReduceTests):
-    pass
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
+        if all_numeric_reductions == "sum":
+            pytest.skip("StringArray implements sum")
+        else:
+            return super().test_reduce_series_numeric(
+                data, all_numeric_reductions, skipna
+            )
 
 
 class TestMethods(base.BaseMethodsTests):

From be95ecb23c6b40963043172de6151b71a0b772bd Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 16:44:22 -0500
Subject: [PATCH 31/49] rename

---
 doc/source/getting_started/basics.rst      | 10 ++--
 doc/source/reference/arrays.rst            | 10 ++--
 doc/source/user_guide/text.rst             | 70 +++++++++++-----------
 doc/source/whatsnew/v1.0.0.rst             |  4 +-
 pandas/__init__.py                         |  2 +-
 pandas/arrays/__init__.py                  |  4 +-
 pandas/core/api.py                         |  2 +-
 pandas/core/arrays/__init__.py             |  2 +-
 pandas/core/arrays/string_.py              | 56 ++++++++---------
 pandas/core/strings.py                     | 18 +++---
 pandas/tests/api/test_api.py               |  2 +-
 pandas/tests/arrays/string_/test_string.py | 50 ++++++++--------
 pandas/tests/extension/test_string.py      | 16 ++---
 pandas/tests/test_strings.py               | 11 ++--
 14 files changed, 130 insertions(+), 127 deletions(-)

diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index 7f2a60b51cae2..37ced806df406 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -1705,7 +1705,7 @@ built-in string methods. For example:
  .. ipython:: python
 
   s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
-                dtype="string")
+                dtype="text")
   s.str.lower()
 
 Powerful pattern-matching methods are provided as well, but note that
@@ -1716,7 +1716,7 @@ always uses them).
 .. note::
 
    Prior to pandas 1.0, string methods were only available on ``object`` -dtype
-   ``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated
+   ``Series``. Pandas 1.0 added the :class:`TextDtype` which is dedicated
    to strings. See :ref:`text.types` for more.
 
 Please see :ref:`Vectorized String Methods <text.string_methods>` for a complete
@@ -1932,15 +1932,15 @@ period (time spans) :class:`PeriodDtype`      :class:`Period`    :class:`arrays.
 sparse              :class:`SparseDtype`      (none)             :class:`arrays.SparseArray`   :ref:`sparse`
 intervals           :class:`IntervalDtype`    :class:`Interval`  :class:`arrays.IntervalArray` :ref:`advanced.intervalindex`
 nullable integer    :class:`Int64Dtype`, ...  (none)             :class:`arrays.IntegerArray`  :ref:`integer_na`
-Text                :class:`StringDtype`      :class:`str`       :class:`arrays.StringArray`   :ref:`text`
+Text                :class:`TextDtype`        :class:`str`       :class:`arrays.TextArray`     :ref:`text`
 =================== ========================= ================== ============================= =============================
 
 Pandas has two ways to store strings.
 
 1. ``object`` dtype, which can hold any Python object, including strings.
-2. :class:`arrays.StringArray`, which is dedicated to strings.
+2. :class:`arrays.TextArray`, which is dedicated to strings.
 
-Generally, we recommend using :class:`arrays.StringArray`. See :ref:`text.types` fore more.
+Generally, we recommend using :class:`arrays.TextArray`. See :ref:`text.types` for more.
 
 Finally, arbitrary objects may be stored using the ``object`` dtype, but should
 be avoided to the extent possible (for performance and interoperability with
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index db620e73301cb..8f8e1e1c1ea2e 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -24,7 +24,7 @@ Intervals           :class:`IntervalDtype`    :class:`Interval`  :ref:`api.array
 Nullable Integer    :class:`Int64Dtype`, ...  (none)             :ref:`api.arrays.integer_na`
 Categorical         :class:`CategoricalDtype` (none)             :ref:`api.arrays.categorical`
 Sparse              :class:`SparseDtype`      (none)             :ref:`api.arrays.sparse`
-Text                :class:`StringDtype`      :class:`str`       :ref:`api.arrays.string`
+Text                :class:`TextDtype`        :class:`str`       :ref:`api.arrays.string`
 =================== ========================= ================== =============================
 
 Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`).
@@ -467,21 +467,21 @@ Text data
 ---------
 
 When working with text data, where each valid element is a string, we recommend using
-:class:`StringDtype` (with the alias ``"string"``).
+:class:`TextDtype` (with the alias ``"text"``).
 
 .. autosummary::
    :toctree: api/
    :template: autosummary/class_without_autosummary.rst
 
-   arrays.StringArray
+   arrays.TextArray
 
 .. autosummary::
    :toctree: api/
    :template: autosummary/class_without_autosummary.rst
 
-   StringDtype
+   TextDtype
 
-The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.StringArray`.
+The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.TextArray`.
 See :ref:`api.series.str` for more.
 
 
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index f757f3894c716..646dac579fe54 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -16,9 +16,9 @@ Text Data Types
 There are two main ways to store text data
 
 1. ``object`` -dtype NumPy array.
-2. As an :class:`arrays.StringArray` extension type.
+2. As an :class:`arrays.TextArray` extension type.
 
-We recommend using :class:`arrays.StringArray` to store text data.
+We recommend using :class:`arrays.TextArray` to store text data.
 
 Prior to pandas 1.0, ``object`` dtype was the only option. This was unfortunate
 for many reasons:
@@ -32,13 +32,13 @@ for many reasons:
    than ``string``.
 
 Currently, the performance of ``object`` dtype arrays of strings and
-:class:`arrays.StringArray` are about the same. We expect future enhancements
+:class:`arrays.TextArray` are about the same. We expect future enhancements
 to significantly increase the performance and lower the memory overhead of
-:class:`arrays.StringArray`.
+:class:`arrays.TextArray`.
 
 .. warning::
 
-   StringArray is currently considered experimental.
+   TextArray is currently considered experimental.
 
 For backwards-compatibility, ``object`` dtype remains the default type we
 infer a list of strings to
@@ -47,12 +47,12 @@ infer a list of strings to
 
    pd.Series(['a', 'b', 'c'])
 
-To explicitly request ``string`` dtype, specify the ``dtype``
+To explicitly request ``text`` dtype, specify the ``dtype``
 
 .. ipython:: python
 
-   pd.Series(['a', 'b', 'c'], dtype="string")
-   pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype())
+   pd.Series(['a', 'b', 'c'], dtype="text")
+   pd.Series(['a', 'b', 'c'], dtype=pd.TextDtype())
 
 Or ``astype`` after the ``Series`` or ``DataFrame`` is created
 
@@ -60,7 +60,7 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
 
    s = pd.Series(['a', 'b', 'c'])
    s
-   s.astype("string")
+   s.astype("text")
 
 Everything that follows in the rest of this document applies equally to
 ``string`` and ``object`` dtype.
@@ -79,7 +79,7 @@ the equivalent (scalar) built-in string methods:
 .. ipython:: python
 
    s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
-                 dtype="string")
+                 dtype="text")
    s.str.lower()
    s.str.upper()
    s.str.len()
@@ -153,7 +153,7 @@ Methods like ``split`` return a Series of lists:
 
 .. ipython:: python
 
-   s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string")
+   s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="text")
    s2.str.split('_')
 
 Elements in the split lists can be accessed using ``get`` or ``[]`` notation:
@@ -169,8 +169,8 @@ It is easy to expand this to return a DataFrame using ``expand``.
 
    s2.str.split('_', expand=True)
 
-When original ``Series`` has :class:`StringDtype`, the output columns will all
-be :class:`StringDtype` as well.
+When original ``Series`` has :class:`TextDtype`, the output columns will all
+be :class:`TextDtype` as well.
 
 It is also possible to limit the number of splits:
 
@@ -192,7 +192,7 @@ i.e., from the end of the string to the beginning of the string:
 
    s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca',
                    '', np.nan, 'CABA', 'dog', 'cat'],
-                  dtype="string")
+                  dtype="text")
    s3
    s3.str.replace('^.a|dog', 'XX-XX ', case=False)
 
@@ -203,7 +203,7 @@ following code will cause trouble because of the regular expression meaning of
 .. ipython:: python
 
    # Consider the following badly formatted financial data
-   dollars = pd.Series(['12', '-$10', '$10,000'], dtype="string")
+   dollars = pd.Series(['12', '-$10', '$10,000'], dtype="text")
 
    # This does what you'd naively expect:
    dollars.str.replace('$', '')
@@ -242,7 +242,7 @@ positional argument (a regex object) and return a string.
        return m.group(0)[::-1]
 
    pd.Series(['foo 123', 'bar baz', np.nan],
-             dtype="string").str.replace(pat, repl)
+             dtype="text").str.replace(pat, repl)
 
    # Using regex groups
    pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
@@ -251,7 +251,7 @@ positional argument (a regex object) and return a string.
        return m.group('two').swapcase()
 
    pd.Series(['Foo Bar Baz', np.nan],
-             dtype="string").str.replace(pat, repl)
+             dtype="text").str.replace(pat, repl)
 
 .. versionadded:: 0.20.0
 
@@ -290,7 +290,7 @@ The content of a ``Series`` (or ``Index``) can be concatenated:
 
 .. ipython:: python
 
-    s = pd.Series(['a', 'b', 'c', 'd'], dtype="string")
+    s = pd.Series(['a', 'b', 'c', 'd'], dtype="text")
     s.str.cat(sep=',')
 
 If not specified, the keyword ``sep`` for the separator defaults to the empty string, ``sep=''``:
@@ -303,7 +303,7 @@ By default, missing values are ignored. Using ``na_rep``, they can be given a re
 
 .. ipython:: python
 
-    t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string")
+    t = pd.Series(['a', 'b', np.nan, 'd'], dtype="text")
     t.str.cat(sep=',')
     t.str.cat(sep=',', na_rep='-')
 
@@ -349,7 +349,7 @@ the ``join``-keyword.
    :okwarning:
 
    u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2],
-                 dtype="string")
+                 dtype="text")
    s
    u
    s.str.cat(u)
@@ -366,7 +366,7 @@ In particular, alignment also means that the different lengths do not need to co
 .. ipython:: python
 
     v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4],
-                  dtype="string")
+                  dtype="text")
     s
     v
     s.str.cat(v, join='left', na_rep='-')
@@ -423,7 +423,7 @@ of the string, the result will be a ``NaN``.
 
    s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan,
                   'CABA', 'dog', 'cat'],
-                 dtype="string")
+                 dtype="text")
 
    s.str[0]
    s.str[1]
@@ -455,7 +455,7 @@ DataFrame with one column per group.
 .. ipython:: python
 
    pd.Series(['a1', 'b2', 'c3'],
-             dtype="string").str.extract(r'([ab])(\d)', expand=False)
+             dtype="text").str.extract(r'([ab])(\d)', expand=False)
 
 Elements that do not match return a row filled with ``NaN``. Thus, a
 Series of messy strings can be "converted" into a like-indexed Series
@@ -469,7 +469,7 @@ Named groups like
 .. ipython:: python
 
    pd.Series(['a1', 'b2', 'c3'],
-             dtype="string").str.extract(r'(?P<letter>[ab])(?P<digit>\d)',
+             dtype="text").str.extract(r'(?P<letter>[ab])(?P<digit>\d)',
                                          expand=False)
 
 and optional groups like
@@ -477,7 +477,7 @@ and optional groups like
 .. ipython:: python
 
    pd.Series(['a1', 'b2', '3'],
-             dtype="string").str.extract(r'([ab])?(\d)', expand=False)
+             dtype="text").str.extract(r'([ab])?(\d)', expand=False)
 
 can also be used. Note that any capture group names in the regular
 expression will be used for column names; otherwise capture group
@@ -489,14 +489,14 @@ with one column if ``expand=True``.
 .. ipython:: python
 
    pd.Series(['a1', 'b2', 'c3'],
-             dtype="string").str.extract(r'[ab](\d)', expand=True)
+             dtype="text").str.extract(r'[ab](\d)', expand=True)
 
 It returns a Series if ``expand=False``.
 
 .. ipython:: python
 
    pd.Series(['a1', 'b2', 'c3'],
-             dtype="string").str.extract(r'[ab](\d)', expand=False)
+             dtype="text").str.extract(r'[ab](\d)', expand=False)
 
 Calling on an ``Index`` with a regex with exactly one capture group
 returns a ``DataFrame`` with one column if ``expand=True``.
@@ -504,7 +504,7 @@ returns a ``DataFrame`` with one column if ``expand=True``.
 .. ipython:: python
 
    s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"],
-                 dtype="string")
+                 dtype="text")
    s
    s.index.str.extract("(?P<letter>[a-zA-Z])", expand=True)
 
@@ -550,7 +550,7 @@ Unlike ``extract`` (which returns only the first match),
 .. ipython:: python
 
    s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"],
-                 dtype="string")
+                 dtype="text")
    s
    two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])'
    s.str.extract(two_groups, expand=True)
@@ -568,7 +568,7 @@ When each subject string in the Series has exactly one match,
 
 .. ipython:: python
 
-   s = pd.Series(['a3', 'b3', 'c2'], dtype="string")
+   s = pd.Series(['a3', 'b3', 'c2'], dtype="text")
    s
 
 then ``extractall(pat).xs(0, level='match')`` gives the same result as
@@ -589,7 +589,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0).
 
    pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups)
 
-   pd.Series(["a1a2", "b1", "c1"], dtype="string").str.extractall(two_groups)
+   pd.Series(["a1a2", "b1", "c1"], dtype="text").str.extractall(two_groups)
 
 
 Testing for Strings that match or contain a pattern
@@ -601,14 +601,14 @@ You can check whether elements contain a pattern:
 
    pattern = r'[0-9][a-z]'
    pd.Series(['1', '2', '3a', '3b', '03c'],
-             dtype="string").str.contains(pattern)
+             dtype="text").str.contains(pattern)
 
 Or whether elements match a pattern:
 
 .. ipython:: python
 
    pd.Series(['1', '2', '3a', '3b', '03c'],
-             dtype="string").str.match(pattern)
+             dtype="text").str.match(pattern)
 
 The distinction between ``match`` and ``contains`` is strictness: ``match``
 relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
@@ -619,7 +619,7 @@ an extra ``na`` argument so missing values can be considered True or False:
 .. ipython:: python
 
    s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
-                  dtype="string")
+                  dtype="text")
    s4.str.contains('A', na=False)
 
 .. _text.indicator:
@@ -632,7 +632,7 @@ For example if they are separated by a ``'|'``:
 
 .. ipython:: python
 
-    s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="string")
+    s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="text")
     s.str.get_dummies(sep='|')
 
 String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``.
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 925ef1d16ea2f..3f26e661660a8 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -55,12 +55,12 @@ Enhancements
 Dedicated string data type
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-We've added :class:`StringDtype`, an extension type dedicated to string data.
+We've added :class:`TextDtype`, an extension type dedicated to string data.
 Previously, strings were typically stored in object-dtype NumPy arrays.
 
 .. ipython:: python
 
-   pd.Series(['abc', None, 'def'], dtype=pd.StringDtype())
+   pd.Series(['abc', None, 'def'], dtype=pd.TextDtype())
 
 You can use the alias ``'string'`` as well.
 
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 2cd891c696203..f987f1e2f5273 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -66,7 +66,7 @@
     PeriodDtype,
     IntervalDtype,
     DatetimeTZDtype,
-    StringDtype,
+    TextDtype,
     # missing
     isna,
     isnull,
diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py
index 9870b5bed076d..24cb6a9c5acf7 100644
--- a/pandas/arrays/__init__.py
+++ b/pandas/arrays/__init__.py
@@ -11,7 +11,7 @@
     PandasArray,
     PeriodArray,
     SparseArray,
-    StringArray,
+    TextArray,
     TimedeltaArray,
 )
 
@@ -23,6 +23,6 @@
     "PandasArray",
     "PeriodArray",
     "SparseArray",
-    "StringArray",
+    "TextArray",
     "TimedeltaArray",
 ]
diff --git a/pandas/core/api.py b/pandas/core/api.py
index 04f2f84c92a15..e17602b8943f5 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -23,7 +23,7 @@
     UInt32Dtype,
     UInt64Dtype,
 )
-from pandas.core.arrays.string_ import StringDtype
+from pandas.core.arrays.string_ import TextDtype
 from pandas.core.construction import array
 from pandas.core.groupby import Grouper, NamedAgg
 from pandas.core.index import (
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
index 868118bac6a7b..a7989cd154d04 100644
--- a/pandas/core/arrays/__init__.py
+++ b/pandas/core/arrays/__init__.py
@@ -10,5 +10,5 @@
 from .numpy_ import PandasArray, PandasDtype  # noqa: F401
 from .period import PeriodArray, period_array  # noqa: F401
 from .sparse import SparseArray  # noqa: F401
-from .string_ import StringArray  # noqa: F401
+from .string_ import TextArray  # noqa: F401
 from .timedeltas import TimedeltaArray  # noqa: F401
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 9d1549e6c5299..6de2fee0050ca 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -22,7 +22,7 @@
 
 
 @register_extension_dtype
-class StringDtype(ExtensionDtype):
+class TextDtype(ExtensionDtype):
     """
     Extension dtype for text data.
 
@@ -30,10 +30,10 @@ class StringDtype(ExtensionDtype):
 
     .. warning::
 
-       StringDtype is considered experimental. The implementation and
+       TextDtype is considered experimental. The implementation and
        parts of the API may change without warning.
 
-       In particular, StringDtype.na_value may change to no longer be
+       In particular, TextDtype.na_value may change to no longer be
        ``numpy.nan``.
 
     Attributes
@@ -46,14 +46,14 @@ class StringDtype(ExtensionDtype):
 
     Examples
     --------
-    >>> pd.StringDtype()
-    StringDtype
+    >>> pd.TextDtype()
+    TextDtype
     """
 
     @property
     def na_value(self) -> "Scalar":
         """
-        StringDtype uses :attr:`numpy.nan` as the missing NA value.
+        TextDtype uses :attr:`numpy.nan` as the missing NA value.
 
         .. warning::
 
@@ -68,25 +68,25 @@ def type(self) -> Type:
     @property
     def name(self) -> str:
         """
-        The alias for StringDtype is ``'string'``.
+        The alias for TextDtype is ``'text'``.
         """
-        return "string"
+        return "text"
 
     @classmethod
     def construct_from_string(cls, string: str) -> ExtensionDtype:
-        if string == "string":
+        if string == "text":
             return cls()
         return super().construct_from_string(string)
 
     @classmethod
-    def construct_array_type(cls) -> "Type[StringArray]":
-        return StringArray
+    def construct_array_type(cls) -> "Type[TextArray]":
+        return TextArray
 
     def __repr__(self) -> str:
-        return "StringDtype"
+        return "TextDtype"
 
 
-class StringArray(PandasArray):
+class TextArray(PandasArray):
     """
     Extension array for text data.
 
@@ -94,7 +94,7 @@ class StringArray(PandasArray):
 
     .. warning::
 
-       StringArray is considered experimental. The implementation and
+       TextArray is considered experimental. The implementation and
        parts of the API may change without warning.
 
        In particular, the NA value used may change to no longer be
@@ -125,19 +125,19 @@ class StringArray(PandasArray):
     --------
     Series.str
         The string methods are available on Series backed by
-        a StringArray.
+        a TextArray.
 
     Examples
     --------
-    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
-    <StringArray>
+    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="text")
+    <TextArray>
     ['This is', 'some text', nan, 'data.']
     Length: 4, dtype: string
 
-    Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string
+    Unlike ``object`` dtype arrays, ``TextArray`` doesn't allow non-string
     values.
 
-    >>> pd.array(['1', 1], dtype="string")
+    >>> pd.array(['1', 1], dtype="text")
     Traceback (most recent call last):
     ...
     ValueError: Must provide strings
@@ -148,23 +148,23 @@ class StringArray(PandasArray):
 
     def __init__(self, values, copy=False):
         super().__init__(values, copy=copy)
-        self._dtype = StringDtype()
+        self._dtype = TextDtype()
         self._validate()
 
     def _validate(self):
         """Validate that we only store NA or strings."""
         if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError("StringArray requires an object-dtype ndarray of strings.")
+            raise ValueError("TextArray requires an object-dtype ndarray of strings.")
         if self._ndarray.dtype != "object":
             raise ValueError(
-                "StringArray requires an object-dtype ndarray. Got "
+                "TextArray requires an object-dtype ndarray. Got "
                 "'{}' instead.".format(self._ndarray.dtype)
             )
 
     @classmethod
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         if dtype:
-            assert dtype == "string"
+            assert dtype == "text"
         result = super()._from_sequence(scalars, dtype=object, copy=copy)
         # convert None to np.nan
         # TODO: it would be nice to do this in _validate / lib.is_string_array
@@ -193,7 +193,7 @@ def __setitem__(self, key, value):
                 value = np.nan
             elif not (isinstance(value, str) or np.isnan(value)):
                 raise ValueError(
-                    "Cannot set value '{}' into a StringArray.".format(value)
+                    "Cannot set value '{}' into a TextArray.".format(value)
                 )
         else:
             if not is_array_like(value):
@@ -209,7 +209,7 @@ def fillna(self, value=None, method=None, limit=None):
 
     def astype(self, dtype, copy=True):
         dtype = pandas_dtype(dtype)
-        if isinstance(dtype, StringDtype):
+        if isinstance(dtype, TextDtype):
             if copy:
                 return self.copy()
             return self
@@ -254,7 +254,7 @@ def method(self, other):
             result[valid] = op(self._ndarray[valid], other)
 
             if op.__name__ in {"add", "radd", "mul", "rmul"}:
-                new = StringArray
+                new = TextArray
             elif mask.any():
                 new = lambda x: np.asarray(x, dtype="object")
             else:
@@ -275,5 +275,5 @@ def _add_arithmetic_ops(cls):
     _create_comparison_method = _create_arithmetic_method
 
 
-StringArray._add_arithmetic_ops()
-StringArray._add_comparison_ops()
+TextArray._add_arithmetic_ops()
+TextArray._add_comparison_ops()
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 2fb09182cc6cf..2ee4438024c32 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -817,8 +817,8 @@ def _str_extract_frame(arr, pat, flags=0):
         result_index = arr.index
     except AttributeError:
         result_index = None
-    if arr.dtype.name == "string":
-        dtype = "string"
+    if arr.dtype.name == "text":
+        dtype = "text"
     else:
         dtype = object
     return DataFrame(
@@ -1027,7 +1027,7 @@ def str_extractall(arr, pat, flags=0):
     # workaround #27953
     # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
     # when the list of values is empty.
-    if arr.dtype.name == "string":
+    if arr.dtype.name == "text":
         dtype = arr.dtype
     else:
         dtype = None
@@ -1953,7 +1953,7 @@ class StringMethods(NoNewAttributesMixin):
     def __init__(self, data):
         self._inferred_dtype = self._validate(data)
         self._is_categorical = is_categorical_dtype(data)
-        self._is_string = data.dtype.name == "string"
+        self._is_string = data.dtype.name == "text"
 
         # .values.categories works for both Series/Index
         self._parent = data.values.categories if self._is_categorical else data
@@ -1984,7 +1984,7 @@ def _validate(data):
         -------
         dtype : inferred dtype of data
         """
-        from pandas import StringDtype
+        from pandas import TextDtype
 
         if isinstance(data, ABCMultiIndex):
             raise AttributeError(
@@ -1997,8 +1997,8 @@ def _validate(data):
         values = getattr(data, "values", data)  # Series / Index
         values = getattr(values, "categories", values)  # categorical / normal
 
-        # explicitly allow StringDtype
-        if isinstance(values.dtype, StringDtype):
+        # explicitly allow TextDtype
+        if isinstance(values.dtype, TextDtype):
             return "string"
 
         try:
@@ -2055,9 +2055,9 @@ def _wrap_result(
         # We can be wrapping a string / object / categorical result, in which
         # case we'll want to return the same dtype as the input.
         # Or we can be wrapping a numeric output, in which case we don't want
-        # to return a StringArray.
+        # to return a TextArray.
         if self._is_string and returns_string:
-            dtype = "string"
+            dtype = "text"
         else:
             dtype = None
 
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 6c50159663574..056a23ab81acf 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -68,7 +68,7 @@ class TestPDApi(Base):
         "Series",
         "SparseArray",
         "SparseDtype",
-        "StringDtype",
+        "TextDtype",
         "Timedelta",
         "TimedeltaIndex",
         "Timestamp",
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 854fe0b4fc0cf..8ef1533a73e19 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -8,13 +8,13 @@
 
 
 def test_none_to_nan():
-    a = pd.arrays.StringArray._from_sequence(["a", None, "b"])
+    a = pd.arrays.TextArray._from_sequence(["a", None, "b"])
     assert a[1] is not None
     assert np.isnan(a[1])
 
 
 def test_setitem_validates():
-    a = pd.arrays.StringArray._from_sequence(["a", "b"])
+    a = pd.arrays.TextArray._from_sequence(["a", "b"])
     with pytest.raises(ValueError, match="10"):
         a[0] = 10
 
@@ -31,12 +31,12 @@ def test_setitem_validates():
     ],
 )
 def test_string_methods(input, method):
-    a = pd.Series(input, dtype="string")
+    a = pd.Series(input, dtype="text")
     b = pd.Series(input, dtype="object")
     result = method(a.str)
     expected = method(b.str)
 
-    assert result.dtype.name == "string"
+    assert result.dtype.name == "text"
     tm.assert_series_equal(result.astype(object), expected)
 
 
@@ -44,47 +44,47 @@ def test_astype_roundtrip():
     s = pd.Series(pd.date_range("2000", periods=12))
     s[0] = None
 
-    result = s.astype("string").astype("datetime64[ns]")
+    result = s.astype("text").astype("datetime64[ns]")
     tm.assert_series_equal(result, s)
 
 
 def test_add():
-    a = pd.Series(["a", "b", "c", None, None], dtype="string")
-    b = pd.Series(["x", "y", None, "z", None], dtype="string")
+    a = pd.Series(["a", "b", "c", None, None], dtype="text")
+    b = pd.Series(["x", "y", None, "z", None], dtype="text")
 
     result = a + b
-    expected = pd.Series(["ax", "by", None, None, None], dtype="string")
+    expected = pd.Series(["ax", "by", None, None, None], dtype="text")
     tm.assert_series_equal(result, expected)
 
     result = a.add(b)
     tm.assert_series_equal(result, expected)
 
     result = a.radd(b)
-    expected = pd.Series(["xa", "yb", None, None, None], dtype="string")
+    expected = pd.Series(["xa", "yb", None, None, None], dtype="text")
     tm.assert_series_equal(result, expected)
 
     result = a.add(b, fill_value="-")
-    expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string")
+    expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="text")
     tm.assert_series_equal(result, expected)
 
 
 def test_add_sequence():
-    a = pd.array(["a", "b", None, None], dtype="string")
+    a = pd.array(["a", "b", None, None], dtype="text")
     other = ["x", None, "y", None]
 
     result = a + other
-    expected = pd.array(["ax", None, None, None], dtype="string")
+    expected = pd.array(["ax", None, None, None], dtype="text")
     tm.assert_extension_array_equal(result, expected)
 
     result = other + a
-    expected = pd.array(["xa", None, None, None], dtype="string")
+    expected = pd.array(["xa", None, None, None], dtype="text")
     tm.assert_extension_array_equal(result, expected)
 
 
 def test_mul():
-    a = pd.array(["a", "b", None], dtype="string")
+    a = pd.array(["a", "b", None], dtype="text")
     result = a * 2
-    expected = pd.array(["aa", "bb", None], dtype="string")
+    expected = pd.array(["aa", "bb", None], dtype="text")
     tm.assert_extension_array_equal(result, expected)
 
     result = 2 * a
@@ -93,53 +93,53 @@ def test_mul():
 
 @pytest.mark.xfail(reason="GH-28527")
 def test_add_strings():
-    array = pd.array(["a", "b", "c", "d"], dtype="string")
+    array = pd.array(["a", "b", "c", "d"], dtype="text")
     df = pd.DataFrame([["t", "u", "v", "w"]])
     assert array.__add__(df) is NotImplemented
 
     result = array + df
-    expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("string")
+    expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("text")
     tm.assert_frame_equal(result, expected)
 
     result = df + array
-    expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("string")
+    expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("text")
     tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.xfail(reason="GH-28527")
 def test_add_frame():
-    array = pd.array(["a", "b", np.nan, np.nan], dtype="string")
+    array = pd.array(["a", "b", np.nan, np.nan], dtype="text")
     df = pd.DataFrame([["x", np.nan, "y", np.nan]])
 
     assert array.__add__(df) is NotImplemented
 
     result = array + df
-    expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("string")
+    expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("text")
     tm.assert_frame_equal(result, expected)
 
     result = df + array
-    expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("string")
+    expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("text")
     tm.assert_frame_equal(result, expected)
 
 
 def test_constructor_raises():
     with pytest.raises(ValueError, match="object-dtype ndarray"):
-        pd.arrays.StringArray(np.array(["a", "b"], dtype="S1"))
+        pd.arrays.TextArray(np.array(["a", "b"], dtype="S1"))
 
     with pytest.raises(ValueError, match="object-dtype ndarray"):
-        pd.arrays.StringArray(np.array([]))
+        pd.arrays.TextArray(np.array([]))
 
 
 @pytest.mark.parametrize("skipna", [True, False])
 def test_reduce(skipna):
-    arr = pd.Series(["a", "b", "c"], dtype="string")
+    arr = pd.Series(["a", "b", "c"], dtype="text")
     result = arr.sum(skipna=skipna)
     assert result == "abc"
 
 
 @pytest.mark.parametrize("skipna", [True, False])
 def test_reduce_missing(skipna):
-    arr = pd.Series([None, "a", None, "b", "c", None], dtype="string")
+    arr = pd.Series([None, "a", None, "b", "c", None], dtype="text")
     result = arr.sum(skipna=skipna)
     if skipna:
         assert result == "abc"
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index ae1008dce81d1..3489722b4538e 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -4,13 +4,13 @@
 import pytest
 
 import pandas as pd
-from pandas.core.arrays.string_ import StringArray, StringDtype
+from pandas.core.arrays.string_ import TextArray, TextDtype
 from pandas.tests.extension import base
 
 
 @pytest.fixture
 def dtype():
-    return StringDtype()
+    return TextDtype()
 
 
 @pytest.fixture
@@ -19,23 +19,23 @@ def data():
     while strings[0] == strings[1]:
         strings = np.random.choice(list(string.ascii_letters), size=100)
 
-    return StringArray._from_sequence(strings)
+    return TextArray._from_sequence(strings)
 
 
 @pytest.fixture
 def data_missing():
     """Length 2 array with [NA, Valid]"""
-    return StringArray._from_sequence([np.nan, "A"])
+    return TextArray._from_sequence([np.nan, "A"])
 
 
 @pytest.fixture
 def data_for_sorting():
-    return StringArray._from_sequence(["B", "C", "A"])
+    return TextArray._from_sequence(["B", "C", "A"])
 
 
 @pytest.fixture
 def data_missing_for_sorting():
-    return StringArray._from_sequence(["B", np.nan, "A"])
+    return TextArray._from_sequence(["B", np.nan, "A"])
 
 
 @pytest.fixture
@@ -45,7 +45,7 @@ def na_value():
 
 @pytest.fixture
 def data_for_grouping():
-    return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"])
+    return TextArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"])
 
 
 class TestDtype(base.BaseDtypeTests):
@@ -80,7 +80,7 @@ class TestNoReduce(base.BaseNoReduceTests):
     @pytest.mark.parametrize("skipna", [True, False])
     def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
         if all_numeric_reductions == "sum":
-            pytest.skip("StringArray implements sum")
+            pytest.skip("TextArray implements sum")
         else:
             return super().test_reduce_series_numeric(
                 data, all_numeric_reductions, skipna
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index b50f1a0fd2f2a..d0e18bd53b1b8 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -141,6 +141,7 @@ def any_string_method(request):
 # subset of the full set from pandas/conftest.py
 _any_allowed_skipna_inferred_dtype = [
     ("string", ["a", np.nan, "c"]),
+    ("text", ["a", np.nan, "c"]),
     ("bytes", [b"a", np.nan, b"c"]),
     ("empty", [np.nan, np.nan, np.nan]),
     ("empty", []),
@@ -156,6 +157,7 @@ def any_allowed_skipna_inferred_dtype(request):
 
     The covered (inferred) types are:
     * 'string'
+    * 'text'
     * 'empty'
     * 'bytes'
     * 'mixed'
@@ -221,6 +223,7 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype):
 
         types_passing_constructor = [
             "string",
+            "text",
             "unicode",
             "empty",
             "bytes",
@@ -283,7 +286,7 @@ def test_api_per_method(
         mixed_allowed = method_name not in ["cat"]
 
         allowed_types = (
-            ["string", "unicode", "empty"]
+            ["string", "unicode", "empty", "text"]
             + ["bytes"] * bytes_allowed
             + ["mixed", "mixed-integer"] * mixed_allowed
         )
@@ -3276,7 +3279,7 @@ def test_casefold(self):
 def test_string_array(any_string_method):
     data = ["a", "bb", np.nan, "ccc"]
     a = Series(data, dtype=object)
-    b = Series(data, dtype="string")
+    b = Series(data, dtype="text")
     method_name, args, kwargs = any_string_method
 
     expected = getattr(a.str, method_name)(*args, **kwargs)
@@ -3286,10 +3289,10 @@ def test_string_array(any_string_method):
         if expected.dtype == "object" and lib.is_string_array(
             expected.values, skipna=True
         ):
-            assert result.dtype == "string"
+            assert result.dtype == "text"
             result = result.astype(object)
     elif isinstance(expected, DataFrame):
         columns = expected.select_dtypes(include="object").columns
-        assert all(result[columns].dtypes == "string")
+        assert all(result[columns].dtypes == "text")
         result[columns] = result[columns].astype(object)
     tm.assert_equal(result, expected)

From 903ea2f8bf02acd515f7b0b729047f10f12709ff Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 16:47:16 -0500
Subject: [PATCH 32/49] move

---
 pandas/core/api.py                                              | 2 +-
 pandas/core/arrays/__init__.py                                  | 2 +-
 pandas/core/arrays/{string_.py => text.py}                      | 0
 pandas/tests/arrays/text/__init__.py                            | 0
 .../tests/arrays/{string_/test_string.py => text/test_text.py}  | 0
 pandas/tests/extension/{test_string.py => test_text.py}         | 2 +-
 6 files changed, 3 insertions(+), 3 deletions(-)
 rename pandas/core/arrays/{string_.py => text.py} (100%)
 create mode 100644 pandas/tests/arrays/text/__init__.py
 rename pandas/tests/arrays/{string_/test_string.py => text/test_text.py} (100%)
 rename pandas/tests/extension/{test_string.py => test_text.py} (97%)

diff --git a/pandas/core/api.py b/pandas/core/api.py
index e17602b8943f5..83ecf675d0389 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -23,7 +23,7 @@
     UInt32Dtype,
     UInt64Dtype,
 )
-from pandas.core.arrays.string_ import TextDtype
+from pandas.core.arrays.text import TextDtype
 from pandas.core.construction import array
 from pandas.core.groupby import Grouper, NamedAgg
 from pandas.core.index import (
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
index a7989cd154d04..0df154dc07322 100644
--- a/pandas/core/arrays/__init__.py
+++ b/pandas/core/arrays/__init__.py
@@ -10,5 +10,5 @@
 from .numpy_ import PandasArray, PandasDtype  # noqa: F401
 from .period import PeriodArray, period_array  # noqa: F401
 from .sparse import SparseArray  # noqa: F401
-from .string_ import TextArray  # noqa: F401
+from .text import TextArray  # noqa: F401
 from .timedeltas import TimedeltaArray  # noqa: F401
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/text.py
similarity index 100%
rename from pandas/core/arrays/string_.py
rename to pandas/core/arrays/text.py
diff --git a/pandas/tests/arrays/text/__init__.py b/pandas/tests/arrays/text/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/text/test_text.py
similarity index 100%
rename from pandas/tests/arrays/string_/test_string.py
rename to pandas/tests/arrays/text/test_text.py
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_text.py
similarity index 97%
rename from pandas/tests/extension/test_string.py
rename to pandas/tests/extension/test_text.py
index 3489722b4538e..24337b86d5e3a 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_text.py
@@ -4,7 +4,7 @@
 import pytest
 
 import pandas as pd
-from pandas.core.arrays.string_ import TextArray, TextDtype
+from pandas.core.arrays.text import TextArray, TextDtype
 from pandas.tests.extension import base
 
 

From 0e1f479bdda2b7f390084f8398345c6b408057b6 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 16:51:56 -0500
Subject: [PATCH 33/49] missed

---
 doc/source/reference/arrays.rst | 2 +-
 doc/source/whatsnew/v1.0.0.rst  | 8 ++++----
 pandas/core/strings.py          | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index 8f8e1e1c1ea2e..5f3be7830d027 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -467,7 +467,7 @@ Text data
 ---------
 
 When working with text data, where each valid element is a string, we recommend using
-:class:`TextDtype` (with the alias ``"string"``).
+:class:`TextDtype` (with the alias ``"text"``).
 
 .. autosummary::
    :toctree: api/
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 3f26e661660a8..0abfee0d1e904 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -52,8 +52,8 @@ Enhancements
 
 .. _whatsnew_1000.string:
 
-Dedicated string data type
-^^^^^^^^^^^^^^^^^^^^^^^^^^
+Dedicated text data type
+^^^^^^^^^^^^^^^^^^^^^^^^
 
 We've added :class:`TextDtype`, an extension type dedicated to string data.
 Previously, strings were typically stored in object-dtype NumPy arrays.
@@ -62,11 +62,11 @@ Previously, strings were typically stored in object-dtype NumPy arrays.
 
    pd.Series(['abc', None, 'def'], dtype=pd.TextDtype())
 
-You can use the alias ``'string'`` as well.
+You can use the alias ``"text"`` as well.
 
 .. ipython:: python
 
-   s = pd.Series(['abc', None, 'def'], dtype="string")
+   s = pd.Series(['abc', None, 'def'], dtype="text")
    s
 
 The usual string accessor methods work. Where appropriate, the return type
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 2ee4438024c32..f50ef530bde86 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -1999,7 +1999,7 @@ def _validate(data):
 
         # explicitly allow TextDtype
         if isinstance(values.dtype, TextDtype):
-            return "string"
+            return "text"
 
         try:
             inferred_dtype = lib.infer_dtype(values, skipna=True)

From c168ecf26d91dca519e586e553dfc2ee1fc1d6db Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 23 Sep 2019 16:53:16 -0500
Subject: [PATCH 34/49] missed

---
 doc/source/user_guide/text.rst | 4 ++--
 doc/source/whatsnew/v1.0.0.rst | 2 +-
 pandas/core/strings.py         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index 646dac579fe54..502a40fd15144 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -29,7 +29,7 @@ for many reasons:
    There isn't a clear way to select *just* text while excluding non-text
    but still object-dtype columns.
 3. When reading code, the contents of an ``object`` dtype array is less clear
-   than ``string``.
+   than ``text``.
 
 Currently, the performance of ``object`` dtype arrays of strings and
 :class:`arrays.TextArray` are about the same. We expect future enhancements
@@ -63,7 +63,7 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
    s.astype("text")
 
 Everything that follows in the rest of this document applies equally to
-``string`` and ``object`` dtype.
+``text`` and ``object`` dtype.
 
 .. _text.string_methods:
 
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 0abfee0d1e904..1e49e913ef52e 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -75,7 +75,7 @@ of the Series or columns of a DataFrame will also have string dtype.
    s.str.upper()
    s.str.split('b', expand=True).dtypes
 
-We recommend explicitly using the ``string`` data type when working with strings.
+We recommend explicitly using the ``text`` data type when working with strings.
 See :ref:`text.types` for more.
 
 .. _whatsnew_1000.enhancements.other:
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index f50ef530bde86..2ee4438024c32 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -1999,7 +1999,7 @@ def _validate(data):
 
         # explicitly allow TextDtype
         if isinstance(values.dtype, TextDtype):
-            return "text"
+            return "string"
 
         try:
             inferred_dtype = lib.infer_dtype(values, skipna=True)

From d06ba7348471affffc7dd62953ee6485219892f0 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 06:50:44 -0500
Subject: [PATCH 35/49] fixup rename

---
 doc/source/reference/arrays.rst | 2 +-
 doc/source/user_guide/text.rst  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index 5f3be7830d027..d13f8c21c5f8e 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -24,7 +24,7 @@ Intervals           :class:`IntervalDtype`    :class:`Interval`  :ref:`api.array
 Nullable Integer    :class:`Int64Dtype`, ...  (none)             :ref:`api.arrays.integer_na`
 Categorical         :class:`CategoricalDtype` (none)             :ref:`api.arrays.categorical`
 Sparse              :class:`SparseDtype`      (none)             :ref:`api.arrays.sparse`
-Text                :class:`TextDtype`      :class:`str`       :ref:`api.arrays.string`
+Text                :class:`TextDtype`        :class:`str`       :ref:`api.arrays.string`
 =================== ========================= ================== =============================
 
 Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`).
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index 502a40fd15144..e380884604801 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -470,7 +470,7 @@ Named groups like
 
    pd.Series(['a1', 'b2', 'c3'],
              dtype="text").str.extract(r'(?P<letter>[ab])(?P<digit>\d)',
-                                         expand=False)
+                                       expand=False)
 
 and optional groups like
 

From 3ba27c33829e5f042c29d59efd96dea773abc97d Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 07:58:01 -0500
Subject: [PATCH 36/49] fixup

---
 pandas/core/arrays/numpy_.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index a979135f1b9d7..2ef4f774d2985 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -10,7 +10,7 @@
 
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
-from pandas.core.dtypes.inference import is_array_like, is_list_like
+from pandas.core.dtypes.inference import is_array_like
 from pandas.core.dtypes.missing import isna
 
 from pandas import compat
@@ -230,12 +230,11 @@ def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)
 
         scalar_key = lib.is_scalar(key)
-        scalar_value = lib.is_scalar(value)
 
-        if not scalar_key and is_list_like(key):
+        if not scalar_key:
             key = np.asarray(key)
 
-        if not scalar_value:
+        if not lib.is_scalar(value):
             value = np.asarray(value, dtype=self._ndarray.dtype)
 
         self._ndarray[key] = value

From fe8ee77bcf56db959a406bf763c051887754d1cf Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 08:02:31 -0500
Subject: [PATCH 37/49] doctest

---
 ci/code_checks.sh          | 4 ++++
 pandas/core/arrays/text.py | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index b03c4f2238445..3fc95efad1905 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -262,6 +262,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
         -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range"
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
+    MSG='Doctests arrays/text.py' ; echo $MSG
+    pytest -q --doctest-modules pandas/core/arrays/text.py
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
 fi
 
 ### DOCSTRINGS ###
diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py
index 6de2fee0050ca..8ad63dc13e893 100644
--- a/pandas/core/arrays/text.py
+++ b/pandas/core/arrays/text.py
@@ -132,7 +132,7 @@ class TextArray(PandasArray):
     >>> pd.array(['This is', 'some text', None, 'data.'], dtype="text")
     <TextArray>
     ['This is', 'some text', nan, 'data.']
-    Length: 4, dtype: string
+    Length: 4, dtype: text
 
     Unlike ``object`` dtype arrays, ``TextArray`` doesn't allow non-string
     values.
@@ -140,7 +140,7 @@ class TextArray(PandasArray):
     >>> pd.array(['1', 1], dtype="text")
     Traceback (most recent call last):
     ...
-    ValueError: Must provide strings
+    ValueError: TextArray requires an object-dtype ndarray of strings.
     """
 
     # undo the PandasArray hack

From d9f63aadc20c3bb53112790e7a1227bc90d13d3b Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 08:50:03 -0500
Subject: [PATCH 38/49] updates

---
 doc/source/getting_started/basics.rst |  4 ++--
 doc/source/reference/arrays.rst       |  4 ++--
 doc/source/user_guide/text.rst        |  9 +++++----
 doc/source/whatsnew/v1.0.0.rst        | 16 ++++++++++++++++
 pandas/core/arrays/text.py            |  6 +++---
 pandas/tests/extension/test_text.py   |  8 ++++++++
 6 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index 37ced806df406..cc1cbeec9b788 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -1938,9 +1938,9 @@ Text                :class:`TextDtype`        :class:`str`       :class:`arrays.
 Pandas has two ways to store strings.
 
 1. ``object`` dtype, which can hold any Python object, including strings.
-2. :class:`arrays.TextArray`, which is dedicated to strings.
+2. :class:`TextDtype`, which is dedicated to strings.
 
-Generally, we recommend using :class:`arrays.TextArray`. See :ref:`text.types` fore more.
+Generally, we recommend using :class:`TextDtype`. See :ref:`text.types` fore more.
 
 Finally, arbitrary objects may be stored using the ``object`` dtype, but should
 be avoided to the extent possible (for performance and interoperability with
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index d13f8c21c5f8e..81d6cb2e54c2c 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -466,8 +466,8 @@ and methods if the :class:`Series` contains sparse values. See
 Text data
 ---------
 
-When working with text data, where each valid element is a string, we recommend using
-:class:`TextDtype` (with the alias ``"text"``).
+When working with text data, where each valid element is a string or missing,
+we recommend using :class:`TextDtype` (with the alias ``"text"``).
 
 .. autosummary::
    :toctree: api/
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index e380884604801..ca00be77761a7 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -16,9 +16,9 @@ Text Data Types
 There are two main ways to store text data
 
 1. ``object`` -dtype NumPy array.
-2. As an :class:`arrays.TextArray` extension type.
+2. :class:`TextDtype` extension type.
 
-We recommend using :class:`arrays.TextArray` to store text data.
+We recommend using :class:`TextDtype` to store text data.
 
 Prior to pandas 1.0, ``object`` dtype was the only option. This was unfortunate
 for many reasons:
@@ -34,11 +34,12 @@ for many reasons:
 Currently, the performance of ``object`` dtype arrays of strings and
 :class:`arrays.TextArray` are about the same. We expect future enhancements
 to significantly increase the performance and lower the memory overhead of
-:class:`arrays.TextArray`.
+:class:`~arrays.TextArray`.
 
 .. warning::
 
-   TextArray is currently considered experimental.
+   ``TextArray`` is currently considered experimental. The implementation
+   and parts of the API may change without warning.
 
 For backwards-compatibility, ``object`` dtype remains the default type we
 infer a list of strings to
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 1e49e913ef52e..1daa3c95a99c3 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -58,6 +58,22 @@ Dedicated text data type
 We've added :class:`TextDtype`, an extension type dedicated to string data.
 Previously, strings were typically stored in object-dtype NumPy arrays.
 
+.. warning::
+
+   ``TextDtype`` is currently considered experimental. The implementation
+   and parts of the API may change without warning.
+
+The text extension type solves several issues with object-dtype NumPy arrays:
+
+1. You can accidentally store a *mixture* of strings and non-strings in an
+   ``object`` dtype array. A ``TextArray`` can only store strings.
+2. ``object`` dtype breaks dtype-specific operations like :meth:`DataFrame.select_dtypes`.
+   There isn't a clear way to select *just* text while excluding non-text,
+   but still object-dtype columns.
+3. When reading code, the contents of an ``object`` dtype array is less clear
+   than ``text``.
+
+
 .. ipython:: python
 
    pd.Series(['abc', None, 'def'], dtype=pd.TextDtype())
diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py
index 8ad63dc13e893..b064b5971fdb3 100644
--- a/pandas/core/arrays/text.py
+++ b/pandas/core/arrays/text.py
@@ -157,8 +157,8 @@ def _validate(self):
             raise ValueError("TextArray requires an object-dtype ndarray of strings.")
         if self._ndarray.dtype != "object":
             raise ValueError(
-                "TextArray requires an object-dtype ndarray. Got "
-                "'{}' instead.".format(self._ndarray.dtype)
+                "TextArray requires a sequence of strings. Got "
+                "'{}' dtype instead.".format(self._ndarray.dtype)
             )
 
     @classmethod
@@ -193,7 +193,7 @@ def __setitem__(self, key, value):
                 value = np.nan
             elif not (isinstance(value, str) or np.isnan(value)):
                 raise ValueError(
-                    "Cannot set value '{}' into a TextArray.".format(value)
+                    "Cannot set non-string value '{}' into a TextArray.".format(value)
                 )
         else:
             if not is_array_like(value):
diff --git a/pandas/tests/extension/test_text.py b/pandas/tests/extension/test_text.py
index 24337b86d5e3a..f0363d7ede470 100644
--- a/pandas/tests/extension/test_text.py
+++ b/pandas/tests/extension/test_text.py
@@ -109,3 +109,11 @@ def test_compare_scalar(self, data, all_compare_operators):
 
 class TestParsing(base.BaseParsingTests):
     pass
+
+
+class TestPrinting(base.BasePrintingTests):
+    pass
+
+
+class TestGroupBy(base.BaseGroupbyTests):
+    pass

From d3c49e2394a6695ffbf731535b80705242455eef Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 08:57:03 -0500
Subject: [PATCH 39/49] fixups

---
 pandas/core/arrays/numpy_.py          | 5 +++--
 pandas/core/arrays/text.py            | 4 +++-
 pandas/tests/arrays/text/test_text.py | 4 ++--
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 2ef4f774d2985..bf7404e8997c6 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -230,11 +230,12 @@ def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)
 
         scalar_key = lib.is_scalar(key)
+        scalar_value = lib.is_scalar(value)
 
-        if not scalar_key:
+        if not scalar_key and scalar_value:
             key = np.asarray(key)
 
-        if not lib.is_scalar(value):
+        if not scalar_value:
             value = np.asarray(value, dtype=self._ndarray.dtype)
 
         self._ndarray[key] = value
diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py
index b064b5971fdb3..7f067e53e1883 100644
--- a/pandas/core/arrays/text.py
+++ b/pandas/core/arrays/text.py
@@ -154,7 +154,9 @@ def __init__(self, values, copy=False):
     def _validate(self):
         """Validate that we only store NA or strings."""
         if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError("TextArray requires an object-dtype ndarray of strings.")
+            raise ValueError(
+                "TextArray requires a sequence of strings or missing values."
+            )
         if self._ndarray.dtype != "object":
             raise ValueError(
                 "TextArray requires a sequence of strings. Got "
diff --git a/pandas/tests/arrays/text/test_text.py b/pandas/tests/arrays/text/test_text.py
index 8ef1533a73e19..4302f51043638 100644
--- a/pandas/tests/arrays/text/test_text.py
+++ b/pandas/tests/arrays/text/test_text.py
@@ -123,10 +123,10 @@ def test_add_frame():
 
 
 def test_constructor_raises():
-    with pytest.raises(ValueError, match="object-dtype ndarray"):
+    with pytest.raises(ValueError, match="sequence of strings"):
         pd.arrays.TextArray(np.array(["a", "b"], dtype="S1"))
 
-    with pytest.raises(ValueError, match="object-dtype ndarray"):
+    with pytest.raises(ValueError, match="sequence of strings"):
         pd.arrays.TextArray(np.array([]))
 
 

From 43b51cdafb46f827834de72b09bade6adcf7baef Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 13:44:38 -0500
Subject: [PATCH 40/49] length check

---
 pandas/core/arrays/text.py            | 19 ++++++++++++-------
 pandas/tests/arrays/text/test_text.py | 11 +++++++++++
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py
index 7f067e53e1883..c0e9a2128e3f5 100644
--- a/pandas/core/arrays/text.py
+++ b/pandas/core/arrays/text.py
@@ -234,7 +234,7 @@ def value_counts(self, dropna=False):
 
         return value_counts(self._ndarray, dropna=dropna)
 
-    # Overrride parent, because we have different return types.
+    # Override parent because we have different return types.
     @classmethod
     def _create_arithmetic_method(cls, op):
         def method(self, other):
@@ -248,6 +248,14 @@ def method(self, other):
             valid = ~mask
 
             if not lib.is_scalar(other):
+                if len(other) != len(self):
+                    # prevent improper broadcasting when other is 2D
+                    raise ValueError(
+                        "Lengths of operands do not match: {} != {}".format(
+                            len(self), len(other)
+                        )
+                    )
+
                 other = np.asarray(other)
                 other = other[valid]
 
@@ -256,13 +264,10 @@ def method(self, other):
             result[valid] = op(self._ndarray[valid], other)
 
             if op.__name__ in {"add", "radd", "mul", "rmul"}:
-                new = TextArray
-            elif mask.any():
-                new = lambda x: np.asarray(x, dtype="object")
+                return TextArray(result)
             else:
-                new = lambda x: np.asarray(x, dtype="bool")
-
-            return new(result)
+                dtype = "object" if mask.any() else "bool"
+                return np.asarray(result, dtype=dtype)
 
         return compat.set_function_name(method, "__{}__".format(op.__name__), cls)
 
diff --git a/pandas/tests/arrays/text/test_text.py b/pandas/tests/arrays/text/test_text.py
index 4302f51043638..cd18c58e249bc 100644
--- a/pandas/tests/arrays/text/test_text.py
+++ b/pandas/tests/arrays/text/test_text.py
@@ -68,6 +68,17 @@ def test_add():
     tm.assert_series_equal(result, expected)
 
 
+def test_add_2d():
+    a = pd.array(["a", "b", "c"], dtype="text")
+    b = np.array([["a", "b", "c"]], dtype=object)
+    with pytest.raises(ValueError, match="3 != 1"):
+        a + b
+
+    s = pd.Series(a)
+    with pytest.raises(ValueError, match="3 != 1"):
+        s + b
+
+
 def test_add_sequence():
     a = pd.array(["a", "b", None, None], dtype="text")
     other = ["x", None, "y", None]

From 4fd2d11f4eca70828045ce77a671dcf6b325b0fe Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 24 Sep 2019 13:46:19 -0500
Subject: [PATCH 41/49] unimplement sum

---
 pandas/core/arrays/text.py            | 11 +----------
 pandas/tests/arrays/text/test_text.py |  2 ++
 pandas/tests/extension/test_text.py   |  9 +--------
 3 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py
index c0e9a2128e3f5..00458fc4c31e5 100644
--- a/pandas/core/arrays/text.py
+++ b/pandas/core/arrays/text.py
@@ -218,16 +218,7 @@ def astype(self, dtype, copy=True):
         return super().astype(dtype, copy)
 
     def _reduce(self, name, skipna=True, **kwargs):
-        if name == "sum":
-            vals = self._ndarray
-            missing = self.isna()
-            if skipna:
-                vals = vals[~missing]
-            elif missing.any():
-                return np.nan
-            return vals.sum()
-
-        raise TypeError("Cannot perform reduction '{}' with string dtype".format(name))
+        raise TypeError("Cannot perform reduction '{}' with text dtype".format(name))
 
     def value_counts(self, dropna=False):
         from pandas import value_counts
diff --git a/pandas/tests/arrays/text/test_text.py b/pandas/tests/arrays/text/test_text.py
index cd18c58e249bc..50208d655a547 100644
--- a/pandas/tests/arrays/text/test_text.py
+++ b/pandas/tests/arrays/text/test_text.py
@@ -142,6 +142,7 @@ def test_constructor_raises():
 
 
 @pytest.mark.parametrize("skipna", [True, False])
+@pytest.mark.xfail(reason="Not implemented TextArray.sum")
 def test_reduce(skipna):
     arr = pd.Series(["a", "b", "c"], dtype="text")
     result = arr.sum(skipna=skipna)
@@ -149,6 +150,7 @@ def test_reduce(skipna):
 
 
 @pytest.mark.parametrize("skipna", [True, False])
+@pytest.mark.xfail(reason="Not implemented TextArray.sum")
 def test_reduce_missing(skipna):
     arr = pd.Series([None, "a", None, "b", "c", None], dtype="text")
     result = arr.sum(skipna=skipna)
diff --git a/pandas/tests/extension/test_text.py b/pandas/tests/extension/test_text.py
index f0363d7ede470..dc0d0cac06489 100644
--- a/pandas/tests/extension/test_text.py
+++ b/pandas/tests/extension/test_text.py
@@ -77,14 +77,7 @@ class TestMissing(base.BaseMissingTests):
 
 
 class TestNoReduce(base.BaseNoReduceTests):
-    @pytest.mark.parametrize("skipna", [True, False])
-    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
-        if all_numeric_reductions == "sum":
-            pytest.skip("TextArray implements sum")
-        else:
-            return super().test_reduce_series_numeric(
-                data, all_numeric_reductions, skipna
-            )
+    pass
 
 
 class TestMethods(base.BaseMethodsTests):

From 8714a53510229a93443e83053e8262b633475519 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Mon, 30 Sep 2019 15:25:40 -0500
Subject: [PATCH 42/49] fixup

---
 doc/source/whatsnew/v1.0.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 055a1d3269520..1fe44e75c49a3 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -50,7 +50,7 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_1000.string:
+.. _whatsnew_100.text:
 
 Dedicated text data type
 ^^^^^^^^^^^^^^^^^^^^^^^^

From dc9ef3cffb5caca17eaa23245db67581c72b6c6f Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 1 Oct 2019 07:11:39 -0500
Subject: [PATCH 43/49] rename

---
 doc/source/getting_started/basics.rst | 10 ++--
 doc/source/reference/arrays.rst       | 10 ++--
 doc/source/user_guide/text.rst        | 16 +++----
 doc/source/whatsnew/v1.0.0.rst        | 18 ++++----
 pandas/__init__.py                    |  2 +-
 pandas/arrays/__init__.py             |  4 +-
 pandas/core/api.py                    |  2 +-
 pandas/core/arrays/__init__.py        |  2 +-
 pandas/core/arrays/text.py            | 66 +++++++++++++--------------
 pandas/core/strings.py                | 18 ++++----
 pandas/tests/api/test_api.py          |  2 +-
 pandas/tests/arrays/text/test_text.py | 56 +++++++++++------------
 pandas/tests/extension/test_text.py   | 14 +++---
 pandas/tests/test_strings.py          | 11 ++---
 14 files changed, 114 insertions(+), 117 deletions(-)

diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index cc1cbeec9b788..2818011eb02ca 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -986,7 +986,7 @@ not noted for a particular column will be ``NaN``:
 
    tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'})
 
-.. _basics.aggregation.mixed_dtypes:
+.. _basics.aggregation.mixed_dtypes:
 
 Mixed dtypes
 ++++++++++++
@@ -1716,7 +1716,7 @@ always uses them).
 .. note::
 
    Prior to pandas 1.0, string methods were only available on ``object`` -dtype
-   ``Series``. Pandas 1.0 added the :class:`TextDtype` which is dedicated
+   ``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated
    to strings. See :ref:`text.types` for more.
 
 Please see :ref:`Vectorized String Methods <text.string_methods>` for a complete
@@ -1932,15 +1932,15 @@ period (time spans) :class:`PeriodDtype`      :class:`Period`    :class:`arrays.
 sparse              :class:`SparseDtype`      (none)             :class:`arrays.SparseArray`   :ref:`sparse`
 intervals           :class:`IntervalDtype`    :class:`Interval`  :class:`arrays.IntervalArray` :ref:`advanced.intervalindex`
 nullable integer    :class:`Int64Dtype`, ...  (none)             :class:`arrays.IntegerArray`  :ref:`integer_na`
-Text                :class:`TextDtype`        :class:`str`       :class:`arrays.TextArray`     :ref:`text`
+Strings             :class:`StringDtype`      :class:`str`       :class:`arrays.StringArray`   :ref:`text`
 =================== ========================= ================== ============================= =============================
 
 Pandas has two ways to store strings.
 
 1. ``object`` dtype, which can hold any Python object, including strings.
-2. :class:`TextDtype`, which is dedicated to strings.
+2. :class:`StringDtype`, which is dedicated to strings.
 
-Generally, we recommend using :class:`TextDtype`. See :ref:`text.types` fore more.
+Generally, we recommend using :class:`StringDtype`. See :ref:`text.types` for more.
 
 Finally, arbitrary objects may be stored using the ``object`` dtype, but should
 be avoided to the extent possible (for performance and interoperability with
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index 81d6cb2e54c2c..0c435e06ac57f 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -24,7 +24,7 @@ Intervals           :class:`IntervalDtype`    :class:`Interval`  :ref:`api.array
 Nullable Integer    :class:`Int64Dtype`, ...  (none)             :ref:`api.arrays.integer_na`
 Categorical         :class:`CategoricalDtype` (none)             :ref:`api.arrays.categorical`
 Sparse              :class:`SparseDtype`      (none)             :ref:`api.arrays.sparse`
-Text                :class:`TextDtype`        :class:`str`       :ref:`api.arrays.string`
+Strings             :class:`StringDtype`      :class:`str`       :ref:`api.arrays.string`
 =================== ========================= ================== =============================
 
 Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`).
@@ -467,21 +467,21 @@ Text data
 ---------
 
 When working with text data, where each valid element is a string or missing,
-we recommend using :class:`TextDtype` (with the alias ``"text"``).
+we recommend using :class:`StringDtype` (with the alias ``"string"``).
 
 .. autosummary::
    :toctree: api/
    :template: autosummary/class_without_autosummary.rst
 
-   arrays.TextArray
+   arrays.StringArray
 
 .. autosummary::
    :toctree: api/
    :template: autosummary/class_without_autosummary.rst
 
-   TextDtype
+   StringDtype
 
-The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.TextArray`.
+The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.StringArray`.
 See :ref:`api.series.str` for more.
 
 
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index ca00be77761a7..c8c8a6d57eb7f 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -16,9 +16,9 @@ Text Data Types
 There are two main ways to store text data
 
 1. ``object`` -dtype NumPy array.
-2. :class:`TextDtype` extension type.
+2. :class:`StringDtype` extension type.
 
-We recommend using :class:`TextDtype` to store text data.
+We recommend using :class:`StringDtype` to store text data.
 
 Prior to pandas 1.0, ``object`` dtype was the only option. This was unfortunate
 for many reasons:
@@ -32,13 +32,13 @@ for many reasons:
    than ``text``.
 
 Currently, the performance of ``object`` dtype arrays of strings and
-:class:`arrays.TextArray` are about the same. We expect future enhancements
+:class:`arrays.StringArray` are about the same. We expect future enhancements
 to significantly increase the performance and lower the memory overhead of
-:class:`~arrays.TextArray`.
+:class:`~arrays.StringArray`.
 
 .. warning::
 
-   ``TextArray`` is currently considered experimental. The implementation
+   ``StringArray`` is currently considered experimental. The implementation
    and parts of the API may change without warning.
 
 For backwards-compatibility, ``object`` dtype remains the default type we
@@ -53,7 +53,7 @@ To explicitly request ``text`` dtype, specify the ``dtype``
 .. ipython:: python
 
    pd.Series(['a', 'b', 'c'], dtype="text")
-   pd.Series(['a', 'b', 'c'], dtype=pd.TextDtype())
+   pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype())
 
 Or ``astype`` after the ``Series`` or ``DataFrame`` is created
 
@@ -170,8 +170,8 @@ It is easy to expand this to return a DataFrame using ``expand``.
 
    s2.str.split('_', expand=True)
 
-When original ``Series`` has :class:`TextDtype`, the output columns will all
-be :class:`TextDtype` as well.
+When original ``Series`` has :class:`StringDtype`, the output columns will all
+be :class:`StringDtype` as well.
 
 It is also possible to limit the number of splits:
 
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 8beaaf27d8cd2..22f9617d6dcc0 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -50,39 +50,39 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
-.. _whatsnew_100.text:
+.. _whatsnew_100.string:
 
 Dedicated text data type
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
-We've added :class:`TextDtype`, an extension type dedicated to string data.
+We've added :class:`StringDtype`, an extension type dedicated to string data.
 Previously, strings were typically stored in object-dtype NumPy arrays.
 
 .. warning::
 
-   ``TextDtype`` and is currently considered experimental. The implementation
+   ``StringDtype`` and is currently considered experimental. The implementation
    and parts of the API may change without warning.
 
 The text extension type solves several issues with object-dtype NumPy arrays:
 
 1. You can accidentally store a *mixture* of strings and non-strings in an
-   ``object`` dtype array. A ``TextArray`` can only store strings.
+   ``object`` dtype array. A ``StringArray`` can only store strings.
 2. ``object`` dtype breaks dtype-specific operations like :meth:`DataFrame.select_dtypes`.
    There isn't a clear way to select *just* text while excluding non-text,
    but still object-dtype columns.
 3. When reading code, the contents of an ``object`` dtype array is less clear
-   than ``text``.
+   than ``string``.
 
 
 .. ipython:: python
 
-   pd.Series(['abc', None, 'def'], dtype=pd.TextDtype())
+   pd.Series(['abc', None, 'def'], dtype=pd.StringDtype())
 
-You can use the alias ``"text"`` as well.
+You can use the alias ``"string"`` as well.
 
 .. ipython:: python
 
-   s = pd.Series(['abc', None, 'def'], dtype="text")
+   s = pd.Series(['abc', None, 'def'], dtype="string")
    s
 
 The usual string accessor methods work. Where appropriate, the return type
@@ -91,7 +91,7 @@ of the Series or columns of a DataFrame will also have string dtype.
    s.str.upper()
    s.str.split('b', expand=True).dtypes
 
-We recommend explicitly using the ``text`` data type when working with strings.
+We recommend explicitly using the ``string`` data type when working with strings.
 See :ref:`text.types` for more.
 
 .. _whatsnew_1000.enhancements.other:
diff --git a/pandas/__init__.py b/pandas/__init__.py
index da4d5106f7aa7..5d163e411c0ac 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -66,7 +66,7 @@
     PeriodDtype,
     IntervalDtype,
     DatetimeTZDtype,
-    TextDtype,
+    StringDtype,
     # missing
     isna,
     isnull,
diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py
index 24cb6a9c5acf7..9870b5bed076d 100644
--- a/pandas/arrays/__init__.py
+++ b/pandas/arrays/__init__.py
@@ -11,7 +11,7 @@
     PandasArray,
     PeriodArray,
     SparseArray,
-    TextArray,
+    StringArray,
     TimedeltaArray,
 )
 
@@ -23,6 +23,6 @@
     "PandasArray",
     "PeriodArray",
     "SparseArray",
-    "TextArray",
+    "StringArray",
     "TimedeltaArray",
 ]
diff --git a/pandas/core/api.py b/pandas/core/api.py
index 83ecf675d0389..9d4be9c075122 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -23,7 +23,7 @@
     UInt32Dtype,
     UInt64Dtype,
 )
-from pandas.core.arrays.text import TextDtype
+from pandas.core.arrays.text import StringDtype
 from pandas.core.construction import array
 from pandas.core.groupby import Grouper, NamedAgg
 from pandas.core.index import (
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
index 0df154dc07322..384bbc2318c82 100644
--- a/pandas/core/arrays/__init__.py
+++ b/pandas/core/arrays/__init__.py
@@ -10,5 +10,5 @@
 from .numpy_ import PandasArray, PandasDtype  # noqa: F401
 from .period import PeriodArray, period_array  # noqa: F401
 from .sparse import SparseArray  # noqa: F401
-from .text import TextArray  # noqa: F401
+from .text import StringArray  # noqa: F401
 from .timedeltas import TimedeltaArray  # noqa: F401
diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/text.py
index 00458fc4c31e5..41c7d81bd55da 100644
--- a/pandas/core/arrays/text.py
+++ b/pandas/core/arrays/text.py
@@ -22,18 +22,18 @@
 
 
 @register_extension_dtype
-class TextDtype(ExtensionDtype):
+class StringDtype(ExtensionDtype):
     """
-    Extension dtype for text data.
+    Extension dtype for string data.
 
     .. versionadded:: 1.0.0
 
     .. warning::
 
-       TextDtype is considered experimental. The implementation and
+       StringDtype is considered experimental. The implementation and
        parts of the API may change without warning.
 
-       In particular, TextDtype.na_value may change to no longer be
+       In particular, StringDtype.na_value may change to no longer be
        ``numpy.nan``.
 
     Attributes
@@ -46,14 +46,14 @@ class TextDtype(ExtensionDtype):
 
     Examples
     --------
-    >>> pd.TextDtype()
-    TextDtype
+    >>> pd.StringDtype()
+    StringDtype
     """
 
     @property
     def na_value(self) -> "Scalar":
         """
-        TextDtype uses :attr:`numpy.nan` as the missing NA value.
+        StringDtype uses :attr:`numpy.nan` as the missing NA value.
 
         .. warning::
 
@@ -68,33 +68,33 @@ def type(self) -> Type:
     @property
     def name(self) -> str:
         """
-        The alias for TextDtype is ``'text'``.
+        The alias for StringDtype is ``'string'``.
         """
-        return "text"
+        return "string"
 
     @classmethod
     def construct_from_string(cls, string: str) -> ExtensionDtype:
-        if string == "text":
+        if string == "string":
             return cls()
         return super().construct_from_string(string)
 
     @classmethod
-    def construct_array_type(cls) -> "Type[TextArray]":
-        return TextArray
+    def construct_array_type(cls) -> "Type[StringArray]":
+        return StringArray
 
     def __repr__(self) -> str:
-        return "TextDtype"
+        return "StringDtype"
 
 
-class TextArray(PandasArray):
+class StringArray(PandasArray):
     """
-    Extension array for text data.
+    Extension array for string data.
 
     .. versionadded:: 1.0.0
 
     .. warning::
 
-       TextArray is considered experimental. The implementation and
+       StringArray is considered experimental. The implementation and
        parts of the API may change without warning.
 
        In particular, the NA value used may change to no longer be
@@ -125,22 +125,22 @@ class TextArray(PandasArray):
     --------
     Series.str
         The string methods are available on Series backed by
-        a TextArray.
+        a StringArray.
 
     Examples
     --------
-    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="text")
-    <TextArray>
+    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
+    <StringArray>
     ['This is', 'some text', nan, 'data.']
-    Length: 4, dtype: text
+    Length: 4, dtype: string
 
-    Unlike ``object`` dtype arrays, ``TextArray`` doesn't allow non-string
+    Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string
     values.
 
-    >>> pd.array(['1', 1], dtype="text")
+    >>> pd.array(['1', 1], dtype="string")
     Traceback (most recent call last):
     ...
-    ValueError: TextArray requires an object-dtype ndarray of strings.
+    ValueError: StringArray requires an object-dtype ndarray of strings.
     """
 
     # undo the PandasArray hack
@@ -148,25 +148,25 @@ class TextArray(PandasArray):
 
     def __init__(self, values, copy=False):
         super().__init__(values, copy=copy)
-        self._dtype = TextDtype()
+        self._dtype = StringDtype()
         self._validate()
 
     def _validate(self):
         """Validate that we only store NA or strings."""
         if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
             raise ValueError(
-                "TextArray requires a sequence of strings or missing values."
+                "StringArray requires a sequence of strings or missing values."
             )
         if self._ndarray.dtype != "object":
             raise ValueError(
-                "TextArray requires a sequence of strings. Got "
+                "StringArray requires a sequence of strings. Got "
                 "'{}' dtype instead.".format(self._ndarray.dtype)
             )
 
     @classmethod
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         if dtype:
-            assert dtype == "text"
+            assert dtype == "string"
         result = super()._from_sequence(scalars, dtype=object, copy=copy)
         # convert None to np.nan
         # TODO: it would be nice to do this in _validate / lib.is_string_array
@@ -195,7 +195,7 @@ def __setitem__(self, key, value):
                 value = np.nan
             elif not (isinstance(value, str) or np.isnan(value)):
                 raise ValueError(
-                    "Cannot set non-string value '{}' into a TextArray.".format(value)
+                    "Cannot set non-string value '{}' into a StringArray.".format(value)
                 )
         else:
             if not is_array_like(value):
@@ -211,14 +211,14 @@ def fillna(self, value=None, method=None, limit=None):
 
     def astype(self, dtype, copy=True):
         dtype = pandas_dtype(dtype)
-        if isinstance(dtype, TextDtype):
+        if isinstance(dtype, StringDtype):
             if copy:
                 return self.copy()
             return self
         return super().astype(dtype, copy)
 
     def _reduce(self, name, skipna=True, **kwargs):
-        raise TypeError("Cannot perform reduction '{}' with text dtype".format(name))
+        raise TypeError("Cannot perform reduction '{}' with string dtype".format(name))
 
     def value_counts(self, dropna=False):
         from pandas import value_counts
@@ -255,7 +255,7 @@ def method(self, other):
             result[valid] = op(self._ndarray[valid], other)
 
             if op.__name__ in {"add", "radd", "mul", "rmul"}:
-                return TextArray(result)
+                return StringArray(result)
             else:
                 dtype = "object" if mask.any() else "bool"
                 return np.asarray(result, dtype=dtype)
@@ -273,5 +273,5 @@ def _add_arithmetic_ops(cls):
     _create_comparison_method = _create_arithmetic_method
 
 
-TextArray._add_arithmetic_ops()
-TextArray._add_comparison_ops()
+StringArray._add_arithmetic_ops()
+StringArray._add_comparison_ops()
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 2ee4438024c32..2fb09182cc6cf 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -817,8 +817,8 @@ def _str_extract_frame(arr, pat, flags=0):
         result_index = arr.index
     except AttributeError:
         result_index = None
-    if arr.dtype.name == "text":
-        dtype = "text"
+    if arr.dtype.name == "string":
+        dtype = "string"
     else:
         dtype = object
     return DataFrame(
@@ -1027,7 +1027,7 @@ def str_extractall(arr, pat, flags=0):
     # workaround #27953
     # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
     # when the list of values is empty.
-    if arr.dtype.name == "text":
+    if arr.dtype.name == "string":
         dtype = arr.dtype
     else:
         dtype = None
@@ -1953,7 +1953,7 @@ class StringMethods(NoNewAttributesMixin):
     def __init__(self, data):
         self._inferred_dtype = self._validate(data)
         self._is_categorical = is_categorical_dtype(data)
-        self._is_string = data.dtype.name == "text"
+        self._is_string = data.dtype.name == "string"
 
         # .values.categories works for both Series/Index
         self._parent = data.values.categories if self._is_categorical else data
@@ -1984,7 +1984,7 @@ def _validate(data):
         -------
         dtype : inferred dtype of data
         """
-        from pandas import TextDtype
+        from pandas import StringDtype
 
         if isinstance(data, ABCMultiIndex):
             raise AttributeError(
@@ -1997,8 +1997,8 @@ def _validate(data):
         values = getattr(data, "values", data)  # Series / Index
         values = getattr(values, "categories", values)  # categorical / normal
 
-        # explicitly allow TextDtype
-        if isinstance(values.dtype, TextDtype):
+        # explicitly allow StringDtype
+        if isinstance(values.dtype, StringDtype):
             return "string"
 
         try:
@@ -2055,9 +2055,9 @@ def _wrap_result(
         # We can be wrapping a string / object / categorical result, in which
         # case we'll want to return the same dtype as the input.
         # Or we can be wrapping a numeric output, in which case we don't want
-        # to return a TextArray.
+        # to return a StringArray.
         if self._is_string and returns_string:
-            dtype = "text"
+            dtype = "string"
         else:
             dtype = None
 
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 056a23ab81acf..6c50159663574 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -68,7 +68,7 @@ class TestPDApi(Base):
         "Series",
         "SparseArray",
         "SparseDtype",
-        "TextDtype",
+        "StringDtype",
         "Timedelta",
         "TimedeltaIndex",
         "Timestamp",
diff --git a/pandas/tests/arrays/text/test_text.py b/pandas/tests/arrays/text/test_text.py
index 50208d655a547..40221c34116ae 100644
--- a/pandas/tests/arrays/text/test_text.py
+++ b/pandas/tests/arrays/text/test_text.py
@@ -8,13 +8,13 @@
 
 
 def test_none_to_nan():
-    a = pd.arrays.TextArray._from_sequence(["a", None, "b"])
+    a = pd.arrays.StringArray._from_sequence(["a", None, "b"])
     assert a[1] is not None
     assert np.isnan(a[1])
 
 
 def test_setitem_validates():
-    a = pd.arrays.TextArray._from_sequence(["a", "b"])
+    a = pd.arrays.StringArray._from_sequence(["a", "b"])
     with pytest.raises(ValueError, match="10"):
         a[0] = 10
 
@@ -31,12 +31,12 @@ def test_setitem_validates():
     ],
 )
 def test_string_methods(input, method):
-    a = pd.Series(input, dtype="text")
+    a = pd.Series(input, dtype="string")
     b = pd.Series(input, dtype="object")
     result = method(a.str)
     expected = method(b.str)
 
-    assert result.dtype.name == "text"
+    assert result.dtype.name == "string"
     tm.assert_series_equal(result.astype(object), expected)
 
 
@@ -44,32 +44,32 @@ def test_astype_roundtrip():
     s = pd.Series(pd.date_range("2000", periods=12))
     s[0] = None
 
-    result = s.astype("text").astype("datetime64[ns]")
+    result = s.astype("string").astype("datetime64[ns]")
     tm.assert_series_equal(result, s)
 
 
 def test_add():
-    a = pd.Series(["a", "b", "c", None, None], dtype="text")
-    b = pd.Series(["x", "y", None, "z", None], dtype="text")
+    a = pd.Series(["a", "b", "c", None, None], dtype="string")
+    b = pd.Series(["x", "y", None, "z", None], dtype="string")
 
     result = a + b
-    expected = pd.Series(["ax", "by", None, None, None], dtype="text")
+    expected = pd.Series(["ax", "by", None, None, None], dtype="string")
     tm.assert_series_equal(result, expected)
 
     result = a.add(b)
     tm.assert_series_equal(result, expected)
 
     result = a.radd(b)
-    expected = pd.Series(["xa", "yb", None, None, None], dtype="text")
+    expected = pd.Series(["xa", "yb", None, None, None], dtype="string")
     tm.assert_series_equal(result, expected)
 
     result = a.add(b, fill_value="-")
-    expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="text")
+    expected = pd.Series(["ax", "by", "c-", "-z", None], dtype="string")
     tm.assert_series_equal(result, expected)
 
 
 def test_add_2d():
-    a = pd.array(["a", "b", "c"], dtype="text")
+    a = pd.array(["a", "b", "c"], dtype="string")
     b = np.array([["a", "b", "c"]], dtype=object)
     with pytest.raises(ValueError, match="3 != 1"):
         a + b
@@ -80,22 +80,22 @@ def test_add_2d():
 
 
 def test_add_sequence():
-    a = pd.array(["a", "b", None, None], dtype="text")
+    a = pd.array(["a", "b", None, None], dtype="string")
     other = ["x", None, "y", None]
 
     result = a + other
-    expected = pd.array(["ax", None, None, None], dtype="text")
+    expected = pd.array(["ax", None, None, None], dtype="string")
     tm.assert_extension_array_equal(result, expected)
 
     result = other + a
-    expected = pd.array(["xa", None, None, None], dtype="text")
+    expected = pd.array(["xa", None, None, None], dtype="string")
     tm.assert_extension_array_equal(result, expected)
 
 
 def test_mul():
-    a = pd.array(["a", "b", None], dtype="text")
+    a = pd.array(["a", "b", None], dtype="string")
     result = a * 2
-    expected = pd.array(["aa", "bb", None], dtype="text")
+    expected = pd.array(["aa", "bb", None], dtype="string")
     tm.assert_extension_array_equal(result, expected)
 
     result = 2 * a
@@ -104,55 +104,55 @@ def test_mul():
 
 @pytest.mark.xfail(reason="GH-28527")
 def test_add_strings():
-    array = pd.array(["a", "b", "c", "d"], dtype="text")
+    array = pd.array(["a", "b", "c", "d"], dtype="string")
     df = pd.DataFrame([["t", "u", "v", "w"]])
     assert array.__add__(df) is NotImplemented
 
     result = array + df
-    expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("text")
+    expected = pd.DataFrame([["at", "bu", "cv", "dw"]]).astype("string")
     tm.assert_frame_equal(result, expected)
 
     result = df + array
-    expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("text")
+    expected = pd.DataFrame([["ta", "ub", "vc", "wd"]]).astype("string")
     tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.xfail(reason="GH-28527")
 def test_add_frame():
-    array = pd.array(["a", "b", np.nan, np.nan], dtype="text")
+    array = pd.array(["a", "b", np.nan, np.nan], dtype="string")
     df = pd.DataFrame([["x", np.nan, "y", np.nan]])
 
     assert array.__add__(df) is NotImplemented
 
     result = array + df
-    expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("text")
+    expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype("string")
     tm.assert_frame_equal(result, expected)
 
     result = df + array
-    expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("text")
+    expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype("string")
     tm.assert_frame_equal(result, expected)
 
 
 def test_constructor_raises():
     with pytest.raises(ValueError, match="sequence of strings"):
-        pd.arrays.TextArray(np.array(["a", "b"], dtype="S1"))
+        pd.arrays.StringArray(np.array(["a", "b"], dtype="S1"))
 
     with pytest.raises(ValueError, match="sequence of strings"):
-        pd.arrays.TextArray(np.array([]))
+        pd.arrays.StringArray(np.array([]))
 
 
 @pytest.mark.parametrize("skipna", [True, False])
-@pytest.mark.xfail(reason="Not implemented TextArray.sum")
+@pytest.mark.xfail(reason="Not implemented StringArray.sum")
 def test_reduce(skipna):
-    arr = pd.Series(["a", "b", "c"], dtype="text")
+    arr = pd.Series(["a", "b", "c"], dtype="string")
     result = arr.sum(skipna=skipna)
     assert result == "abc"
 
 
 @pytest.mark.parametrize("skipna", [True, False])
-@pytest.mark.xfail(reason="Not implemented TextArray.sum")
+@pytest.mark.xfail(reason="Not implemented StringArray.sum")
 def test_reduce_missing(skipna):
-    arr = pd.Series([None, "a", None, "b", "c", None], dtype="text")
+    arr = pd.Series([None, "a", None, "b", "c", None], dtype="string")
     result = arr.sum(skipna=skipna)
     if skipna:
         assert result == "abc"
diff --git a/pandas/tests/extension/test_text.py b/pandas/tests/extension/test_text.py
index dc0d0cac06489..908a0dbb67718 100644
--- a/pandas/tests/extension/test_text.py
+++ b/pandas/tests/extension/test_text.py
@@ -4,13 +4,13 @@
 import pytest
 
 import pandas as pd
-from pandas.core.arrays.text import TextArray, TextDtype
+from pandas.core.arrays.text import StringArray, StringDtype
 from pandas.tests.extension import base
 
 
 @pytest.fixture
 def dtype():
-    return TextDtype()
+    return StringDtype()
 
 
 @pytest.fixture
@@ -19,23 +19,23 @@ def data():
     while strings[0] == strings[1]:
         strings = np.random.choice(list(string.ascii_letters), size=100)
 
-    return TextArray._from_sequence(strings)
+    return StringArray._from_sequence(strings)
 
 
 @pytest.fixture
 def data_missing():
     """Length 2 array with [NA, Valid]"""
-    return TextArray._from_sequence([np.nan, "A"])
+    return StringArray._from_sequence([np.nan, "A"])
 
 
 @pytest.fixture
 def data_for_sorting():
-    return TextArray._from_sequence(["B", "C", "A"])
+    return StringArray._from_sequence(["B", "C", "A"])
 
 
 @pytest.fixture
 def data_missing_for_sorting():
-    return TextArray._from_sequence(["B", np.nan, "A"])
+    return StringArray._from_sequence(["B", np.nan, "A"])
 
 
 @pytest.fixture
@@ -45,7 +45,7 @@ def na_value():
 
 @pytest.fixture
 def data_for_grouping():
-    return TextArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"])
+    return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"])
 
 
 class TestDtype(base.BaseDtypeTests):
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index d0e18bd53b1b8..b50f1a0fd2f2a 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -141,7 +141,6 @@ def any_string_method(request):
 # subset of the full set from pandas/conftest.py
 _any_allowed_skipna_inferred_dtype = [
     ("string", ["a", np.nan, "c"]),
-    ("text", ["a", np.nan, "c"]),
     ("bytes", [b"a", np.nan, b"c"]),
     ("empty", [np.nan, np.nan, np.nan]),
     ("empty", []),
@@ -157,7 +156,6 @@ def any_allowed_skipna_inferred_dtype(request):
 
     The covered (inferred) types are:
     * 'string'
-    * 'text'
     * 'empty'
     * 'bytes'
     * 'mixed'
@@ -223,7 +221,6 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype):
 
         types_passing_constructor = [
             "string",
-            "text",
             "unicode",
             "empty",
             "bytes",
@@ -286,7 +283,7 @@ def test_api_per_method(
         mixed_allowed = method_name not in ["cat"]
 
         allowed_types = (
-            ["string", "unicode", "empty", "text"]
+            ["string", "unicode", "empty"]
             + ["bytes"] * bytes_allowed
             + ["mixed", "mixed-integer"] * mixed_allowed
         )
@@ -3279,7 +3276,7 @@ def test_casefold(self):
 def test_string_array(any_string_method):
     data = ["a", "bb", np.nan, "ccc"]
     a = Series(data, dtype=object)
-    b = Series(data, dtype="text")
+    b = Series(data, dtype="string")
     method_name, args, kwargs = any_string_method
 
     expected = getattr(a.str, method_name)(*args, **kwargs)
@@ -3289,10 +3286,10 @@ def test_string_array(any_string_method):
         if expected.dtype == "object" and lib.is_string_array(
             expected.values, skipna=True
         ):
-            assert result.dtype == "text"
+            assert result.dtype == "string"
             result = result.astype(object)
     elif isinstance(expected, DataFrame):
         columns = expected.select_dtypes(include="object").columns
-        assert all(result[columns].dtypes == "text")
+        assert all(result[columns].dtypes == "string")
         result[columns] = result[columns].astype(object)
     tm.assert_equal(result, expected)

From 9419af241776ba98fe57e76d98b4c49ab447bcd0 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 1 Oct 2019 07:16:00 -0500
Subject: [PATCH 44/49] rename

---
 pandas/core/api.py                                      | 2 +-
 pandas/core/arrays/__init__.py                          | 2 +-
 pandas/core/arrays/{text.py => string_.py}              | 0
 pandas/tests/arrays/{text => string_}/__init__.py       | 0
 pandas/tests/arrays/{text => string_}/test_text.py      | 0
 pandas/tests/extension/{test_text.py => test_string.py} | 2 +-
 6 files changed, 3 insertions(+), 3 deletions(-)
 rename pandas/core/arrays/{text.py => string_.py} (100%)
 rename pandas/tests/arrays/{text => string_}/__init__.py (100%)
 rename pandas/tests/arrays/{text => string_}/test_text.py (100%)
 rename pandas/tests/extension/{test_text.py => test_string.py} (97%)

diff --git a/pandas/core/api.py b/pandas/core/api.py
index 9d4be9c075122..04f2f84c92a15 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -23,7 +23,7 @@
     UInt32Dtype,
     UInt64Dtype,
 )
-from pandas.core.arrays.text import StringDtype
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import array
 from pandas.core.groupby import Grouper, NamedAgg
 from pandas.core.index import (
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
index 384bbc2318c82..868118bac6a7b 100644
--- a/pandas/core/arrays/__init__.py
+++ b/pandas/core/arrays/__init__.py
@@ -10,5 +10,5 @@
 from .numpy_ import PandasArray, PandasDtype  # noqa: F401
 from .period import PeriodArray, period_array  # noqa: F401
 from .sparse import SparseArray  # noqa: F401
-from .text import StringArray  # noqa: F401
+from .string_ import StringArray  # noqa: F401
 from .timedeltas import TimedeltaArray  # noqa: F401
diff --git a/pandas/core/arrays/text.py b/pandas/core/arrays/string_.py
similarity index 100%
rename from pandas/core/arrays/text.py
rename to pandas/core/arrays/string_.py
diff --git a/pandas/tests/arrays/text/__init__.py b/pandas/tests/arrays/string_/__init__.py
similarity index 100%
rename from pandas/tests/arrays/text/__init__.py
rename to pandas/tests/arrays/string_/__init__.py
diff --git a/pandas/tests/arrays/text/test_text.py b/pandas/tests/arrays/string_/test_text.py
similarity index 100%
rename from pandas/tests/arrays/text/test_text.py
rename to pandas/tests/arrays/string_/test_text.py
diff --git a/pandas/tests/extension/test_text.py b/pandas/tests/extension/test_string.py
similarity index 97%
rename from pandas/tests/extension/test_text.py
rename to pandas/tests/extension/test_string.py
index 908a0dbb67718..5b872d5b72227 100644
--- a/pandas/tests/extension/test_text.py
+++ b/pandas/tests/extension/test_string.py
@@ -4,7 +4,7 @@
 import pytest
 
 import pandas as pd
-from pandas.core.arrays.text import StringArray, StringDtype
+from pandas.core.arrays.string_ import StringArray, StringDtype
 from pandas.tests.extension import base
 
 

From 462b29d426d88db4bc87b7207ef080a8334250bf Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 1 Oct 2019 07:19:16 -0500
Subject: [PATCH 45/49] doc updates

---
 doc/source/user_guide/text.rst | 60 +++++++++++++++++-----------------
 doc/source/whatsnew/v1.0.0.rst |  4 +--
 2 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index c8c8a6d57eb7f..789ff2a65355b 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -29,7 +29,7 @@ for many reasons:
    There isn't a clear way to select *just* text while excluding non-text
    but still object-dtype columns.
 3. When reading code, the contents of an ``object`` dtype array is less clear
-   than ``text``.
+   than ``'string'``.
 
 Currently, the performance of ``object`` dtype arrays of strings and
 :class:`arrays.StringArray` are about the same. We expect future enhancements
@@ -48,11 +48,11 @@ infer a list of strings to
 
    pd.Series(['a', 'b', 'c'])
 
-To explicitly request ``text`` dtype, specify the ``dtype``
+To explicitly request ``string`` dtype, specify the ``dtype``
 
 .. ipython:: python
 
-   pd.Series(['a', 'b', 'c'], dtype="text")
+   pd.Series(['a', 'b', 'c'], dtype="string")
    pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype())
 
 Or ``astype`` after the ``Series`` or ``DataFrame`` is created
@@ -61,10 +61,10 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
 
    s = pd.Series(['a', 'b', 'c'])
    s
-   s.astype("text")
+   s.astype("string")
 
 Everything that follows in the rest of this document applies equally to
-``text`` and ``object`` dtype.
+``string`` and ``object`` dtype.
 
 .. _text.string_methods:
 
@@ -80,7 +80,7 @@ the equivalent (scalar) built-in string methods:
 .. ipython:: python
 
    s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
-                 dtype="text")
+                 dtype="string")
    s.str.lower()
    s.str.upper()
    s.str.len()
@@ -154,7 +154,7 @@ Methods like ``split`` return a Series of lists:
 
 .. ipython:: python
 
-   s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="text")
+   s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string")
    s2.str.split('_')
 
 Elements in the split lists can be accessed using ``get`` or ``[]`` notation:
@@ -193,7 +193,7 @@ i.e., from the end of the string to the beginning of the string:
 
    s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca',
                    '', np.nan, 'CABA', 'dog', 'cat'],
-                  dtype="text")
+                  dtype="string")
    s3
    s3.str.replace('^.a|dog', 'XX-XX ', case=False)
 
@@ -204,7 +204,7 @@ following code will cause trouble because of the regular expression meaning of
 .. ipython:: python
 
    # Consider the following badly formatted financial data
-   dollars = pd.Series(['12', '-$10', '$10,000'], dtype="text")
+   dollars = pd.Series(['12', '-$10', '$10,000'], dtype="string")
 
    # This does what you'd naively expect:
    dollars.str.replace('$', '')
@@ -243,7 +243,7 @@ positional argument (a regex object) and return a string.
        return m.group(0)[::-1]
 
    pd.Series(['foo 123', 'bar baz', np.nan],
-             dtype="text").str.replace(pat, repl)
+             dtype="string").str.replace(pat, repl)
 
    # Using regex groups
    pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
@@ -252,7 +252,7 @@ positional argument (a regex object) and return a string.
        return m.group('two').swapcase()
 
    pd.Series(['Foo Bar Baz', np.nan],
-             dtype="text").str.replace(pat, repl)
+             dtype="string").str.replace(pat, repl)
 
 .. versionadded:: 0.20.0
 
@@ -291,7 +291,7 @@ The content of a ``Series`` (or ``Index``) can be concatenated:
 
 .. ipython:: python
 
-    s = pd.Series(['a', 'b', 'c', 'd'], dtype="text")
+    s = pd.Series(['a', 'b', 'c', 'd'], dtype="string")
     s.str.cat(sep=',')
 
 If not specified, the keyword ``sep`` for the separator defaults to the empty string, ``sep=''``:
@@ -304,7 +304,7 @@ By default, missing values are ignored. Using ``na_rep``, they can be given a re
 
 .. ipython:: python
 
-    t = pd.Series(['a', 'b', np.nan, 'd'], dtype="text")
+    t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string")
     t.str.cat(sep=',')
     t.str.cat(sep=',', na_rep='-')
 
@@ -350,7 +350,7 @@ the ``join``-keyword.
    :okwarning:
 
    u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2],
-                 dtype="text")
+                 dtype="string")
    s
    u
    s.str.cat(u)
@@ -367,7 +367,7 @@ In particular, alignment also means that the different lengths do not need to co
 .. ipython:: python
 
     v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4],
-                  dtype="text")
+                  dtype="string")
     s
     v
     s.str.cat(v, join='left', na_rep='-')
@@ -424,7 +424,7 @@ of the string, the result will be a ``NaN``.
 
    s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan,
                   'CABA', 'dog', 'cat'],
-                 dtype="text")
+                 dtype="string")
 
    s.str[0]
    s.str[1]
@@ -456,7 +456,7 @@ DataFrame with one column per group.
 .. ipython:: python
 
    pd.Series(['a1', 'b2', 'c3'],
-             dtype="text").str.extract(r'([ab])(\d)', expand=False)
+             dtype="string").str.extract(r'([ab])(\d)', expand=False)
 
 Elements that do not match return a row filled with ``NaN``. Thus, a
 Series of messy strings can be "converted" into a like-indexed Series
@@ -470,15 +470,15 @@ Named groups like
 .. ipython:: python
 
    pd.Series(['a1', 'b2', 'c3'],
-             dtype="text").str.extract(r'(?P<letter>[ab])(?P<digit>\d)',
-                                       expand=False)
+             dtype="string").str.extract(r'(?P<letter>[ab])(?P<digit>\d)',
+                                         expand=False)
 
 and optional groups like
 
 .. ipython:: python
 
    pd.Series(['a1', 'b2', '3'],
-             dtype="text").str.extract(r'([ab])?(\d)', expand=False)
+             dtype="string").str.extract(r'([ab])?(\d)', expand=False)
 
 can also be used. Note that any capture group names in the regular
 expression will be used for column names; otherwise capture group
@@ -490,14 +490,14 @@ with one column if ``expand=True``.
 .. ipython:: python
 
    pd.Series(['a1', 'b2', 'c3'],
-             dtype="text").str.extract(r'[ab](\d)', expand=True)
+             dtype="string").str.extract(r'[ab](\d)', expand=True)
 
 It returns a Series if ``expand=False``.
 
 .. ipython:: python
 
    pd.Series(['a1', 'b2', 'c3'],
-             dtype="text").str.extract(r'[ab](\d)', expand=False)
+             dtype="string").str.extract(r'[ab](\d)', expand=False)
 
 Calling on an ``Index`` with a regex with exactly one capture group
 returns a ``DataFrame`` with one column if ``expand=True``.
@@ -505,7 +505,7 @@ returns a ``DataFrame`` with one column if ``expand=True``.
 .. ipython:: python
 
    s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"],
-                 dtype="text")
+                 dtype="string")
    s
    s.index.str.extract("(?P<letter>[a-zA-Z])", expand=True)
 
@@ -551,7 +551,7 @@ Unlike ``extract`` (which returns only the first match),
 .. ipython:: python
 
    s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"],
-                 dtype="text")
+                 dtype="string")
    s
    two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])'
    s.str.extract(two_groups, expand=True)
@@ -569,7 +569,7 @@ When each subject string in the Series has exactly one match,
 
 .. ipython:: python
 
-   s = pd.Series(['a3', 'b3', 'c2'], dtype="text")
+   s = pd.Series(['a3', 'b3', 'c2'], dtype="string")
    s
 
 then ``extractall(pat).xs(0, level='match')`` gives the same result as
@@ -590,7 +590,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0).
 
    pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups)
 
-   pd.Series(["a1a2", "b1", "c1"], dtype="text").str.extractall(two_groups)
+   pd.Series(["a1a2", "b1", "c1"], dtype="string").str.extractall(two_groups)
 
 
 Testing for Strings that match or contain a pattern
@@ -602,14 +602,14 @@ You can check whether elements contain a pattern:
 
    pattern = r'[0-9][a-z]'
    pd.Series(['1', '2', '3a', '3b', '03c'],
-             dtype="text").str.contains(pattern)
+             dtype="string").str.contains(pattern)
 
 Or whether elements match a pattern:
 
 .. ipython:: python
 
    pd.Series(['1', '2', '3a', '3b', '03c'],
-             dtype="text").str.match(pattern)
+             dtype="string").str.match(pattern)
 
 The distinction between ``match`` and ``contains`` is strictness: ``match``
 relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
@@ -620,7 +620,7 @@ an extra ``na`` argument so missing values can be considered True or False:
 .. ipython:: python
 
    s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
-                  dtype="text")
+                  dtype="string")
    s4.str.contains('A', na=False)
 
 .. _text.indicator:
@@ -633,7 +633,7 @@ For example if they are separated by a ``'|'``:
 
 .. ipython:: python
 
-    s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="text")
+    s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="string")
     s.str.get_dummies(sep='|')
 
 String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``.
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 22f9617d6dcc0..73acc4207d35a 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -52,8 +52,8 @@ Enhancements
 
 .. _whatsnew_100.string:
 
-Dedicated text data type
-^^^^^^^^^^^^^^^^^^^^^^^^
+Dedicated string data type
+^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 We've added :class:`StringDtype`, an extension type dedicated to string data.
 Previously, strings were typically stored in object-dtype NumPy arrays.

From 0391563156a131959fcaea7b99d04bfd06c4d18e Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 1 Oct 2019 09:59:59 -0500
Subject: [PATCH 46/49] fixups

---
 ci/code_checks.sh                     | 4 ++--
 doc/source/getting_started/basics.rst | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 3fc95efad1905..9f420857319ad 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -262,8 +262,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
         -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range"
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
-    MSG='Doctests arrays/text.py' ; echo $MSG
-    pytest -q --doctest-modules pandas/core/arrays/text.py
+    MSG='Doctests arrays/string_.py' ; echo $MSG
+    pytest -q --doctest-modules pandas/core/arrays/string_.py
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
 fi
diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index 2818011eb02ca..36a7166f350e5 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -1705,7 +1705,7 @@ built-in string methods. For example:
  .. ipython:: python
 
   s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
-                dtype="text")
+                dtype="string")
   s.str.lower()
 
 Powerful pattern-matching methods are provided as well, but note that

From 6aebd8c86d5f5a845dbaf03278b8c8bf93acf3f4 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 3 Oct 2019 19:49:25 -0500
Subject: [PATCH 47/49] move and perf

---
 pandas/core/arrays/string_.py                               | 6 +++++-
 .../tests/arrays/string_/{test_text.py => test_string.py}   | 0
 2 files changed, 5 insertions(+), 1 deletion(-)
 rename pandas/tests/arrays/string_/{test_text.py => test_string.py} (100%)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 41c7d81bd55da..87649ac651127 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -147,9 +147,13 @@ class StringArray(PandasArray):
     _typ = "extension"
 
     def __init__(self, values, copy=False):
+        values = extract_array(values)
+        skip_validation = isinstance(values, type(self))
+
         super().__init__(values, copy=copy)
         self._dtype = StringDtype()
-        self._validate()
+        if not skip_validation:
+            self._validate()
 
     def _validate(self):
         """Validate that we only store NA or strings."""
diff --git a/pandas/tests/arrays/string_/test_text.py b/pandas/tests/arrays/string_/test_string.py
similarity index 100%
rename from pandas/tests/arrays/string_/test_text.py
rename to pandas/tests/arrays/string_/test_string.py

From 2ee5e300828a1abcee1ed333f9342d2a8889679e Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 3 Oct 2019 19:59:40 -0500
Subject: [PATCH 48/49] test is_string_dtype

---
 pandas/tests/dtypes/test_common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 266f7ac50c663..466b724f98770 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -291,6 +291,8 @@ def test_is_string_dtype():
     assert com.is_string_dtype(str)
     assert com.is_string_dtype(object)
     assert com.is_string_dtype(np.array(["a", "b"]))
+    assert com.is_string_dtype(pd.StringDtype())
+    assert com.is_string_dtype(pd.array(["a", "b"], dtype="string"))
 
 
 def test_is_period_arraylike():

From 7e92cded0ca1e3747d02b417d27d4d7d039fe1b4 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 4 Oct 2019 09:03:03 -0500
Subject: [PATCH 49/49] helper

---
 pandas/core/strings.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 2fb09182cc6cf..888d2ae6f9473 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -763,6 +763,16 @@ def f(x):
     return f
 
 
+def _result_dtype(arr):
+    # Pick the result dtype: "string" if `arr` has the "string" dtype, else
+    # object. Workaround for #27953: ideally we would pass `dtype=arr.dtype`
+    # unconditionally, but this fails when the list of values is empty.
+    if arr.dtype.name == "string":
+        return "string"
+    else:
+        return object
+
+
 def _str_extract_noexpand(arr, pat, flags=0):
     """
     Find groups in each string in the Series using passed regular
@@ -817,10 +827,7 @@ def _str_extract_frame(arr, pat, flags=0):
         result_index = arr.index
     except AttributeError:
         result_index = None
-    if arr.dtype.name == "string":
-        dtype = "string"
-    else:
-        dtype = object
+    dtype = _result_dtype(arr)
     return DataFrame(
         [groups_or_na(val) for val in arr],
         columns=columns,
@@ -1023,14 +1030,7 @@ def str_extractall(arr, pat, flags=0):
     from pandas import MultiIndex
 
     index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
-
-    # workaround #27953
-    # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
-    # when the list of values is empty.
-    if arr.dtype.name == "string":
-        dtype = arr.dtype
-    else:
-        dtype = None
+    dtype = _result_dtype(arr)
 
     result = arr._constructor_expanddim(
         match_list, index=index, columns=columns, dtype=dtype