MarcoGorelli
diff --git a/‎asv_bench/benchmarks/categoricals.py
+14 b/‎asv_bench/benchmarks/categoricals.py
+14
diff --git a/‎ci/code_checks.sh
+11-3 b/‎ci/code_checks.sh
+11-3
diff --git a/‎ci/deps/azure-36-32bit.yaml
+2-1 b/‎ci/deps/azure-36-32bit.yaml
+2-1
diff --git a/‎ci/deps/travis-36-cov.yaml
+1-1 b/‎ci/deps/travis-36-cov.yaml
+1-1
diff --git a/‎ci/deps/travis-36-slow.yaml
+1-1 b/‎ci/deps/travis-36-slow.yaml
+1-1
diff --git a/‎ci/deps/travis-37.yaml
+1-1 b/‎ci/deps/travis-37.yaml
+1-1
diff --git a/‎ci/run_tests.sh
+5-6 b/‎ci/run_tests.sh
+5-6
diff --git a/‎doc/.gitignore
+4 b/‎doc/.gitignore
+4
diff --git a/‎doc/redirects.csv
+4 b/‎doc/redirects.csv
+4
diff --git a/‎doc/source/conf.py
+2-2 b/‎doc/source/conf.py
+2-2
diff --git a/‎doc/source/development/contributing.rst
+1-1 b/‎doc/source/development/contributing.rst
+1-1
diff --git a/‎doc/source/ecosystem.rst
+2 b/‎doc/source/ecosystem.rst
+2
diff --git a/‎doc/source/getting_started/basics.rst
+16-3 b/‎doc/source/getting_started/basics.rst
+16-3
diff --git a/‎doc/source/getting_started/index.rst
+1 b/‎doc/source/getting_started/index.rst
+1
diff --git a/‎doc/source/install.rst renamed to ‎doc/source/getting_started/install.rst b/‎doc/source/install.rst renamed to ‎doc/source/getting_started/install.rst
diff --git a/‎doc/source/index.rst.template
+2-3 b/‎doc/source/index.rst.template
+2-3
diff --git a/‎doc/source/reference/arrays.rst
+25-1 b/‎doc/source/reference/arrays.rst
+25-1
diff --git a/‎doc/source/reference/indexing.rst
-1 b/‎doc/source/reference/indexing.rst
-1
diff --git a/‎doc/source/user_guide/index.rst
+1 b/‎doc/source/user_guide/index.rst
+1
diff --git a/‎doc/source/user_guide/io.rst
+5-2 b/‎doc/source/user_guide/io.rst
+5-2
@@ -282,4 +282,18 @@ def time_sort_values(self):
         self.index.sort_values(ascending=False)
 
 
+class SearchSorted:
+    def setup(self):
+        N = 10 ** 5
+        self.ci = tm.makeCategoricalIndex(N).sort_values()
+        self.c = self.ci.values
+        self.key = self.ci.categories[1]
+
+    def time_categorical_index_contains(self):
+        self.ci.searchsorted(self.key)
+
+    def time_categorical_contains(self):
+        self.c.searchsorted(self.key)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
@@ -125,6 +125,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
     # invgrep -R --include="*.py*" -E "from numpy import nan " pandas  # GH#24822 not yet implemented since the offending imports have not all been removed
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
+    MSG='Check for use of exec' ; echo $MSG
+    invgrep -R --include="*.py*" -E "[^a-zA-Z0-9_]exec\(" pandas
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     MSG='Check for pytest warns' ; echo $MSG
     invgrep -r -E --include '*.py' 'pytest\.warns' pandas/tests/
     RET=$(($RET + $?)) ; echo $MSG "DONE"
@@ -184,7 +188,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
     invgrep -R --include="*.rst" ".. ipython ::" doc/source
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
-    MSG='Check that no file in the repo contains tailing whitespaces' ; echo $MSG
+    MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG
     set -o pipefail
     if [[ "$AZURE" == "true" ]]; then
         # we exclude all c/cpp files as the c/cpp files of pandas code base are tested when Linting .c and .h files
@@ -262,13 +266,17 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
         -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range"
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
+    MSG='Doctests arrays/string_.py' ; echo $MSG
+    pytest -q --doctest-modules pandas/core/arrays/string_.py
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
 fi
 
 ### DOCSTRINGS ###
 if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
 
-    MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG
-    $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05
+    MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA01, SA02, SA03, SA05)' ; echo $MSG
+    $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA01,SA02,SA03,SA05
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
 fi
 
@@ -3,6 +3,7 @@ channels:
   - defaults
   - conda-forge
 dependencies:
+  - attrs=19.1.0
   - gcc_linux-32
   - gcc_linux-32
   - gxx_linux-32
@@ -11,7 +12,7 @@ dependencies:
   - python=3.6.*
   - pytz=2017.2
   # universal
-  - pytest>=4.0.2,<5.0.0
+  - pytest
   - pytest-xdist
   - pytest-mock
   - pytest-azurepipelines
 
@@ -29,7 +29,7 @@ dependencies:
   - python-snappy
   - python=3.6.*
   - pytz
-  - s3fs
+  - s3fs<0.3
   - scikit-learn
   - scipy
   - sqlalchemy
 
@@ -18,7 +18,7 @@ dependencies:
   - python-dateutil
   - python=3.6.*
   - pytz
-  - s3fs
+  - s3fs<0.3
   - scipy
   - sqlalchemy
   - xlrd
 
@@ -17,7 +17,7 @@ dependencies:
   - pytest-xdist>=1.29.0
   - pytest-mock
   - hypothesis>=3.58.0
-  - s3fs
+  - s3fs<0.3
   - pip
   - pyreadstat
   - pip:
 
@@ -43,10 +43,9 @@ do
     # if no tests are found (the case of "single and slow"), pytest exits with code 5, and would make the script fail, if not for the below code
     sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret"
 
-    # 2019-08-21 disabling because this is hitting HTTP 400 errors GH#27602
-    # if [[ "$COVERAGE" && $? == 0 && "$TRAVIS_BRANCH" == "master" ]]; then
-    #    echo "uploading coverage for $TYPE tests"
-    #    echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
-    #          bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
-    # fi
+    if [[ "$COVERAGE" && $? == 0 && "$TRAVIS_BRANCH" == "master" ]]; then
+        echo "uploading coverage for $TYPE tests"
+        echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
+              bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
+    fi
 done
@@ -0,0 +1,4 @@
+data/
+timeseries.csv
+timeseries.parquet
+timeseries_wide.parquet
@@ -6,6 +6,7 @@ whatsnew,whatsnew/index
 release,whatsnew/index
 
 # getting started
+install,getting_started/install
 10min,getting_started/10min
 basics,getting_started/basics
 comparison_with_r,getting_started/comparison/comparison_with_r
@@ -1577,3 +1578,6 @@ generated/pandas.unique,../reference/api/pandas.unique
 generated/pandas.util.hash_array,../reference/api/pandas.util.hash_array
 generated/pandas.util.hash_pandas_object,../reference/api/pandas.util.hash_pandas_object
 generated/pandas.wide_to_long,../reference/api/pandas.wide_to_long
+
+# Cached searches
+reference/api/pandas.DataFrame.from_csv,pandas.read_csv
@@ -191,7 +191,7 @@
 
 # The theme to use for HTML and HTML Help pages.  Major themes that come with
 # Sphinx are currently 'default' and 'sphinxdoc'.
-html_theme = "nature_with_gtoc"
+html_theme = "pandas_sphinx_theme"
 
 # The style sheet to use for HTML and HTML Help pages. A file of that name
 # must exist either in Sphinx' static/ path, or in one of the custom paths
@@ -204,7 +204,7 @@
 # html_theme_options = {}
 
 # Add any paths that contain custom themes here, relative to this directory.
-html_theme_path = ["themes"]
+# html_theme_path = ["themes"]
 
 # The name for this set of Sphinx documents.  If None, it defaults to
 # "<project> v<release> documentation".
 
@@ -952,7 +952,7 @@ the expected correct result::
 Transitioning to ``pytest``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-*pandas* existing test structure is *mostly* classed based, meaning that you will typically find tests wrapped in a class.
+*pandas* existing test structure is *mostly* class-based, meaning that you will typically find tests wrapped in a class.
 
 .. code-block:: python
 
 
@@ -1,3 +1,5 @@
+:orphan:
+
 .. _ecosystem:
 
 {{ header }}
 
@@ -986,7 +986,7 @@ not noted for a particular column will be ``NaN``:
 
    tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'})
 
-.. _basics.aggregation.mixed_dtypes:
+.. _basics.aggregation.mixed_string:
 
 Mixed dtypes
 ++++++++++++
@@ -1704,14 +1704,21 @@ built-in string methods. For example:
 
  .. ipython:: python
 
-  s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
+  s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
+                dtype="string")
   s.str.lower()
 
 Powerful pattern-matching methods are provided as well, but note that
 pattern-matching generally uses `regular expressions
 <https://docs.python.org/3/library/re.html>`__ by default (and in some cases
 always uses them).
 
+.. note::
+
+   Prior to pandas 1.0, string methods were only available on ``object``-dtype
+   ``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated
+   to strings. See :ref:`text.types` for more.
+
 Please see :ref:`Vectorized String Methods <text.string_methods>` for a complete
 description.
 
@@ -1925,9 +1932,15 @@ period (time spans) :class:`PeriodDtype`      :class:`Period`    :class:`arrays.
 sparse              :class:`SparseDtype`      (none)             :class:`arrays.SparseArray`   :ref:`sparse`
 intervals           :class:`IntervalDtype`    :class:`Interval`  :class:`arrays.IntervalArray` :ref:`advanced.intervalindex`
 nullable integer    :class:`Int64Dtype`, ...  (none)             :class:`arrays.IntegerArray`  :ref:`integer_na`
+Strings             :class:`StringDtype`      :class:`str`       :class:`arrays.StringArray`   :ref:`text`
 =================== ========================= ================== ============================= =============================
 
-Pandas uses the ``object`` dtype for storing strings.
+Pandas has two ways to store strings.
+
+1. ``object`` dtype, which can hold any Python object, including strings.
+2. :class:`StringDtype`, which is dedicated to strings.
+
+Generally, we recommend using :class:`StringDtype`. See :ref:`text.types` for more.
 
 Finally, arbitrary objects may be stored using the ``object`` dtype, but should
 be avoided to the extent possible (for performance and interoperability with
 
@@ -12,6 +12,7 @@ Getting started
 .. toctree::
     :maxdepth: 2
 
+    install
     overview
     10min
     basics
 
@@ -40,10 +40,8 @@ See the :ref:`overview` for more detail about what's in the library.
 {% endif %}
 {% if not single_doc %}
     What's New in 1.0.0 <whatsnew/v1.0.0>
-    install
     getting_started/index
     user_guide/index
-    ecosystem
     {% endif -%}
     {% if include_api -%}
     reference/index
@@ -54,9 +52,9 @@ See the :ref:`overview` for more detail about what's in the library.
 {% endif %}
 
 * :doc:`whatsnew/v1.0.0`
-* :doc:`install`
 * :doc:`getting_started/index`
 
+  * :doc:`getting_started/install`
   * :doc:`getting_started/overview`
   * :doc:`getting_started/10min`
   * :doc:`getting_started/basics`
@@ -83,6 +81,7 @@ See the :ref:`overview` for more detail about what's in the library.
   * :doc:`user_guide/style`
   * :doc:`user_guide/options`
   * :doc:`user_guide/enhancingperf`
+  * :doc:`user_guide/scale`
   * :doc:`user_guide/sparse`
   * :doc:`user_guide/gotchas`
   * :doc:`user_guide/cookbook`
 
@@ -24,6 +24,7 @@ Intervals           :class:`IntervalDtype`    :class:`Interval`  :ref:`api.array
 Nullable Integer    :class:`Int64Dtype`, ...  (none)             :ref:`api.arrays.integer_na`
 Categorical         :class:`CategoricalDtype` (none)             :ref:`api.arrays.categorical`
 Sparse              :class:`SparseDtype`      (none)             :ref:`api.arrays.sparse`
+Strings             :class:`StringDtype`      :class:`str`       :ref:`api.arrays.string`
 =================== ========================= ================== =============================
 
 Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`).
@@ -460,6 +461,29 @@ and methods if the :class:`Series` contains sparse values. See
 :ref:`api.series.sparse` for more.
 
 
+.. _api.arrays.string:
+
+Text data
+---------
+
+When working with text data, where each valid element is a string or missing,
+we recommend using :class:`StringDtype` (with the alias ``"string"``).
+
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/class_without_autosummary.rst
+
+   arrays.StringArray
+
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/class_without_autosummary.rst
+
+   StringDtype
+
+The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arrays.StringArray`.
+See :ref:`api.series.str` for more.
+
 
 .. Dtype attributes which are manually listed in their docstrings: including
 .. it here to make sure a docstring page is built for them
@@ -471,4 +495,4 @@ and methods if the :class:`Series` contains sparse values. See
       DatetimeTZDtype.unit
       DatetimeTZDtype.tz
       PeriodDtype.freq
-      IntervalDtype.subtype
+      IntervalDtype.subtype
@@ -166,7 +166,6 @@ Selecting
    Index.get_slice_bound
    Index.get_value
    Index.get_values
-   Index.set_value
    Index.isin
    Index.slice_indexer
    Index.slice_locs
 
@@ -38,6 +38,7 @@ Further information on any specific method can be obtained in the
     style
     options
     enhancingperf
+    scale
     sparse
     gotchas
     cookbook
@@ -4710,7 +4710,8 @@ Several caveats.
   indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can
   force including or omitting indexes with the ``index`` argument, regardless of the underlying engine.
 * Index level names, if specified, must be strings.
-* Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
+* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype.
+* The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
 * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
   on an attempt at serialization.
 
@@ -4734,7 +4735,9 @@ See the documentation for `pyarrow <https://arrow.apache.org/docs/python/>`__ an
                       'd': np.arange(4.0, 7.0, dtype='float64'),
                       'e': [True, False, True],
                       'f': pd.date_range('20130101', periods=3),
-                      'g': pd.date_range('20130101', periods=3, tz='US/Eastern')})
+                      'g': pd.date_range('20130101', periods=3, tz='US/Eastern'),
+                      'h': pd.Categorical(list('abc')),
+                      'i': pd.Categorical(list('abc'), ordered=True)})
 
    df
    df.dtypes
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+:orphan:`
	`2`	`+`
`1`	`3`	`.. _ecosystem:`
`2`	`4`
`3`	`5`	`{{ header }}`