From a20f41cbbc272d940353b6cb6c60ae3bace2de88 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 11 Feb 2020 15:08:06 +0100
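Subject: [PATCH 1/5] BUG: fix infer_dtype for StringDtype

Map the 'string' key of _TYPE_MAP in pandas/_libs/lib.pyx to the inferred
type 'string' instead of 'bytes', and add tests for infer_dtype and for
DataFrame.convert_dtypes on columns that already use the "string" dtype
(GH 31731).
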
---
 doc/source/whatsnew/v1.0.2.rst                     |  5 +++++
 pandas/_libs/lib.pyx                               |  2 +-
 pandas/tests/dtypes/test_inference.py              | 14 ++++++++++++++
 pandas/tests/series/methods/test_convert_dtypes.py |  7 +++++++
 4 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst
index f4bb8c580fb08..9919a8a3b19a2 100644
--- a/doc/source/whatsnew/v1.0.2.rst
+++ b/doc/source/whatsnew/v1.0.2.rst
@@ -32,6 +32,11 @@ Bug fixes
 
 - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
 
+
+**Experimental dtypes**
+
+- Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the `"string"` dtype (:issue:`31731`).
+
 .. ---------------------------------------------------------------------------
 
 .. _whatsnew_102.contributors:
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 9702eb4615909..d2f0b2ffbaeec 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1005,7 +1005,7 @@ _TYPE_MAP = {
     'complex64': 'complex',
     'complex128': 'complex',
     'c': 'complex',
-    'string': 'bytes',
+    'string': 'string',
     'S': 'bytes',
     'U': 'string',
     'bool': 'boolean',
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 48f9262ad3486..137b6253f462c 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1200,6 +1200,20 @@ def test_interval(self):
         inferred = lib.infer_dtype(pd.Series(idx), skipna=False)
         assert inferred == "interval"
 
+    def test_string_dtype(self):
+        # StringArray
+        arr = pd.array(["a", "b", pd.NA], dtype="string")
+        for val in [list(arr), arr, pd.Series(arr)]:
+            inferred = lib.infer_dtype(val)
+            assert inferred == "string"
+
+    def test_boolean_dtype(self):
+        # BooleanArray
+        arr = pd.array([True, False, pd.NA], dtype="boolean")
+        for val in [list(arr), arr, pd.Series(arr)]:
+            inferred = lib.infer_dtype(val)
+            assert inferred == "boolean"
+
 
 class TestNumberScalar:
     def test_is_number(self):
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
index 923b5a94c5f41..83d10a603d3d7 100644
--- a/pandas/tests/series/methods/test_convert_dtypes.py
+++ b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -246,3 +246,10 @@ def test_convert_dtypes(self, data, maindtype, params, answerdict):
 
         # Make sure original not changed
         tm.assert_series_equal(series, copy)
+
+    def test_convert_string_dtype(self):
+        # https://github.com/pandas-dev/pandas/issues/31731 -> converting columns
+        # that are already string dtype
+        df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["ä", "ö", "ü"]}, dtype="string")
+        result = df.convert_dtypes()
+        tm.assert_frame_equal(df, result)

From fcaddd3a8dd957154d80f50a945835e97ef2b58b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 11 Feb 2020 15:10:57 +0100
Subject: [PATCH 2/5] fix quoting of "string" in whatsnew entry (use double backticks)

---
 doc/source/whatsnew/v1.0.2.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst
index 9919a8a3b19a2..5c402ed432715 100644
--- a/doc/source/whatsnew/v1.0.2.rst
+++ b/doc/source/whatsnew/v1.0.2.rst
@@ -35,7 +35,7 @@ Bug fixes
 
 **Experimental dtypes**
 
-- Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the `"string"` dtype (:issue:`31731`).
+- Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`).
 
 .. ---------------------------------------------------------------------------
 

From f04932c8dcea4a5ff4fbf47f3d6741fc3d53c182 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 12 Feb 2020 14:13:22 +0100
Subject: [PATCH 3/5] add list cases containing pd.NA to any_skipna_inferred_dtype fixture

---
 pandas/conftest.py                    | 2 ++
 pandas/tests/dtypes/test_inference.py | 4 ++--
 pandas/tests/test_strings.py          | 4 ++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 7463b2b579c0c..821bec19d6115 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -744,6 +744,7 @@ def any_numpy_dtype(request):
 # categoricals are handled separately
 _any_skipna_inferred_dtype = [
     ("string", ["a", np.nan, "c"]),
+    ("string", ["a", pd.NA, "c"]),
     ("bytes", [b"a", np.nan, b"c"]),
     ("empty", [np.nan, np.nan, np.nan]),
     ("empty", []),
@@ -754,6 +755,7 @@ def any_numpy_dtype(request):
     ("mixed-integer-float", [1, np.nan, 2.0]),
     ("decimal", [Decimal(1), np.nan, Decimal(2)]),
     ("boolean", [True, np.nan, False]),
+    ("boolean", [True, pd.NA, False]),
     ("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]),
     ("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]),
     ("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 137b6253f462c..7c1837985472e 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1203,14 +1203,14 @@ def test_interval(self):
     def test_string_dtype(self):
         # StringArray
         arr = pd.array(["a", "b", pd.NA], dtype="string")
-        for val in [list(arr), arr, pd.Series(arr)]:
+        for val in [arr, pd.Series(arr)]:
             inferred = lib.infer_dtype(val)
             assert inferred == "string"
 
     def test_boolean_dtype(self):
         # BooleanArray
         arr = pd.array([True, False, pd.NA], dtype="boolean")
-        for val in [list(arr), arr, pd.Series(arr)]:
+        for val in [arr, pd.Series(arr)]:
             inferred = lib.infer_dtype(val)
             assert inferred == "boolean"
 
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 62d26dacde67b..1338d801e39f4 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -7,6 +7,7 @@
 
 from pandas._libs import lib
 
+import pandas as pd
 from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna
 import pandas._testing as tm
 import pandas.core.strings as strings
@@ -207,6 +208,9 @@ def test_api_per_dtype(self, index_or_series, dtype, any_skipna_inferred_dtype):
         box = index_or_series
         inferred_dtype, values = any_skipna_inferred_dtype
 
+        if dtype == "category" and len(values) and values[1] is pd.NA:
+            pytest.xfail(reason="Categorical does not yet support pd.NA")
+
         t = box(values, dtype=dtype)  # explicit dtype to avoid casting
 
         # TODO: get rid of these xfails

From 92b0083d6fda18a9f6fb2963f54ae99b26b33423 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 12 Feb 2020 14:21:30 +0100
Subject: [PATCH 4/5] parametrize infer_dtype string/boolean tests over class, skipna, data

---
 pandas/tests/dtypes/test_inference.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 7c1837985472e..48ae1f67297af 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1200,19 +1200,23 @@ def test_interval(self):
         inferred = lib.infer_dtype(pd.Series(idx), skipna=False)
         assert inferred == "interval"
 
-    def test_string_dtype(self):
+    @pytest.mark.parametrize("klass", [pd.array, pd.Series])
+    @pytest.mark.parametrize("skipna", [True, False])
+    @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]])
+    def test_string_dtype(self, data, skipna, klass):
         # StringArray
-        arr = pd.array(["a", "b", pd.NA], dtype="string")
-        for val in [arr, pd.Series(arr)]:
-            inferred = lib.infer_dtype(val)
-            assert inferred == "string"
+        val = klass(data, dtype="string")
+        inferred = lib.infer_dtype(val, skipna=skipna)
+        assert inferred == "string"
 
-    def test_boolean_dtype(self):
+    @pytest.mark.parametrize("klass", [pd.array, pd.Series])
+    @pytest.mark.parametrize("skipna", [True, False])
+    @pytest.mark.parametrize("data", [[True, False, True], [True, False, pd.NA]])
+    def test_boolean_dtype(self, data, skipna, klass):
         # BooleanArray
-        arr = pd.array([True, False, pd.NA], dtype="boolean")
-        for val in [arr, pd.Series(arr)]:
-            inferred = lib.infer_dtype(val)
-            assert inferred == "boolean"
+        val = klass(data, dtype="boolean")
+        inferred = lib.infer_dtype(val, skipna=skipna)
+        assert inferred == "boolean"
 
 
 class TestNumberScalar:

From 45947a8445a6ca5762ed50a409e27fe4d1671cb2 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 12 Feb 2020 14:30:42 +0100
Subject: [PATCH 5/5] add pd.NA to test_convert_string_dtype data

---
 pandas/tests/series/methods/test_convert_dtypes.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
index 83d10a603d3d7..a6b5fed40a9d7 100644
--- a/pandas/tests/series/methods/test_convert_dtypes.py
+++ b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -250,6 +250,8 @@ def test_convert_dtypes(self, data, maindtype, params, answerdict):
     def test_convert_string_dtype(self):
         # https://github.com/pandas-dev/pandas/issues/31731 -> converting columns
         # that are already string dtype
-        df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["ä", "ö", "ü"]}, dtype="string")
+        df = pd.DataFrame(
+            {"A": ["a", "b", pd.NA], "B": ["ä", "ö", "ü"]}, dtype="string"
+        )
         result = df.convert_dtypes()
         tm.assert_frame_equal(df, result)