pandas-dev · jreback · Jun 8, 2018 · Jun 7, 2018 · Jun 7, 2018 · Jun 7, 2018
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -11,6 +11,8 @@ and bug fixes. We recommend that all users upgrade to this version.
     :backlinks: none
 
 
+.. _whatsnew_0231.enhancements:
+
 .. _whatsnew_0231.fixed_regressions:
 
 Fixed Regressions
@@ -29,6 +31,7 @@ Fixed Regressions
 - Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
 - Bug preventing pandas from being importable with -OO optimization (:issue:`21071`)
 - Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`)
+- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`)
 
 
 .. _whatsnew_0231.performance:

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -159,3 +159,14 @@ def tz_aware_fixture(request):
     Fixture for trying explicit timezones: {0}
     """
     return request.param
+
+
+@pytest.fixture(params=[str, 'str', 'U'])
+def string_dtype(request):
+    """Parametrized fixture for string dtypes.
+
+    * str
+    * 'str'
+    * 'U'
+    """
+    return request.param
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -4054,7 +4054,21 @@ def _try_cast(arr, take_fast_path):
                                            isinstance(subarr, np.ndarray))):
                 subarr = construct_1d_object_array_from_listlike(subarr)
             elif not is_extension_type(subarr):
-                subarr = np.array(subarr, dtype=dtype, copy=copy)
+                subarr2 = np.array(subarr, dtype=dtype, copy=copy)
+
+                if dtype is not None and dtype.kind in ("U", "S"):
+                    # GH-21083
+                    # We can't just return np.array(subarr, dtype='str') since
+                    # NumPy will convert the non-string objects into strings
+                    # Including NA values. Se we have to go
+                    # string -> object -> update NA, which requires an
+                    # additional pass over the data.
+                    na_values = isna(subarr)
+                    subarr2 = subarr2.astype(object)
+                    subarr2[na_values] = np.asarray(subarr,
+                                                    dtype=object)[na_values]
+
+                subarr = subarr2
         except (ValueError, TypeError):
             if is_categorical_dtype(dtype):
                 # We *do* allow casting to categorical, since we know

diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -151,6 +151,17 @@ def test_constructor_complex_dtypes(self):
         assert a.dtype == df.a.dtype
         assert b.dtype == df.b.dtype
 
+    def test_constructor_dtype_str_na_values(self, string_dtype):
+        # https://github.com/pandas-dev/pandas/issues/21083
+        df = DataFrame({'A': ['x', None]}, dtype=, string_dtype)
+        result = df.isna()
+        expected = DataFrame({"A": [False, True]})
+        tm.assert_frame_equal(result, expected)
+        assert df.iloc[1, 0] is None
+
+        df = DataFrame({'A': ['x', np.nan]}, dtype=, string_dtype)
+        assert np.isnan(df.iloc[1, 0])
+
     def test_constructor_rec(self):
         rec = self.frame.to_records(index=False)
 

diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
@@ -794,22 +794,26 @@ def test_arg_for_errors_in_astype(self):
 
     @pytest.mark.parametrize('input_vals', [
         ([1, 2]),
-        ([1.0, 2.0, np.nan]),
         (['1', '2']),
         (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
         (list(pd.date_range('1/1/2011', periods=2, freq='H',
                             tz='US/Eastern'))),
         ([pd.Interval(left=0, right=5)]),
     ])
-    def test_constructor_list_str(self, input_vals):
+    def test_constructor_list_str(self, input_vals, string_dtype):
         # GH 16605
         # Ensure that data elements are converted to strings when
         # dtype is str, 'str', or 'U'
 
-        for dtype in ['str', str, 'U']:
-            result = DataFrame({'A': input_vals}, dtype=dtype)
-            expected = DataFrame({'A': input_vals}).astype({'A': dtype})
-            assert_frame_equal(result, expected)
+        result = DataFrame({'A': input_vals}, dtype=string_dtype)
+        expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
+        assert_frame_equal(result, expected)
+
+    def test_constructor_list_str_na(self, string_dtype):
+
+        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
+        expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
+        assert_frame_equal(result, expected)
 
 
 class TestDataFrameDatetimeWithTZ(TestData):

diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
@@ -1829,7 +1829,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
 
         data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]
 
-        s = Series(data, dtype=str)
+        s = Series(data, dtype=object).astype(str)
         result = s.mode(dropna)
         expected3 = Series(expected3, dtype=str)
         tm.assert_series_equal(result, expected3)

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -137,6 +137,17 @@ def test_constructor_no_data_index_order(self):
         result = pd.Series(index=['b', 'a', 'c'])
         assert result.index.tolist() == ['b', 'a', 'c']
 
+    def test_constructor_dtype_str_na_values(self, string_dtype):
+        # https://github.com/pandas-dev/pandas/issues/21083
+        ser = Series(['x', None], dtype=string_dtype)
+        result = ser.isna()
+        expected = Series([False, True])
+        tm.assert_series_equal(result, expected)
+        assert ser.iloc[1] is None
+
+        ser = Series(['x', np.nan], dtype=string_dtype)
+        assert np.isnan(ser.iloc[1])
+
     def test_constructor_series(self):
         index1 = ['d', 'b', 'a', 'c']
         index2 = sorted(index1)
@@ -164,22 +175,25 @@ def test_constructor_list_like(self):
 
     @pytest.mark.parametrize('input_vals', [
         ([1, 2]),
-        ([1.0, 2.0, np.nan]),
         (['1', '2']),
         (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
         (list(pd.date_range('1/1/2011', periods=2, freq='H',
                             tz='US/Eastern'))),
         ([pd.Interval(left=0, right=5)]),
     ])
-    def test_constructor_list_str(self, input_vals):
+    def test_constructor_list_str(self, input_vals, string_dtype):
         # GH 16605
         # Ensure that data elements from a list are converted to strings
         # when dtype is str, 'str', or 'U'
+        result = Series(input_vals, dtype=string_dtype)
+        expected = Series(input_vals).astype(string_dtype)
+        assert_series_equal(result, expected)
 
-        for dtype in ['str', str, 'U']:
-            result = Series(input_vals, dtype=dtype)
-            expected = Series(input_vals).astype(dtype)
-            assert_series_equal(result, expected)
+    def test_constructor_list_str_na(self, string_dtype):
+        result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
+        expected = Series(['1.0', '2.0', np.nan], dtype=object)
+        assert_series_equal(result, expected)
+        assert np.isnan(result[2])
 
     def test_constructor_generator(self):
         gen = (i for i in range(10))