pandas-dev · nikoskaragiannakis · Mar 20, 2018 · Mar 20, 2018 · Mar 20, 2018 · Mar 20, 2018
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -1099,6 +1099,7 @@ I/O
 - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`)
 - Bug in ``usecols`` parameter in :func:`pandas.io.read_csv` and :func:`pandas.io.read_table` where error is not raised correctly when passing a string. (:issue:`20529`)
 - Bug in :func:`HDFStore.keys` when reading a file with a softlink causes exception (:issue:`20523`)
+- Bug in :func:`read_excel` and :func:`read_csv` where missing values were turned into the string ``'nan'`` with ``dtype=str`` and ``na_filter=True``. Now, these missing values are converted to the missing-value indicator, ``np.nan``. (:issue:`20377`)
 
 Plotting
 ^^^^^^^^

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -461,11 +461,17 @@ cpdef ndarray[object] astype_unicode(ndarray arr):
     cdef:
         Py_ssize_t i, n = arr.size
         ndarray[object] result = np.empty(n, dtype=object)
+        object arr_i
 
     for i in range(n):
         # we can use the unsafe version because we know `result` is mutable
         # since it was created from `np.empty`
-        util.set_value_at_unsafe(result, i, unicode(arr[i]))
+        arr_i = arr[i]
+        util.set_value_at_unsafe(
+            result,
+            i,
+            unicode(arr_i) if not checknull(arr_i) else np.nan
+        )
 
     return result
 
@@ -474,11 +480,17 @@ cpdef ndarray[object] astype_str(ndarray arr):
     cdef:
         Py_ssize_t i, n = arr.size
         ndarray[object] result = np.empty(n, dtype=object)
+        object arr_i
 
     for i in range(n):
         # we can use the unsafe version because we know `result` is mutable
         # since it was created from `np.empty`
-        util.set_value_at_unsafe(result, i, str(arr[i]))
+        arr_i = arr[i]
+        util.set_value_at_unsafe(
+            result,
+            i,
+            str(arr_i) if not checknull(arr_i) else np.nan
+        )
 
     return result
 

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1218,6 +1218,7 @@ cdef class TextReader:
             # treat as a regular string parsing
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
+
         elif dtype.kind == 'U':
             width = dtype.itemsize
             if width > 0:
@@ -1227,6 +1228,7 @@ cdef class TextReader:
             # unicode variable width
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
+
         elif is_categorical_dtype(dtype):
             # TODO: I suspect that _categorical_convert could be
             # optimized when dtype is an instance of CategoricalDtype

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -4153,4 +4153,8 @@ def _try_cast(arr, take_fast_path):
                 data = np.array(data, dtype=dtype, copy=False)
             subarr = np.array(data, dtype=object, copy=copy)
 
+            # GH 20377
+            # Turn all 'nan' to np.nan
+            subarr[subarr == 'nan'] = np.nan
+
     return subarr
diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
@@ -529,7 +529,7 @@ def test_astype_str(self):
         # consistency in astype(str)
         for tt in set([str, compat.text_type]):
             result = DataFrame([np.NaN]).astype(tt)
-            expected = DataFrame(['nan'])
+            expected = DataFrame([np.NaN], dtype=object)
             assert_frame_equal(result, expected)
 
             result = DataFrame([1.12345678901234567890]).astype(tt)

diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py
@@ -369,3 +369,27 @@ def test_no_na_filter_on_index(self):
         expected = DataFrame({"a": [1, 4], "c": [3, 6]},
                              index=Index([np.nan, 5.0], name="b"))
         tm.assert_frame_equal(out, expected)
+
+    def test_na_values_with_dtype_str_and_na_filter_true(self):
+        # see gh-20377
+        data = "a,b,c\n1,,3\n4,5,6"
+
+        out = self.read_csv(StringIO(data), na_filter=True, dtype=str)
+
+        # missing data turn to np.nan, which stays as it is after dtype=str
+        expected = DataFrame({"a": ["1", "4"],
+                              "b": [np.nan, "5"],
+                              "c": ["3", "6"]})
+        tm.assert_frame_equal(out, expected)
+
+    def test_na_values_with_dtype_str_and_na_filter_false(self):
+        # see gh-20377
+        data = "a,b,c\n1,,3\n4,5,6"
+
+        out = self.read_csv(StringIO(data), na_filter=False, dtype=str)
+
+        # missing data turn to empty string
+        expected = DataFrame({"a": ["1", "4"],
+                              "b": ["", "5"],
+                              "c": ["3", "6"]})
+        tm.assert_frame_equal(out, expected)
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -360,6 +360,33 @@ def test_reader_dtype(self, ext):
         with pytest.raises(ValueError):
             actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
 
+    def test_reader_dtype_str(self, ext):
+        # GH 20377
+        basename = 'testdtype'
+        actual = self.get_exceldf(basename, ext)
+
+        expected = DataFrame({
+            'a': [1, 2, 3, 4],
+            'b': [2.5, 3.5, 4.5, 5.5],
+            'c': [1, 2, 3, 4],
+            'd': [1.0, 2.0, np.nan, 4.0]}).reindex(
+                columns=['a', 'b', 'c', 'd'])
+
+        tm.assert_frame_equal(actual, expected)
+
+        actual = self.get_exceldf(basename, ext,
+                                  dtype={'a': 'float64',
+                                         'b': 'float32',
+                                         'c': str,
+                                         'd': str})
+
+        expected['a'] = expected['a'].astype('float64')
+        expected['b'] = expected['b'].astype('float32')
+        expected['c'] = ['001', '002', '003', '004']
+        expected['d'] = ['1', '2', np.nan, '4']
+
+        tm.assert_frame_equal(actual, expected)
+
     def test_reading_all_sheets(self, ext):
         # Test reading all sheetnames by setting sheetname to None,
         # Ensure a dict is returned.

diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
@@ -149,6 +149,7 @@ def test_astype_str_map(self, dtype, series):
         # see gh-4405
         result = series.astype(dtype)
         expected = series.map(compat.text_type)
+        expected = expected.replace('nan', np.nan)  # see gh-20377
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("dtype", [str, compat.text_type])