BUG: Don't parse NaN as 'nan' in Data IO

gfyoung · gfyoung · commit a55e8d0d06e9 · 2018-10-14T22:55:51.000-07:00
Closes gh-20377.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -416,15 +416,15 @@ In addition to these API breaking changes, many :ref:`performance improvements a
 Raise ValueError in ``DataFrame.to_dict(orient='index')``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with 
+Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
 ``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)
 
 .. ipython:: python
     :okexcept:
 
     df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
     df
-    
+
     df.to_dict(orient='index')
 
 .. _whatsnew_0240.api.datetimelike.normalize:
@@ -899,6 +899,41 @@ MultiIndex
 I/O
 ^^^
 
+.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
+
+Proper handling of `np.NaN` in a string data-typed column with the Python engine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There was bug in :func:`read_excel` and :func:`read_csv` with the Python
+engine, where missing values turned to ``'nan'`` with ``dtype=str`` and
+``na_filter=True``. Now, these missing values are converted to the string
+missing indicator, ``np.nan``. (:issue `20377`)
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [5]: data = 'a,b,c\n1,,3\n4,5,6'
+   In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
+   In [7]: df['b'][0]
+   Out[7]:
+   'nan'
+
+Current Behavior:
+
+.. ipython:: python
+   :suppress:
+
+   from pandas.compat import StringIO
+
+.. ipython:: python
+
+   data = 'a,b,c\n1,,3\n4,5,6'
+   df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
+   df['b'][0]
+
+Notice how we now instead output ``np.nan`` itself instead of a stringified form of it.
+
 - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
 - :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -494,24 +494,38 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
     return result
 
 
-def astype_unicode(arr: ndarray) -> ndarray[object]:
+def astype_unicode(arr: ndarray,
+                   skipna: bool=False) -> ndarray[object]:
     cdef:
+        object arr_i
         Py_ssize_t i, n = arr.size
         ndarray[object] result = np.empty(n, dtype=object)
 
     for i in range(n):
-        result[i] = unicode(arr[i])
+        arr_i = arr[i]
+
+        if not (skipna and checknull(arr_i)):
+            arr_i = unicode(arr_i)
+
+        result[i] = arr_i
 
     return result
 
 
-def astype_str(arr: ndarray) -> ndarray[object]:
+def astype_str(arr: ndarray,
+               skipna: bool=False) -> ndarray[object]:
     cdef:
+        object arr_i
         Py_ssize_t i, n = arr.size
         ndarray[object] result = np.empty(n, dtype=object)
 
     for i in range(n):
-        result[i] = str(arr[i])
+        arr_i = arr[i]
+
+        if not (skipna and checknull(arr_i)):
+            arr_i = str(arr_i)
+
+        result[i] = arr_i
 
     return result
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -14,9 +14,9 @@
 
 from pandas import compat
 from pandas.compat import (range, lrange, PY3, StringIO, lzip,
-                           zip, string_types, map, u)
+                           zip, text_type, string_types, map, u)
 from pandas.core.dtypes.common import (
-    is_integer, ensure_object,
+    pandas_dtype, is_integer, ensure_object,
     is_list_like, is_integer_dtype,
     is_float, is_dtype_equal,
     is_object_dtype, is_string_dtype,
@@ -1685,7 +1685,22 @@ def _cast_types(self, values, cast_type, column):
 
         else:
             try:
-                values = astype_nansafe(values, cast_type, copy=True)
+                # gh-20377
+                #
+                # The C parser does not convert np.NaN to 'nan' if
+                # the casted dtype is a string-like.
+                cast_type = pandas_dtype(cast_type)
+
+                if issubclass(cast_type.type, string_types + (text_type,)):
+                    val_shape = values.shape
+                    val_flat = values.ravel()
+
+                    method = ("astype_unicode" if issubclass(
+                        cast_type.type, text_type) else "astype_str")
+                    values = getattr(lib, method)(
+                        val_flat, skipna=True).reshape(val_shape)
+                else:
+                    values = astype_nansafe(values, cast_type, copy=True)
             except ValueError:
                 raise ValueError("Unable to convert column %s to "
                                  "type %s" % (column, cast_type))
diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py
@@ -5,6 +5,7 @@
 parsing for all of the parsers defined in parsers.py
 """
 
+import pytest
 import numpy as np
 from numpy import nan
 
@@ -380,3 +381,18 @@ def test_inf_na_values_with_int_index(self):
         expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
                              index=Index([1, 2], name="idx"))
         tm.assert_frame_equal(out, expected)
+
+    @pytest.mark.parametrize("na_filter", [True, False])
+    def test_na_values_with_dtype_str_and_na_filter(self, na_filter):
+        # see gh-20377
+        data = "a,b,c\n1,,3\n4,5,6"
+
+        # na_filter=True --> missing value becomes NaN.
+        # na_filter=False --> missing value remains empty string.
+        empty = np.nan if na_filter else ""
+        expected = DataFrame({"a": ["1", "4"],
+                              "b": [empty, "5"],
+                              "c": ["3", "6"]})
+
+        result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -14,7 +14,7 @@
 import pandas as pd
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
-from pandas import DataFrame, Index, MultiIndex
+from pandas import DataFrame, Index, MultiIndex, Series
 from pandas.compat import u, range, map, BytesIO, iteritems, PY36
 from pandas.core.config import set_option, get_option
 from pandas.io.common import URLError
@@ -371,7 +371,34 @@ def test_reader_dtype(self, ext):
         tm.assert_frame_equal(actual, expected)
 
         with pytest.raises(ValueError):
-            actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
+            self.get_exceldf(basename, ext, dtype={'d': 'int64'})
+
+    @pytest.mark.parametrize("dtype,expected", [
+        (None,
+         DataFrame({
+             "a": [1, 2, 3, 4],
+             "b": [2.5, 3.5, 4.5, 5.5],
+             "c": [1, 2, 3, 4],
+             "d": [1.0, 2.0, np.nan, 4.0]
+         })),
+        ({"a": "float64",
+          "b": "float32",
+          "c": str,
+          "d": str
+          },
+         DataFrame({
+             "a": Series([1, 2, 3, 4], dtype="float64"),
+             "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
+             "c": ["001", "002", "003", "004"],
+             "d": ["1", "2", np.nan, "4"]
+         })),
+    ])
+    def test_reader_dtype_str(self, ext, dtype, expected):
+        # see gh-20377
+        basename = "testdtype"
+
+        actual = self.get_exceldf(basename, ext, dtype=dtype)
+        tm.assert_frame_equal(actual, expected)
 
     def test_reading_all_sheets(self, ext):
         # Test reading all sheetnames by setting sheetname to None,