BUG: Don't parse NaN as 'nan' in Data IO

gfyoung · gfyoung · commit ddd1be32893c · 2018-10-17T19:51:15.000-07:00
Closes gh-20377.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -440,15 +440,15 @@ In addition to these API breaking changes, many :ref:`performance improvements a
 Raise ValueError in ``DataFrame.to_dict(orient='index')``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with 
+Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
 ``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)
 
 .. ipython:: python
     :okexcept:
 
     df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
     df
-    
+
     df.to_dict(orient='index')
 
 .. _whatsnew_0240.api.datetimelike.normalize:
@@ -923,6 +923,41 @@ MultiIndex
 I/O
 ^^^
 
+.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
+
+Proper handling of `np.NaN` in a string data-typed column with the Python engine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There was bug in :func:`read_excel` and :func:`read_csv` with the Python
+engine, where missing values turned to ``'nan'`` with ``dtype=str`` and
+``na_filter=True``. Now, these missing values are converted to the string
+missing indicator, ``np.nan``. (:issue `20377`)
+
+.. ipython:: python
+   :suppress:
+
+   from pandas.compat import StringIO
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [5]: data = 'a,b,c\n1,,3\n4,5,6'
+   In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
+   In [7]: df.loc[0, 'b']
+   Out[7]:
+   'nan'
+
+Current Behavior:
+
+.. ipython:: python
+
+   data = 'a,b,c\n1,,3\n4,5,6'
+   df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
+   df.loc[0, 'b']
+
+Notice how we now instead output ``np.nan`` itself instead of a stringified form of it.
+
 - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
 - :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -494,24 +494,70 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
     return result
 
 
-def astype_unicode(arr: ndarray) -> ndarray[object]:
+def astype_unicode(arr: ndarray,
+                   skipna: bool=False) -> ndarray[object]:
+    """
+    Convert all elements in an array to unicode.
+
+    Parameters
+    ----------
+    arr : ndarray
+        The array whose elements we are casting.
+    skipna : bool, default False
+        Whether or not to coerce nulls to their stringified form
+        (e.g. NaN becomes 'nan').
+
+    Returns
+    -------
+    casted_arr : ndarray
+        A new array with the input array's elements casted.
+    """
     cdef:
+        object arr_i
         Py_ssize_t i, n = arr.size
         ndarray[object] result = np.empty(n, dtype=object)
 
     for i in range(n):
-        result[i] = unicode(arr[i])
+        arr_i = arr[i]
+
+        if not (skipna and checknull(arr_i)):
+            arr_i = unicode(arr_i)
+
+        result[i] = arr_i
 
     return result
 
 
-def astype_str(arr: ndarray) -> ndarray[object]:
+def astype_str(arr: ndarray,
+               skipna: bool=False) -> ndarray[object]:
+    """
+    Convert all elements in an array to string.
+
+    Parameters
+    ----------
+    arr : ndarray
+        The array whose elements we are casting.
+    skipna : bool, default False
+        Whether or not to coerce nulls to their stringified form
+        (e.g. NaN becomes 'nan').
+
+    Returns
+    -------
+    casted_arr : ndarray
+        A new array with the input array's elements casted.
+    """
     cdef:
+        object arr_i
         Py_ssize_t i, n = arr.size
         ndarray[object] result = np.empty(n, dtype=object)
 
     for i in range(n):
-        result[i] = str(arr[i])
+        arr_i = arr[i]
+
+        if not (skipna and checknull(arr_i)):
+            arr_i = str(arr_i)
+
+        result[i] = arr_i
 
     return result
 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -645,17 +645,19 @@ def conv(r, dtype):
     return [conv(r, dtype) for r, dtype in zip(result, dtypes)]
 
 
-def astype_nansafe(arr, dtype, copy=True):
-    """ return a view if copy is False, but
-        need to be very careful as the result shape could change!
+def astype_nansafe(arr, dtype, copy=True, skipna=False):
+    """
+    Cast the elements of an array to a given dtype a nan-safe manner.
 
     Parameters
     ----------
     arr : ndarray
     dtype : np.dtype
     copy : bool, default True
         If False, a view will be attempted but may fail, if
-        e.g. the itemsizes don't align.
+        e.g. the item sizes don't align.
+    skipna: bool, default False
+        Whether or not we should skip NaN when casting as a string-type.
     """
 
     # dispatch on extension dtype if needed
@@ -668,10 +670,12 @@ def astype_nansafe(arr, dtype, copy=True):
 
     if issubclass(dtype.type, text_type):
         # in Py3 that's str, in Py2 that's unicode
-        return lib.astype_unicode(arr.ravel()).reshape(arr.shape)
+        return lib.astype_unicode(arr.ravel(),
+                                  skipna=skipna).reshape(arr.shape)
 
     elif issubclass(dtype.type, string_types):
-        return lib.astype_str(arr.ravel()).reshape(arr.shape)
+        return lib.astype_str(arr.ravel(),
+                              skipna=skipna).reshape(arr.shape)
 
     elif is_datetime64_dtype(arr):
         if is_object_dtype(dtype):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1685,7 +1685,8 @@ def _cast_types(self, values, cast_type, column):
 
         else:
             try:
-                values = astype_nansafe(values, cast_type, copy=True)
+                values = astype_nansafe(values, cast_type,
+                                        copy=True, skipna=True)
             except ValueError:
                 raise ValueError("Unable to convert column %s to "
                                  "type %s" % (column, cast_type))
diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py
@@ -5,6 +5,7 @@
 parsing for all of the parsers defined in parsers.py
 """
 
+import pytest
 import numpy as np
 from numpy import nan
 
@@ -380,3 +381,18 @@ def test_inf_na_values_with_int_index(self):
         expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
                              index=Index([1, 2], name="idx"))
         tm.assert_frame_equal(out, expected)
+
+    @pytest.mark.parametrize("na_filter", [True, False])
+    def test_na_values_with_dtype_str_and_na_filter(self, na_filter):
+        # see gh-20377
+        data = "a,b,c\n1,,3\n4,5,6"
+
+        # na_filter=True --> missing value becomes NaN.
+        # na_filter=False --> missing value remains empty string.
+        empty = np.nan if na_filter else ""
+        expected = DataFrame({"a": ["1", "4"],
+                              "b": [empty, "5"],
+                              "c": ["3", "6"]})
+
+        result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -14,7 +14,7 @@
 import pandas as pd
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
-from pandas import DataFrame, Index, MultiIndex
+from pandas import DataFrame, Index, MultiIndex, Series
 from pandas.compat import u, range, map, BytesIO, iteritems, PY36
 from pandas.core.config import set_option, get_option
 from pandas.io.common import URLError
@@ -371,7 +371,34 @@ def test_reader_dtype(self, ext):
         tm.assert_frame_equal(actual, expected)
 
         with pytest.raises(ValueError):
-            actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
+            self.get_exceldf(basename, ext, dtype={'d': 'int64'})
+
+    @pytest.mark.parametrize("dtype,expected", [
+        (None,
+         DataFrame({
+             "a": [1, 2, 3, 4],
+             "b": [2.5, 3.5, 4.5, 5.5],
+             "c": [1, 2, 3, 4],
+             "d": [1.0, 2.0, np.nan, 4.0]
+         })),
+        ({"a": "float64",
+          "b": "float32",
+          "c": str,
+          "d": str
+          },
+         DataFrame({
+             "a": Series([1, 2, 3, 4], dtype="float64"),
+             "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
+             "c": ["001", "002", "003", "004"],
+             "d": ["1", "2", np.nan, "4"]
+         })),
+    ])
+    def test_reader_dtype_str(self, ext, dtype, expected):
+        # see gh-20377
+        basename = "testdtype"
+
+        actual = self.get_exceldf(basename, ext, dtype=dtype)
+        tm.assert_frame_equal(actual, expected)
 
     def test_reading_all_sheets(self, ext):
         # Test reading all sheetnames by setting sheetname to None,