diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 16f0b9ee99909..d3b15703dab1f 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -440,7 +440,7 @@ In addition to these API breaking changes, many :ref:`performance improvements a
Raise ValueError in ``DataFrame.to_dict(orient='index')``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
+Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)
.. ipython:: python
@@ -448,7 +448,7 @@ Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
df
-
+
df.to_dict(orient='index')
.. _whatsnew_0240.api.datetimelike.normalize:
@@ -923,6 +923,41 @@ MultiIndex
I/O
^^^
+.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
+
+Proper handling of ``np.nan`` in a string data-typed column with the Python engine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There was a bug in :func:`read_excel` and :func:`read_csv` with the Python
+engine, where missing values turned into ``'nan'`` with ``dtype=str`` and
+``na_filter=True``. Now, these missing values are converted to the
+missing value indicator, ``np.nan``. (:issue:`20377`)
+
+.. ipython:: python
+ :suppress:
+
+ from pandas.compat import StringIO
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+ In [5]: data = 'a,b,c\n1,,3\n4,5,6'
+ In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
+ In [7]: df.loc[0, 'b']
+ Out[7]:
+ 'nan'
+
+Current Behavior:
+
+.. ipython:: python
+
+ data = 'a,b,c\n1,,3\n4,5,6'
+ df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
+ df.loc[0, 'b']
+
+Notice how we now output ``np.nan`` itself instead of a stringified form of it.
+
- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
- :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 0b9793a6ef97a..c5d5a431e8139 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -494,24 +494,70 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
return result
-def astype_unicode(arr: ndarray) -> ndarray[object]:
+def astype_unicode(arr: ndarray,
+ skipna: bool=False) -> ndarray[object]:
+ """
+ Convert all elements in an array to unicode.
+
+ Parameters
+ ----------
+ arr : ndarray
+ The array whose elements we are casting.
+ skipna : bool, default False
+        Whether or not to skip coercing nulls to their stringified
+        form (e.g. if False, NaN becomes 'nan').
+
+ Returns
+ -------
+ casted_arr : ndarray
+ A new array with the input array's elements casted.
+ """
cdef:
+ object arr_i
Py_ssize_t i, n = arr.size
ndarray[object] result = np.empty(n, dtype=object)
for i in range(n):
- result[i] = unicode(arr[i])
+ arr_i = arr[i]
+
+ if not (skipna and checknull(arr_i)):
+ arr_i = unicode(arr_i)
+
+ result[i] = arr_i
return result
-def astype_str(arr: ndarray) -> ndarray[object]:
+def astype_str(arr: ndarray,
+ skipna: bool=False) -> ndarray[object]:
+ """
+ Convert all elements in an array to string.
+
+ Parameters
+ ----------
+ arr : ndarray
+ The array whose elements we are casting.
+ skipna : bool, default False
+        Whether or not to skip coercing nulls to their stringified
+        form (e.g. if False, NaN becomes 'nan').
+
+ Returns
+ -------
+ casted_arr : ndarray
+ A new array with the input array's elements casted.
+ """
cdef:
+ object arr_i
Py_ssize_t i, n = arr.size
ndarray[object] result = np.empty(n, dtype=object)
for i in range(n):
- result[i] = str(arr[i])
+ arr_i = arr[i]
+
+ if not (skipna and checknull(arr_i)):
+ arr_i = str(arr_i)
+
+ result[i] = arr_i
return result
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index a95a45d5f9ae4..56bf394729773 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -645,9 +645,9 @@ def conv(r, dtype):
return [conv(r, dtype) for r, dtype in zip(result, dtypes)]
-def astype_nansafe(arr, dtype, copy=True):
- """ return a view if copy is False, but
- need to be very careful as the result shape could change!
+def astype_nansafe(arr, dtype, copy=True, skipna=False):
+ """
+    Cast the elements of an array to a given dtype in a nan-safe manner.
Parameters
----------
@@ -655,7 +655,9 @@ def astype_nansafe(arr, dtype, copy=True):
dtype : np.dtype
copy : bool, default True
If False, a view will be attempted but may fail, if
- e.g. the itemsizes don't align.
+ e.g. the item sizes don't align.
+    skipna : bool, default False
+ Whether or not we should skip NaN when casting as a string-type.
"""
# dispatch on extension dtype if needed
@@ -668,10 +670,12 @@ def astype_nansafe(arr, dtype, copy=True):
if issubclass(dtype.type, text_type):
# in Py3 that's str, in Py2 that's unicode
- return lib.astype_unicode(arr.ravel()).reshape(arr.shape)
+ return lib.astype_unicode(arr.ravel(),
+ skipna=skipna).reshape(arr.shape)
elif issubclass(dtype.type, string_types):
- return lib.astype_str(arr.ravel()).reshape(arr.shape)
+ return lib.astype_str(arr.ravel(),
+ skipna=skipna).reshape(arr.shape)
elif is_datetime64_dtype(arr):
if is_object_dtype(dtype):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 1edc6f6e14442..eeba30ed8a44f 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1685,7 +1685,8 @@ def _cast_types(self, values, cast_type, column):
else:
try:
- values = astype_nansafe(values, cast_type, copy=True)
+ values = astype_nansafe(values, cast_type,
+ copy=True, skipna=True)
except ValueError:
raise ValueError("Unable to convert column %s to "
"type %s" % (column, cast_type))
diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py
index 880ab707cfd07..29aed63e657fb 100644
--- a/pandas/tests/io/parser/na_values.py
+++ b/pandas/tests/io/parser/na_values.py
@@ -5,6 +5,7 @@
parsing for all of the parsers defined in parsers.py
"""
+import pytest
import numpy as np
from numpy import nan
@@ -380,3 +381,18 @@ def test_inf_na_values_with_int_index(self):
expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
index=Index([1, 2], name="idx"))
tm.assert_frame_equal(out, expected)
+
+ @pytest.mark.parametrize("na_filter", [True, False])
+ def test_na_values_with_dtype_str_and_na_filter(self, na_filter):
+ # see gh-20377
+ data = "a,b,c\n1,,3\n4,5,6"
+
+ # na_filter=True --> missing value becomes NaN.
+ # na_filter=False --> missing value remains empty string.
+ empty = np.nan if na_filter else ""
+ expected = DataFrame({"a": ["1", "4"],
+ "b": [empty, "5"],
+ "c": ["3", "6"]})
+
+ result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
index a639556eb07d6..1bd2fb5887e38 100644
--- a/pandas/tests/io/test_excel.py
+++ b/pandas/tests/io/test_excel.py
@@ -14,7 +14,7 @@
import pandas as pd
import pandas.util.testing as tm
import pandas.util._test_decorators as td
-from pandas import DataFrame, Index, MultiIndex
+from pandas import DataFrame, Index, MultiIndex, Series
from pandas.compat import u, range, map, BytesIO, iteritems, PY36
from pandas.core.config import set_option, get_option
from pandas.io.common import URLError
@@ -371,7 +371,34 @@ def test_reader_dtype(self, ext):
tm.assert_frame_equal(actual, expected)
with pytest.raises(ValueError):
- actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
+ self.get_exceldf(basename, ext, dtype={'d': 'int64'})
+
+ @pytest.mark.parametrize("dtype,expected", [
+ (None,
+ DataFrame({
+ "a": [1, 2, 3, 4],
+ "b": [2.5, 3.5, 4.5, 5.5],
+ "c": [1, 2, 3, 4],
+ "d": [1.0, 2.0, np.nan, 4.0]
+ })),
+ ({"a": "float64",
+ "b": "float32",
+ "c": str,
+ "d": str
+ },
+ DataFrame({
+ "a": Series([1, 2, 3, 4], dtype="float64"),
+ "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
+ "c": ["001", "002", "003", "004"],
+ "d": ["1", "2", np.nan, "4"]
+ })),
+ ])
+ def test_reader_dtype_str(self, ext, dtype, expected):
+ # see gh-20377
+ basename = "testdtype"
+
+ actual = self.get_exceldf(basename, ext, dtype=dtype)
+ tm.assert_frame_equal(actual, expected)
def test_reading_all_sheets(self, ext):
# Test reading all sheetnames by setting sheetname to None,