Skip to content

Commit a55e8d0

Browse files
committed
BUG: Don't parse NaN as 'nan' in Data IO
Closes gh-20377.
1 parent cf11f71 commit a55e8d0

File tree

5 files changed

+118
-11
lines changed

5 files changed

+118
-11
lines changed

doc/source/whatsnew/v0.24.0.txt

+37-2
Original file line numberDiff line numberDiff line change
@@ -416,15 +416,15 @@ In addition to these API breaking changes, many :ref:`performance improvements a
416416
Raise ValueError in ``DataFrame.to_dict(orient='index')``
417417
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
418418

419-
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
419+
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
420420
``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)
421421

422422
.. ipython:: python
423423
:okexcept:
424424

425425
df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
426426
df
427-
427+
428428
df.to_dict(orient='index')
429429

430430
.. _whatsnew_0240.api.datetimelike.normalize:
@@ -899,6 +899,41 @@ MultiIndex
899899
I/O
900900
^^^
901901

902+
.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
903+
904+
Proper handling of ``np.NaN`` in a string data-typed column with the Python engine
905+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
906+
907+
There was a bug in :func:`read_excel` and :func:`read_csv` with the Python
908+
engine, where missing values turned to ``'nan'`` with ``dtype=str`` and
909+
``na_filter=True``. Now, these missing values are converted to the string
910+
missing indicator, ``np.nan``. (:issue:`20377`)
911+
912+
Previous Behavior:
913+
914+
.. code-block:: ipython
915+
916+
In [5]: data = 'a,b,c\n1,,3\n4,5,6'
917+
In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
918+
In [7]: df['b'][0]
919+
Out[7]:
920+
'nan'
921+
922+
Current Behavior:
923+
924+
.. ipython:: python
925+
:suppress:
926+
927+
from pandas.compat import StringIO
928+
929+
.. ipython:: python
930+
931+
data = 'a,b,c\n1,,3\n4,5,6'
932+
df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
933+
df['b'][0]
934+
935+
Notice how we now output ``np.nan`` itself instead of a stringified form of it.
936+
902937
- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
903938
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
904939
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)

pandas/_libs/lib.pyx

+18-4
Original file line numberDiff line numberDiff line change
@@ -494,24 +494,38 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
494494
return result
495495

496496

497-
def astype_unicode(arr: ndarray) -> ndarray[object]:
497+
def astype_unicode(arr: ndarray,
498+
skipna: bool=False) -> ndarray[object]:
498499
cdef:
500+
object arr_i
499501
Py_ssize_t i, n = arr.size
500502
ndarray[object] result = np.empty(n, dtype=object)
501503

502504
for i in range(n):
503-
result[i] = unicode(arr[i])
505+
arr_i = arr[i]
506+
507+
if not (skipna and checknull(arr_i)):
508+
arr_i = unicode(arr_i)
509+
510+
result[i] = arr_i
504511

505512
return result
506513

507514

508-
def astype_str(arr: ndarray) -> ndarray[object]:
515+
def astype_str(arr: ndarray,
516+
skipna: bool=False) -> ndarray[object]:
509517
cdef:
518+
object arr_i
510519
Py_ssize_t i, n = arr.size
511520
ndarray[object] result = np.empty(n, dtype=object)
512521

513522
for i in range(n):
514-
result[i] = str(arr[i])
523+
arr_i = arr[i]
524+
525+
if not (skipna and checknull(arr_i)):
526+
arr_i = str(arr_i)
527+
528+
result[i] = arr_i
515529

516530
return result
517531

pandas/io/parsers.py

+18-3
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414

1515
from pandas import compat
1616
from pandas.compat import (range, lrange, PY3, StringIO, lzip,
17-
zip, string_types, map, u)
17+
zip, text_type, string_types, map, u)
1818
from pandas.core.dtypes.common import (
19-
is_integer, ensure_object,
19+
pandas_dtype, is_integer, ensure_object,
2020
is_list_like, is_integer_dtype,
2121
is_float, is_dtype_equal,
2222
is_object_dtype, is_string_dtype,
@@ -1685,7 +1685,22 @@ def _cast_types(self, values, cast_type, column):
16851685

16861686
else:
16871687
try:
1688-
values = astype_nansafe(values, cast_type, copy=True)
1688+
# gh-20377
1689+
#
1690+
# The C parser does not convert np.NaN to 'nan' if
1691+
# the casted dtype is a string-like.
1692+
cast_type = pandas_dtype(cast_type)
1693+
1694+
if issubclass(cast_type.type, string_types + (text_type,)):
1695+
val_shape = values.shape
1696+
val_flat = values.ravel()
1697+
1698+
method = ("astype_unicode" if issubclass(
1699+
cast_type.type, text_type) else "astype_str")
1700+
values = getattr(lib, method)(
1701+
val_flat, skipna=True).reshape(val_shape)
1702+
else:
1703+
values = astype_nansafe(values, cast_type, copy=True)
16891704
except ValueError:
16901705
raise ValueError("Unable to convert column %s to "
16911706
"type %s" % (column, cast_type))

pandas/tests/io/parser/na_values.py

+16
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
parsing for all of the parsers defined in parsers.py
66
"""
77

8+
import pytest
89
import numpy as np
910
from numpy import nan
1011

@@ -380,3 +381,18 @@ def test_inf_na_values_with_int_index(self):
380381
expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
381382
index=Index([1, 2], name="idx"))
382383
tm.assert_frame_equal(out, expected)
384+
385+
@pytest.mark.parametrize("na_filter", [True, False])
386+
def test_na_values_with_dtype_str_and_na_filter(self, na_filter):
387+
# see gh-20377
388+
data = "a,b,c\n1,,3\n4,5,6"
389+
390+
# na_filter=True --> missing value becomes NaN.
391+
# na_filter=False --> missing value remains empty string.
392+
empty = np.nan if na_filter else ""
393+
expected = DataFrame({"a": ["1", "4"],
394+
"b": [empty, "5"],
395+
"c": ["3", "6"]})
396+
397+
result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
398+
tm.assert_frame_equal(result, expected)

pandas/tests/io/test_excel.py

+29-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import pandas as pd
1515
import pandas.util.testing as tm
1616
import pandas.util._test_decorators as td
17-
from pandas import DataFrame, Index, MultiIndex
17+
from pandas import DataFrame, Index, MultiIndex, Series
1818
from pandas.compat import u, range, map, BytesIO, iteritems, PY36
1919
from pandas.core.config import set_option, get_option
2020
from pandas.io.common import URLError
@@ -371,7 +371,34 @@ def test_reader_dtype(self, ext):
371371
tm.assert_frame_equal(actual, expected)
372372

373373
with pytest.raises(ValueError):
374-
actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
374+
self.get_exceldf(basename, ext, dtype={'d': 'int64'})
375+
376+
@pytest.mark.parametrize("dtype,expected", [
377+
(None,
378+
DataFrame({
379+
"a": [1, 2, 3, 4],
380+
"b": [2.5, 3.5, 4.5, 5.5],
381+
"c": [1, 2, 3, 4],
382+
"d": [1.0, 2.0, np.nan, 4.0]
383+
})),
384+
({"a": "float64",
385+
"b": "float32",
386+
"c": str,
387+
"d": str
388+
},
389+
DataFrame({
390+
"a": Series([1, 2, 3, 4], dtype="float64"),
391+
"b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
392+
"c": ["001", "002", "003", "004"],
393+
"d": ["1", "2", np.nan, "4"]
394+
})),
395+
])
396+
def test_reader_dtype_str(self, ext, dtype, expected):
397+
# see gh-20377
398+
basename = "testdtype"
399+
400+
actual = self.get_exceldf(basename, ext, dtype=dtype)
401+
tm.assert_frame_equal(actual, expected)
375402

376403
def test_reading_all_sheets(self, ext):
377404
# Test reading all sheetnames by setting sheetname to None,

0 commit comments

Comments
 (0)