Skip to content

Commit ddd1be3

Browse files
committed
BUG: Don't parse NaN as 'nan' in Data IO
Closes gh-20377.
1 parent 1546c35 commit ddd1be3

File tree

6 files changed

+144
-15
lines changed

6 files changed

+144
-15
lines changed

doc/source/whatsnew/v0.24.0.txt

+37-2
Original file line numberDiff line numberDiff line change
@@ -440,15 +440,15 @@ In addition to these API breaking changes, many :ref:`performance improvements a
440440
Raise ValueError in ``DataFrame.to_dict(orient='index')``
441441
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
442442

443-
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
443+
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
444444
``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)
445445

446446
.. ipython:: python
447447
:okexcept:
448448

449449
df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
450450
df
451-
451+
452452
df.to_dict(orient='index')
453453

454454
.. _whatsnew_0240.api.datetimelike.normalize:
@@ -923,6 +923,41 @@ MultiIndex
923923
I/O
924924
^^^
925925

926+
.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
927+
928+
Proper handling of `np.NaN` in a string data-typed column with the Python engine
929+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
930+
931+
There was a bug in :func:`read_excel` and :func:`read_csv` with the Python
932+
engine, where missing values turned to ``'nan'`` with ``dtype=str`` and
933+
``na_filter=True``. Now, these missing values are converted to the string
934+
missing indicator, ``np.nan``. (:issue:`20377`)
935+
936+
.. ipython:: python
937+
:suppress:
938+
939+
from pandas.compat import StringIO
940+
941+
Previous Behavior:
942+
943+
.. code-block:: ipython
944+
945+
In [5]: data = 'a,b,c\n1,,3\n4,5,6'
946+
In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
947+
In [7]: df.loc[0, 'b']
948+
Out[7]:
949+
'nan'
950+
951+
Current Behavior:
952+
953+
.. ipython:: python
954+
955+
data = 'a,b,c\n1,,3\n4,5,6'
956+
df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
957+
df.loc[0, 'b']
958+
959+
Notice how we now output ``np.nan`` itself instead of a stringified form of it.
960+
926961
- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
927962
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
928963
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)

pandas/_libs/lib.pyx

+50-4
Original file line numberDiff line numberDiff line change
@@ -494,24 +494,70 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
494494
return result
495495

496496

497-
def astype_unicode(arr: ndarray) -> ndarray[object]:
497+
def astype_unicode(arr: ndarray,
498+
skipna: bool=False) -> ndarray[object]:
499+
"""
500+
Convert all elements in an array to unicode.
501+
502+
Parameters
503+
----------
504+
arr : ndarray
505+
The array whose elements we are casting.
506+
skipna : bool, default False
507+
Whether to leave nulls untouched rather than coercing them to their
508+
stringified form (e.g. with ``skipna=False``, NaN becomes 'nan').
509+
510+
Returns
511+
-------
512+
casted_arr : ndarray
513+
A new array with the input array's elements casted.
514+
"""
498515
cdef:
516+
object arr_i
499517
Py_ssize_t i, n = arr.size
500518
ndarray[object] result = np.empty(n, dtype=object)
501519

502520
for i in range(n):
503-
result[i] = unicode(arr[i])
521+
arr_i = arr[i]
522+
523+
if not (skipna and checknull(arr_i)):
524+
arr_i = unicode(arr_i)
525+
526+
result[i] = arr_i
504527

505528
return result
506529

507530

508-
def astype_str(arr: ndarray) -> ndarray[object]:
531+
def astype_str(arr: ndarray,
532+
skipna: bool=False) -> ndarray[object]:
533+
"""
534+
Convert all elements in an array to string.
535+
536+
Parameters
537+
----------
538+
arr : ndarray
539+
The array whose elements we are casting.
540+
skipna : bool, default False
541+
Whether to leave nulls untouched rather than coercing them to their
542+
stringified form (e.g. with ``skipna=False``, NaN becomes 'nan').
543+
544+
Returns
545+
-------
546+
casted_arr : ndarray
547+
A new array with the input array's elements casted.
548+
"""
509549
cdef:
550+
object arr_i
510551
Py_ssize_t i, n = arr.size
511552
ndarray[object] result = np.empty(n, dtype=object)
512553

513554
for i in range(n):
514-
result[i] = str(arr[i])
555+
arr_i = arr[i]
556+
557+
if not (skipna and checknull(arr_i)):
558+
arr_i = str(arr_i)
559+
560+
result[i] = arr_i
515561

516562
return result
517563

pandas/core/dtypes/cast.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -645,17 +645,19 @@ def conv(r, dtype):
645645
return [conv(r, dtype) for r, dtype in zip(result, dtypes)]
646646

647647

648-
def astype_nansafe(arr, dtype, copy=True):
649-
""" return a view if copy is False, but
650-
need to be very careful as the result shape could change!
648+
def astype_nansafe(arr, dtype, copy=True, skipna=False):
649+
"""
650+
Cast the elements of an array to a given dtype in a nan-safe manner.
651651
652652
Parameters
653653
----------
654654
arr : ndarray
655655
dtype : np.dtype
656656
copy : bool, default True
657657
If False, a view will be attempted but may fail, if
658-
e.g. the itemsizes don't align.
658+
e.g. the item sizes don't align.
659+
skipna : bool, default False
660+
Whether or not we should skip NaN when casting as a string-type.
659661
"""
660662

661663
# dispatch on extension dtype if needed
@@ -668,10 +670,12 @@ def astype_nansafe(arr, dtype, copy=True):
668670

669671
if issubclass(dtype.type, text_type):
670672
# in Py3 that's str, in Py2 that's unicode
671-
return lib.astype_unicode(arr.ravel()).reshape(arr.shape)
673+
return lib.astype_unicode(arr.ravel(),
674+
skipna=skipna).reshape(arr.shape)
672675

673676
elif issubclass(dtype.type, string_types):
674-
return lib.astype_str(arr.ravel()).reshape(arr.shape)
677+
return lib.astype_str(arr.ravel(),
678+
skipna=skipna).reshape(arr.shape)
675679

676680
elif is_datetime64_dtype(arr):
677681
if is_object_dtype(dtype):

pandas/io/parsers.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1685,7 +1685,8 @@ def _cast_types(self, values, cast_type, column):
16851685

16861686
else:
16871687
try:
1688-
values = astype_nansafe(values, cast_type, copy=True)
1688+
values = astype_nansafe(values, cast_type,
1689+
copy=True, skipna=True)
16891690
except ValueError:
16901691
raise ValueError("Unable to convert column %s to "
16911692
"type %s" % (column, cast_type))

pandas/tests/io/parser/na_values.py

+16
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
parsing for all of the parsers defined in parsers.py
66
"""
77

8+
import pytest
89
import numpy as np
910
from numpy import nan
1011

@@ -380,3 +381,18 @@ def test_inf_na_values_with_int_index(self):
380381
expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
381382
index=Index([1, 2], name="idx"))
382383
tm.assert_frame_equal(out, expected)
384+
385+
@pytest.mark.parametrize("na_filter", [True, False])
386+
def test_na_values_with_dtype_str_and_na_filter(self, na_filter):
387+
# see gh-20377
388+
data = "a,b,c\n1,,3\n4,5,6"
389+
390+
# na_filter=True --> missing value becomes NaN.
391+
# na_filter=False --> missing value remains empty string.
392+
empty = np.nan if na_filter else ""
393+
expected = DataFrame({"a": ["1", "4"],
394+
"b": [empty, "5"],
395+
"c": ["3", "6"]})
396+
397+
result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
398+
tm.assert_frame_equal(result, expected)

pandas/tests/io/test_excel.py

+29-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import pandas as pd
1515
import pandas.util.testing as tm
1616
import pandas.util._test_decorators as td
17-
from pandas import DataFrame, Index, MultiIndex
17+
from pandas import DataFrame, Index, MultiIndex, Series
1818
from pandas.compat import u, range, map, BytesIO, iteritems, PY36
1919
from pandas.core.config import set_option, get_option
2020
from pandas.io.common import URLError
@@ -371,7 +371,34 @@ def test_reader_dtype(self, ext):
371371
tm.assert_frame_equal(actual, expected)
372372

373373
with pytest.raises(ValueError):
374-
actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
374+
self.get_exceldf(basename, ext, dtype={'d': 'int64'})
375+
376+
@pytest.mark.parametrize("dtype,expected", [
377+
(None,
378+
DataFrame({
379+
"a": [1, 2, 3, 4],
380+
"b": [2.5, 3.5, 4.5, 5.5],
381+
"c": [1, 2, 3, 4],
382+
"d": [1.0, 2.0, np.nan, 4.0]
383+
})),
384+
({"a": "float64",
385+
"b": "float32",
386+
"c": str,
387+
"d": str
388+
},
389+
DataFrame({
390+
"a": Series([1, 2, 3, 4], dtype="float64"),
391+
"b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
392+
"c": ["001", "002", "003", "004"],
393+
"d": ["1", "2", np.nan, "4"]
394+
})),
395+
])
396+
def test_reader_dtype_str(self, ext, dtype, expected):
397+
# see gh-20377
398+
basename = "testdtype"
399+
400+
actual = self.get_exceldf(basename, ext, dtype=dtype)
401+
tm.assert_frame_equal(actual, expected)
375402

376403
def test_reading_all_sheets(self, ext):
377404
# Test reading all sheetnames by setting sheetname to None,

0 commit comments

Comments
 (0)