Skip to content

BUG: Don't parse NaN as 'nan' in Data IO #23162

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 37 additions & 2 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -440,15 +440,15 @@ In addition to these API breaking changes, many :ref:`performance improvements a
Raise ValueError in ``DataFrame.to_dict(orient='index')``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)

.. ipython:: python
:okexcept:

df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
df

df.to_dict(orient='index')

.. _whatsnew_0240.api.datetimelike.normalize:
Expand Down Expand Up @@ -923,6 +923,41 @@ MultiIndex
I/O
^^^

.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:

Proper handling of ``np.NaN`` in a string data-typed column with the Python engine
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

There was a bug in :func:`read_excel` and :func:`read_csv` with the Python
engine, where missing values turned to ``'nan'`` with ``dtype=str`` and
``na_filter=True``. Now, these missing values are converted to the string
missing indicator, ``np.nan``. (:issue:`20377`)

.. ipython:: python
:suppress:

from pandas.compat import StringIO

Previous Behavior:

.. code-block:: ipython

In [5]: data = 'a,b,c\n1,,3\n4,5,6'
In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
In [7]: df.loc[0, 'b']
Out[7]:
'nan'

Current Behavior:

.. ipython:: python

data = 'a,b,c\n1,,3\n4,5,6'
df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
df.loc[0, 'b']

Notice how we now output ``np.nan`` itself instead of a stringified form of it.

- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
Expand Down
54 changes: 50 additions & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -494,24 +494,70 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
return result


def astype_unicode(arr: ndarray) -> ndarray[object]:
def astype_unicode(arr: ndarray,
skipna: bool=False) -> ndarray[object]:
"""
Convert all elements in an array to unicode.

Parameters
----------
arr : ndarray
The array whose elements we are casting.
skipna : bool, default False
Whether to leave nulls unconverted. If False, nulls are coerced to
their stringified form (e.g. NaN becomes 'nan').

Returns
-------
casted_arr : ndarray
A new array with the input array's elements casted.
"""
cdef:
object arr_i
Py_ssize_t i, n = arr.size
ndarray[object] result = np.empty(n, dtype=object)

for i in range(n):
result[i] = unicode(arr[i])
arr_i = arr[i]

if not (skipna and checknull(arr_i)):
arr_i = unicode(arr_i)

result[i] = arr_i

return result


def astype_str(arr: ndarray) -> ndarray[object]:
def astype_str(arr: ndarray,
skipna: bool=False) -> ndarray[object]:
"""
Convert all elements in an array to string.

Parameters
----------
arr : ndarray
The array whose elements we are casting.
skipna : bool, default False
Whether to leave nulls unconverted. If False, nulls are coerced to
their stringified form (e.g. NaN becomes 'nan').

Returns
-------
casted_arr : ndarray
A new array with the input array's elements casted.
"""
cdef:
object arr_i
Py_ssize_t i, n = arr.size
ndarray[object] result = np.empty(n, dtype=object)

for i in range(n):
result[i] = str(arr[i])
arr_i = arr[i]

if not (skipna and checknull(arr_i)):
arr_i = str(arr_i)

result[i] = arr_i

return result

Expand Down
16 changes: 10 additions & 6 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,17 +645,19 @@ def conv(r, dtype):
return [conv(r, dtype) for r, dtype in zip(result, dtypes)]


def astype_nansafe(arr, dtype, copy=True):
""" return a view if copy is False, but
need to be very careful as the result shape could change!
def astype_nansafe(arr, dtype, copy=True, skipna=False):
"""
Cast the elements of an array to a given dtype in a nan-safe manner.

Parameters
----------
arr : ndarray
dtype : np.dtype
copy : bool, default True
If False, a view will be attempted but may fail, if
e.g. the itemsizes don't align.
e.g. the item sizes don't align.
skipna : bool, default False
Whether or not we should skip NaN when casting as a string-type.
"""

# dispatch on extension dtype if needed
Expand All @@ -668,10 +670,12 @@ def astype_nansafe(arr, dtype, copy=True):

if issubclass(dtype.type, text_type):
# in Py3 that's str, in Py2 that's unicode
return lib.astype_unicode(arr.ravel()).reshape(arr.shape)
return lib.astype_unicode(arr.ravel(),
skipna=skipna).reshape(arr.shape)

elif issubclass(dtype.type, string_types):
return lib.astype_str(arr.ravel()).reshape(arr.shape)
return lib.astype_str(arr.ravel(),
skipna=skipna).reshape(arr.shape)

elif is_datetime64_dtype(arr):
if is_object_dtype(dtype):
Expand Down
3 changes: 2 additions & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1685,7 +1685,8 @@ def _cast_types(self, values, cast_type, column):

else:
try:
values = astype_nansafe(values, cast_type, copy=True)
values = astype_nansafe(values, cast_type,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gfyoung this is a longshot but any idea why you passed skipna=True here but not 9 lines up?

copy=True, skipna=True)
except ValueError:
raise ValueError("Unable to convert column %s to "
"type %s" % (column, cast_type))
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/io/parser/na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
parsing for all of the parsers defined in parsers.py
"""

import pytest
import numpy as np
from numpy import nan

Expand Down Expand Up @@ -380,3 +381,18 @@ def test_inf_na_values_with_int_index(self):
expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
index=Index([1, 2], name="idx"))
tm.assert_frame_equal(out, expected)

@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(self, na_filter):
# see gh-20377
data = "a,b,c\n1,,3\n4,5,6"

# na_filter=True --> missing value becomes NaN.
# na_filter=False --> missing value remains empty string.
empty = np.nan if na_filter else ""
expected = DataFrame({"a": ["1", "4"],
"b": [empty, "5"],
"c": ["3", "6"]})

result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
tm.assert_frame_equal(result, expected)
31 changes: 29 additions & 2 deletions pandas/tests/io/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import pandas as pd
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas import DataFrame, Index, MultiIndex
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.compat import u, range, map, BytesIO, iteritems, PY36
from pandas.core.config import set_option, get_option
from pandas.io.common import URLError
Expand Down Expand Up @@ -371,7 +371,34 @@ def test_reader_dtype(self, ext):
tm.assert_frame_equal(actual, expected)

with pytest.raises(ValueError):
actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
self.get_exceldf(basename, ext, dtype={'d': 'int64'})

@pytest.mark.parametrize("dtype,expected", [
(None,
DataFrame({
"a": [1, 2, 3, 4],
"b": [2.5, 3.5, 4.5, 5.5],
"c": [1, 2, 3, 4],
"d": [1.0, 2.0, np.nan, 4.0]
})),
({"a": "float64",
"b": "float32",
"c": str,
"d": str
},
DataFrame({
"a": Series([1, 2, 3, 4], dtype="float64"),
"b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
"c": ["001", "002", "003", "004"],
"d": ["1", "2", np.nan, "4"]
})),
])
def test_reader_dtype_str(self, ext, dtype, expected):
# see gh-20377
basename = "testdtype"

actual = self.get_exceldf(basename, ext, dtype=dtype)
tm.assert_frame_equal(actual, expected)

def test_reading_all_sheets(self, ext):
# Test reading all sheetnames by setting sheetname to None,
Expand Down