diff --git a/doc/source/io.rst b/doc/source/io.rst index 7cc6d6eecfd6c..2537d52df6dac 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -546,6 +546,53 @@ The ``thousands`` keyword allows integers to be parsed correctly os.remove('tmp.csv') +.. _io.na_values: + +NA Values +~~~~~~~~~ + +To control which values are parsed as missing values (which are signified by ``NaN``), specify a +list of strings in ``na_values``. If you specify a number (a ``float``, like ``5.0`` or an ``integer`` like ``5``), +the corresponding equivalent values will also imply a missing value (in this case effectively +``[5.0,5]`` are recognized as ``NaN``). + +To completely override the default values that are recognized as missing, specify ``keep_default_na=False``. +The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', 'NA', +'#NA', 'NULL', 'NaN', 'nan']``. + +.. code-block:: python + + read_csv(path, na_values=[5]) + +the default values, in addition to ``5`` and ``5.0`` when interpreted as numbers, are recognized as ``NaN`` + +.. code-block:: python + + read_csv(path, keep_default_na=False, na_values=[""]) + +only an empty field will be ``NaN`` + +.. code-block:: python + + read_csv(path, keep_default_na=False, na_values=["NA", "0"]) + +only ``NA`` and ``0`` as strings are ``NaN`` + +.. code-block:: python + + read_csv(path, na_values=["Nope"]) + +the default values, in addition to the string ``"Nope"``, are recognized as ``NaN`` + +.. _io.infinity: + +Infinity +~~~~~~~~ + +``inf``-like values will be parsed as ``np.inf`` (positive infinity), and ``-inf`` as ``-np.inf`` (negative infinity). +These will ignore the case of the value, meaning ``Inf`` will also be parsed as ``np.inf``. + + .. 
_io.comments: Comments diff --git a/doc/source/release.rst b/doc/source/release.rst index 90d5b1600b4eb..8c6cf34b0dbbe 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -82,6 +82,7 @@ pandas 0.13 local variable was undefined (:issue:`4381`) - In ``to_json``, raise if a passed ``orient`` would cause loss of data because of a duplicate index (:issue:`4359`) + - Fixed passing ``keep_default_na=False`` when ``na_values=None`` (:issue:`4318`) pandas 0.12 =========== diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f76b1c563a7a5..a6c8584441daf 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1774,8 +1774,11 @@ def _try_convert_dates(parser, colspec, data_dict, columns): def _clean_na_values(na_values, keep_default_na=True): - if na_values is None and keep_default_na: - na_values = _NA_VALUES + if na_values is None: + if keep_default_na: + na_values = _NA_VALUES + else: + na_values = [] na_fvalues = set() elif isinstance(na_values, dict): if keep_default_na: diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index a46a3de60fe04..730450e373341 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -108,6 +108,27 @@ def test_empty_string(self): np.nan, 'seven']}) tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + # GH4318, passing na_values=None and keep_default_na=False yields 'None' as a na_value + data = """\ +One,Two,Three +a,1,None +b,2,two +,3,None +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + df = self.read_csv( + StringIO(data), keep_default_na=False) + xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['None', 'two', 'None', 'nan', 'five', '', + 'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + def test_read_csv(self): if not compat.PY3: if 'win' in sys.platform: