diff --git a/doc/source/io.rst b/doc/source/io.rst index 0c31bfe014a12..bd81b478b5326 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -226,7 +226,7 @@ NA and Missing Data Handling na_values : scalar, str, list-like, or dict, default ``None`` Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: - ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA', + ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''``. keep_default_na : boolean, default ``True`` If na_values are specified and keep_default_na is ``False`` the default NaN diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 6dad03f086a6c..3dd8bb2ac2de5 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -38,7 +38,7 @@ Other Enhancements - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an inplace argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when margins=True. (:issue:`15972`) -- :func:`read_csv` has gained 'null' as an additional default missing value.(:issue:`16471`) + .. _whatsnew_0210.api_breaking: Backwards incompatible API changes @@ -49,6 +49,8 @@ Backwards incompatible API changes - Accessing a non-existent attribute on a closed :class:`HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) +- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) +- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) .. 
_whatsnew_0210.api: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 7a6f366d5b1a9..2549c8545908d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -277,7 +277,7 @@ DEFAULT_CHUNKSIZE = 256 * 1024 # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', _NA_VALUES = [b'-1.#IND', b'1.#QNAN', b'1.#IND', b'-1.#QNAN', - b'#N/A N/A', b'NA', b'#NA', b'NULL', b'null', b'NaN', + b'#N/A N/A', b'n/a', b'NA', b'#NA', b'NULL', b'null', b'NaN', b'nan', b''] diff --git a/pandas/io/common.py b/pandas/io/common.py index 1c987f6a9dfc3..cbfc33dbebb81 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -31,7 +31,7 @@ # '1.#INF','-1.#INF', '1.#INF000000', _NA_VALUES = set([ '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', - 'N/A', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '' + 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '' ]) try: diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 6f72ed51d76c6..170f9d428c9cc 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -70,8 +70,8 @@ def test_non_string_na_values(self): def test_default_na_values(self): _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', - '#N/A', 'N/A', 'NA', '#NA', 'NULL', 'null', 'NaN', - 'nan', '-NaN', '-nan', '#N/A N/A', '']) + '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', + 'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', '']) assert _NA_VALUES == parsers._NA_VALUES nv = len(_NA_VALUES)