diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a90774d2e8ff1..83ceb11dfcbf4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -298,8 +298,11 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range" RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests arrays/string_.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/arrays/string_.py + MSG='Doctests arrays'; echo $MSG + pytest -q --doctest-modules \ + pandas/core/arrays/string_.py \ + pandas/core/arrays/integer.py \ + pandas/core/arrays/boolean.py RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests arrays/boolean.py' ; echo $MSG diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index e174060647018..55bbf6848820b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1153,7 +1153,7 @@ To completely override the default values that are recognized as missing, specif .. _io.navaluesconst: The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', -'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. +'n/a', 'NA', '<NA>', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. Let us consider some examples: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 11a6f2628ac52..216e53036c44f 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -576,6 +576,7 @@ Other API changes Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`). +- Added ``<NA>`` to the list of default NA values for :meth:`read_csv` (:issue:`30821`) .. 
_whatsnew_100.api.documentation: diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index afaf9115abfd3..26653438356b1 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -354,10 +354,7 @@ class NAType(C_NAType): return NAType._instance def __repr__(self) -> str: - return "NA" - - def __str__(self) -> str: - return "NA" + return "" def __bool__(self): raise TypeError("boolean value of NA is ambiguous") diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3df2362f41f0f..377d49f2bbd29 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1369,6 +1369,7 @@ STR_NA_VALUES = { "N/A", "n/a", "NA", + "", "#NA", "NULL", "null", diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index c065fdeba2177..af6232fcc3367 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -244,7 +244,7 @@ class BooleanArray(BaseMaskedArray): >>> pd.array([True, False, None], dtype="boolean") - [True, False, NA] + [True, False, ] Length: 3, dtype: boolean """ @@ -527,7 +527,7 @@ def any(self, skipna: bool = True, **kwargs): >>> pd.array([True, False, pd.NA]).any(skipna=False) True >>> pd.array([False, False, pd.NA]).any(skipna=False) - NA + """ kwargs.pop("axis", None) nv.validate_any((), kwargs) @@ -592,7 +592,7 @@ def all(self, skipna: bool = True, **kwargs): required (whether ``pd.NA`` is True or False influences the result): >>> pd.array([True, True, pd.NA]).all(skipna=False) - NA + >>> pd.array([True, False, pd.NA]).all(skipna=False) False """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 50062f09495aa..be93bb7363ed3 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -301,19 +301,19 @@ class IntegerArray(BaseMaskedArray): >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype()) >>> int_array - [1, NaN, 3] + [1, , 3] Length: 3, dtype: Int32 String aliases for the dtypes are also available. 
They are capitalized. >>> pd.array([1, None, 3], dtype='Int32') - [1, NaN, 3] + [1, , 3] Length: 3, dtype: Int32 >>> pd.array([1, None, 3], dtype='UInt16') - [1, NaN, 3] + [1, , 3] Length: 3, dtype: UInt16 """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0da877fb1ad45..b41b6d53e90f4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -131,7 +131,7 @@ class StringArray(PandasArray): -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") - ['This is', 'some text', NA, 'data.'] + ['This is', 'some text', , 'data.'] Length: 4, dtype: string Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string @@ -146,7 +146,7 @@ class StringArray(PandasArray): >>> pd.array(["a", None, "c"], dtype="string") == "a" - [True, NA, False] + [True, , False] Length: 3, dtype: boolean """ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5d240a3d7821f..f74033924f64e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1777,12 +1777,8 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): values = values[slicer] mask = isna(values) - try: - values[mask] = na_rep - except Exception: - # eg SparseArray does not support setitem, needs to be converted to ndarray - return super().to_native_types(slicer, na_rep, quoting, **kwargs) - values = values.astype(str) + values = np.asarray(values.astype(object)) + values[mask] = na_rep # we are expected to return a 2-d ndarray return values.reshape(1, len(values)) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 5c4b7d103d271..b981c2feea380 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1230,7 +1230,7 @@ def _format(x): if x is None: return "None" elif x is NA: - return "NA" + return formatter(x) elif x is NaT or np.isnat(x): return "NaT" except (TypeError, ValueError): diff --git 
a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ec7e35e5c6db4..2c61744ef953c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -9,14 +9,16 @@ import pandas._testing as tm -def test_repr_with_NA(): - a = pd.array(["a", pd.NA, "b"], dtype="string") - for obj in [a, pd.Series(a), pd.DataFrame({"a": a})]: - assert "NA" in repr(obj) and "NaN" not in repr(obj) - assert "NA" in str(obj) and "NaN" not in str(obj) - if hasattr(obj, "_repr_html_"): - html_repr = obj._repr_html_() - assert "NA" in html_repr and "NaN" not in html_repr +def test_repr(): + df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype="string")}) + expected = " A\n0 a\n1 \n2 b" + assert repr(df) == expected + + expected = "0 a\n1 \n2 b\nName: A, dtype: string" + assert repr(df.A) == expected + + expected = "\n['a', , 'b']\nLength: 3, dtype: string" + assert repr(df.A.array) == expected def test_none_to_nan(): diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index b89aece3f982c..fc1abce8c077a 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -251,6 +251,18 @@ def test_coerce_to_numpy_array(): np.array(arr, dtype="bool") +def test_repr(): + df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) + expected = " A\n0 True\n1 False\n2 " + assert repr(df) == expected + + expected = "0 True\n1 False\n2 \nName: A, dtype: boolean" + assert repr(df.A) == expected + + expected = "\n[True, False, ]\nLength: 3, dtype: boolean" + assert repr(df.A.array) == expected + + @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) def test_to_numpy(box): con = pd.Series if box else pd.array @@ -335,7 +347,7 @@ def test_astype(): tm.assert_numpy_array_equal(result, expected) result = arr.astype("str") - expected = np.array(["True", "False", "NA"], dtype="object") + expected = np.array(["True", "False", 
""], dtype="object") tm.assert_numpy_array_equal(result, expected) # no missing values diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 6a3ef75157d5d..4ccaa4431c998 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -90,7 +90,7 @@ def test_repr_dtype(dtype, expected): def test_repr_array(): result = repr(integer_array([1, None, 3])) - expected = "\n[1, NA, 3]\nLength: 3, dtype: Int64" + expected = "\n[1, , 3]\nLength: 3, dtype: Int64" assert result == expected @@ -98,9 +98,9 @@ def test_repr_array_long(): data = integer_array([1, 2, None] * 1000) expected = ( "\n" - "[ 1, 2, NA, 1, 2, NA, 1, 2, NA, 1,\n" + "[ 1, 2, , 1, 2, , 1, 2, , 1,\n" " ...\n" - " NA, 1, 2, NA, 1, 2, NA, 1, 2, NA]\n" + " , 1, 2, , 1, 2, , 1, 2, ]\n" "Length: 3000, dtype: Int64" ) result = repr(data) @@ -673,7 +673,7 @@ def test_to_numpy_na_raises(self, dtype): def test_astype_str(self): a = pd.array([1, 2, None], dtype="Int64") - expected = np.array(["1", "2", "NA"], dtype=object) + expected = np.array(["1", "2", ""], dtype=object) tm.assert_numpy_array_equal(a.astype(str), expected) tm.assert_numpy_array_equal(a.astype("str"), expected) @@ -683,7 +683,7 @@ def test_frame_repr(data_missing): df = pd.DataFrame({"A": data_missing}) result = repr(df) - expected = " A\n0 NA\n1 1" + expected = " A\n0 \n1 1" assert result == expected diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index c9a0889cdd8b7..f9a083d7f5d22 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -89,6 +89,7 @@ def test_default_na_values(all_parsers): "N/A", "n/a", "NA", + "", "#NA", "NULL", "null", diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index a72378e02bec6..7d05511239ebc 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -16,8 +16,8 @@ def 
test_singleton(): def test_repr(): - assert repr(NA) == "NA" - assert str(NA) == "NA" + assert repr(NA) == "" + assert str(NA) == "" def test_truthiness():