From 12aa3d3e850b922cde95fea92325cafef0aa1a1f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 8 Jan 2020 13:53:44 -0600 Subject: [PATCH 1/8] Update NA repr Closes https://github.com/pandas-dev/pandas/issues/30415 --- pandas/_libs/missing.pyx | 5 +---- pandas/io/formats/format.py | 2 +- pandas/tests/arrays/test_boolean.py | 2 +- pandas/tests/arrays/test_integer.py | 10 +++++----- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index afaf9115abfd3..26653438356b1 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -354,10 +354,7 @@ class NAType(C_NAType): return NAType._instance def __repr__(self) -> str: - return "NA" - - def __str__(self) -> str: - return "NA" + return "<NA>" def __bool__(self): raise TypeError("boolean value of NA is ambiguous") diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 5c4b7d103d271..b981c2feea380 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1230,7 +1230,7 @@ def _format(x): if x is None: return "None" elif x is NA: - return "NA" + return formatter(x) elif x is NaT or np.isnat(x): return "NaT" except (TypeError, ValueError): diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index b89aece3f982c..9346c120172d6 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -335,7 +335,7 @@ def test_astype(): tm.assert_numpy_array_equal(result, expected) result = arr.astype("str") - expected = np.array(["True", "False", "NA"], dtype="object") + expected = np.array(["True", "False", "<NA>"], dtype="object") tm.assert_numpy_array_equal(result, expected) # no missing values diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 6a3ef75157d5d..4ccaa4431c998 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -90,7 +90,7 @@ def test_repr_dtype(dtype, 
expected): def test_repr_array(): result = repr(integer_array([1, None, 3])) - expected = "<IntegerArray>\n[1, NA, 3]\nLength: 3, dtype: Int64" + expected = "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64" assert result == expected @@ -98,9 +98,9 @@ def test_repr_array_long(): data = integer_array([1, 2, None] * 1000) expected = ( "<IntegerArray>\n" - "[ 1, 2, NA, 1, 2, NA, 1, 2, NA, 1,\n" + "[ 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>, 1,\n" " ...\n" - " NA, 1, 2, NA, 1, 2, NA, 1, 2, NA]\n" + " <NA>, 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>]\n" "Length: 3000, dtype: Int64" ) result = repr(data) @@ -673,7 +673,7 @@ def test_to_numpy_na_raises(self, dtype): def test_astype_str(self): a = pd.array([1, 2, None], dtype="Int64") - expected = np.array(["1", "2", "NA"], dtype=object) + expected = np.array(["1", "2", "<NA>"], dtype=object) tm.assert_numpy_array_equal(a.astype(str), expected) tm.assert_numpy_array_equal(a.astype("str"), expected) @@ -683,7 +683,7 @@ def test_frame_repr(data_missing): df = pd.DataFrame({"A": data_missing}) result = repr(df) - expected = " A\n0 NA\n1 1" + expected = " A\n0 <NA>\n1 1" assert result == expected From 197efa97109b457087c9a67cc795036a919361ac Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 8 Jan 2020 14:40:17 -0600 Subject: [PATCH 2/8] na --- doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/_libs/parsers.pyx | 1 + pandas/core/internals/blocks.py | 9 +++------ 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index f6315ea894e62..344846651771e 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1153,7 +1153,7 @@ To completely override the default values that are recognized as missing, specif .. _io.navaluesconst: The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', -'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. +'n/a', 'NA', '<NA>', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. 
Let us consider some examples: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 78a8ba5cddea0..3c165a7e89cf6 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -575,6 +575,7 @@ Other API changes Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`). +- Added ``<NA>`` to the list of default NA values for :meth:`read_csv` (:issue:`30821`) .. _whatsnew_1000.api.documentation: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3df2362f41f0f..377d49f2bbd29 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1369,6 +1369,7 @@ STR_NA_VALUES = { "N/A", "n/a", "NA", + "<NA>", "#NA", "NULL", "null", diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5d240a3d7821f..45e68e37cb6e1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -664,6 +664,7 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): else: values = np.array(values, dtype="object") + breakpoint() values[mask] = na_rep return values @@ -1777,12 +1778,8 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): values = values[slicer] mask = isna(values) - try: - values[mask] = na_rep - except Exception: - # eg SparseArray does not support setitem, needs to be converted to ndarray - return super().to_native_types(slicer, na_rep, quoting, **kwargs) - values = values.astype(str) + values = np.asarray(values.astype(str)) + values[mask] = na_rep # we are expected to return a 2-d ndarray return values.reshape(1, len(values)) From 2b0ad63c81dea5adf9abbdf4ff88fd21b4a0781e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 8 Jan 2020 15:26:06 -0600 Subject: [PATCH 
3/8] fixup --- pandas/core/internals/blocks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 45e68e37cb6e1..a121906e6e676 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -664,7 +664,6 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): else: values = np.array(values, dtype="object") - breakpoint() values[mask] = na_rep return values From a148e45c9576a4d55c9f124dbec4cbec42378ff7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 8 Jan 2020 16:28:36 -0600 Subject: [PATCH 4/8] fixups --- pandas/tests/io/parser/test_na_values.py | 1 + pandas/tests/scalar/test_na_scalar.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index c9a0889cdd8b7..f9a083d7f5d22 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -89,6 +89,7 @@ def test_default_na_values(all_parsers): "N/A", "n/a", "NA", + "<NA>", "#NA", "NULL", "null", diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index a72378e02bec6..7d05511239ebc 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -16,8 +16,8 @@ def test_singleton(): def test_repr(): - assert repr(NA) == "NA" - assert str(NA) == "NA" + assert repr(NA) == "<NA>" + assert str(NA) == "<NA>" def test_truthiness(): From 40e8e9ef5a6264182d15901722a25baf4fa834bb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 8 Jan 2020 16:31:46 -0600 Subject: [PATCH 5/8] fixups --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a121906e6e676..f74033924f64e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1777,7 +1777,7 @@ def 
to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): values = values[slicer] mask = isna(values) - values = np.asarray(values.astype(str)) + values = np.asarray(values.astype(object)) values[mask] = na_rep # we are expected to return a 2-d ndarray From 24995d8ca5e9104240b14c4f1be18efb9fd0b816 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 9 Jan 2020 07:40:52 -0600 Subject: [PATCH 6/8] doctests --- ci/code_checks.sh | 7 +++++-- pandas/core/arrays/boolean.py | 6 +++--- pandas/core/arrays/integer.py | 6 +++--- pandas/core/arrays/string_.py | 4 ++-- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a90774d2e8ff1..ab09d0049089c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -298,8 +298,11 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range" RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests arrays/string_.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/arrays/string_.py + MSG='Doctests arrays; echo $MSG + pytest -q --doctest-modules \ + pandas/core/arrays/string_.py \ + pandas/core/arrays/integer.py \ + pandas/core/arrays/boolean.py RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests arrays/boolean.py' ; echo $MSG diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index c065fdeba2177..af6232fcc3367 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -244,7 +244,7 @@ class BooleanArray(BaseMaskedArray): >>> pd.array([True, False, None], dtype="boolean") - [True, False, NA] + [True, False, <NA>] Length: 3, dtype: boolean """ @@ -527,7 +527,7 @@ def any(self, skipna: bool = True, **kwargs): >>> pd.array([True, False, pd.NA]).any(skipna=False) True >>> pd.array([False, False, pd.NA]).any(skipna=False) - NA + <NA> """ kwargs.pop("axis", None) nv.validate_any((), kwargs) @@ -592,7 +592,7 @@ def all(self, skipna: bool = 
True, **kwargs): required (whether ``pd.NA`` is True or False influences the result): >>> pd.array([True, True, pd.NA]).all(skipna=False) - NA + <NA> >>> pd.array([True, False, pd.NA]).all(skipna=False) False """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 50062f09495aa..be93bb7363ed3 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -301,19 +301,19 @@ class IntegerArray(BaseMaskedArray): >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype()) >>> int_array - [1, NaN, 3] + [1, <NA>, 3] Length: 3, dtype: Int32 String aliases for the dtypes are also available. They are capitalized. >>> pd.array([1, None, 3], dtype='Int32') - [1, NaN, 3] + [1, <NA>, 3] Length: 3, dtype: Int32 >>> pd.array([1, None, 3], dtype='UInt16') - [1, NaN, 3] + [1, <NA>, 3] Length: 3, dtype: UInt16 """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0da877fb1ad45..b41b6d53e90f4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -131,7 +131,7 @@ class StringArray(PandasArray): -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") - ['This is', 'some text', NA, 'data.'] + ['This is', 'some text', <NA>, 'data.'] Length: 4, dtype: string Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string @@ -146,7 +146,7 @@ class StringArray(PandasArray): >>> pd.array(["a", None, "c"], dtype="string") == "a" - [True, NA, False] + [True, <NA>, False] Length: 3, dtype: boolean """ From 3c1e367892eabb5d3e54512507e5937eeb10a23c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 9 Jan 2020 08:07:46 -0600 Subject: [PATCH 7/8] fixup script --- ci/code_checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ab09d0049089c..83ceb11dfcbf4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -298,7 +298,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then -k"-from_arrays -from_breaks 
-from_intervals -from_tuples -set_closed -to_tuples -interval_range" RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests arrays; echo $MSG + MSG='Doctests arrays'; echo $MSG pytest -q --doctest-modules \ pandas/core/arrays/string_.py \ pandas/core/arrays/integer.py \ From c51c5d7304d82b511ca581c7712e6501f9900cb2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 9 Jan 2020 08:21:52 -0600 Subject: [PATCH 8/8] fixup --- pandas/tests/arrays/string_/test_string.py | 18 ++++++++++-------- pandas/tests/arrays/test_boolean.py | 12 ++++++++++++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ec7e35e5c6db4..2c61744ef953c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -9,14 +9,16 @@ import pandas._testing as tm -def test_repr_with_NA(): - a = pd.array(["a", pd.NA, "b"], dtype="string") - for obj in [a, pd.Series(a), pd.DataFrame({"a": a})]: - assert "NA" in repr(obj) and "NaN" not in repr(obj) - assert "NA" in str(obj) and "NaN" not in str(obj) - if hasattr(obj, "_repr_html_"): - html_repr = obj._repr_html_() - assert "NA" in html_repr and "NaN" not in html_repr +def test_repr(): + df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype="string")}) + expected = " A\n0 a\n1 <NA>\n2 b" + assert repr(df) == expected + + expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" + assert repr(df.A) == expected + + expected = "<StringArray>\n['a', <NA>, 'b']\nLength: 3, dtype: string" + assert repr(df.A.array) == expected def test_none_to_nan(): diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 9346c120172d6..fc1abce8c077a 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -251,6 +251,18 @@ def test_coerce_to_numpy_array(): np.array(arr, dtype="bool") +def test_repr(): + df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) + 
expected = " A\n0 True\n1 False\n2 <NA>" + assert repr(df) == expected + + expected = "0 True\n1 False\n2 <NA>\nName: A, dtype: boolean" + assert repr(df.A) == expected + + expected = "<BooleanArray>\n[True, False, <NA>]\nLength: 3, dtype: boolean" + assert repr(df.A.array) == expected + + @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) def test_to_numpy(box): con = pd.Series if box else pd.array