From 8cdbcdbc2b643c1fc1c292ceb18d26b7b03b776c Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 28 Nov 2018 00:02:11 -0800 Subject: [PATCH 1/9] BUG-20591 read_csv raises ValueError for bool columns with missing values --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/_libs/parsers.pyx | 4 ++++ pandas/tests/io/parser/test_na_values.py | 9 +++++++++ 3 files changed, 14 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index c74bcb505b6be..7cc2ff3ad0483 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -330,6 +330,7 @@ Backwards incompatible API changes - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) +- :meth:`read_csv` will now throw a ``ValueError`` if a column with missing values is declared as having ``dtype`` ``bool`` (:issue:`20591`) .. _whatsnew_0240.api_breaking.deps: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1dc71264c94dd..a459057555cf3 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1245,6 +1245,10 @@ cdef class TextReader: result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, self.true_set, self.false_set) + if user_dtype and na_count is not None: + if na_count > 0: + raise ValueError("Bool column has NA values in " + "column {column}".format(column=i)) return result, na_count elif dtype.kind == 'S': diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 921984bc44e50..b10402d88dac1 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -421,3 +421,12 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str) tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail +def test_cast_NA_to_bool_raises_error(all_parsers): + parser = all_parsers + data = "false,1\n,1\ntrue," + + parser.read_csv(StringIO(data), header=None, names=['a', 'b'], + dtype={'a': 'bool'}) From c4282878d93f1ba03288633da332f558e1a65d51 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 28 Nov 2018 11:42:27 -0800 Subject: [PATCH 2/9] BUG-20591 specify error in test --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/tests/io/parser/test_na_values.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 7cc2ff3ad0483..10dc1ed58e449 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -330,7 +330,7 @@ Backwards incompatible API changes - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) -- :meth:`read_csv` will now throw a ``ValueError`` if a column with missing values is declared as having ``dtype`` ``bool`` (:issue:`20591`) +- :meth:`read_csv` with C engine will now throw a ``ValueError`` if a column with missing values is declared as having ``dtype`` ``bool`` (:issue:`20591`) .. _whatsnew_0240.api_breaking.deps: diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index b10402d88dac1..d10d0479edd06 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -423,10 +423,10 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail -def test_cast_NA_to_bool_raises_error(all_parsers): - parser = all_parsers +def test_cast_NA_to_bool_raises_error(c_parser_only): + parser = c_parser_only data = "false,1\n,1\ntrue," - - parser.read_csv(StringIO(data), header=None, names=['a', 'b'], - dtype={'a': 'bool'}) + msg = "Bool column has NA values in column 0" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=None, names=['a', 'b'], + dtype={'a': 'bool'}) From 0e9f23dc25aae3b54c2ae524e667ca537060ff16 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 28 Nov 2018 15:11:21 -0800 Subject: [PATCH 3/9] BUG-20591 modify python parser as well --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/io/parsers.py | 20 +++++++++++++++++--- pandas/tests/io/parser/test_na_values.py | 6 +++--- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 10dc1ed58e449..7ec6a7d371366 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -330,7 +330,7 @@ Backwards incompatible API changes - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) -- :meth:`read_csv` with C engine will now throw a ``ValueError`` if a column with missing values is declared as having ``dtype`` ``bool`` (:issue:`20591`) +- :meth:`read_csv` will now throw a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) .. _whatsnew_0240.api_breaking.deps: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7ade35f93a858..dec74c55bec10 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -27,9 +27,9 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( - ensure_object, is_categorical_dtype, is_dtype_equal, is_float, is_integer, - is_integer_dtype, is_list_like, is_object_dtype, is_scalar, - is_string_dtype) + ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal, + is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype, + is_scalar, is_string_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna @@ -2435,6 +2435,20 @@ def _clean_mapping(mapping): clean_na_values = self.na_values clean_na_fvalues = self.na_fvalues + try: + if isinstance(clean_dtypes, dict): + for col, dt in clean_dtypes.items(): + if is_bool_dtype(dt) and data[col][data[col] == ''].size: + raise ValueError("Bool column has NA values in " + "column {column}".format(column=col)) + elif isinstance(clean_dtypes, string_types): + for col, values in data.items(): + if any(isna(values)): + raise ValueError("Bool column has NA values in " + "column {column}".format(column=col)) + except (AttributeError, TypeError): # invalid input to is_bool_dtype + pass + return self._convert_to_ndarrays(data, clean_na_values, clean_na_fvalues, self.verbose, clean_conv, clean_dtypes) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index d10d0479edd06..04a4311df1476 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -423,10 +423,10 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -def test_cast_NA_to_bool_raises_error(c_parser_only): - parser = c_parser_only +def test_cast_NA_to_bool_raises_error(all_parsers): + parser = all_parsers data = "false,1\n,1\ntrue," - msg = "Bool column has NA values in column 0" + msg = "Bool column has NA values in column [0a]" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=None, names=['a', 'b'], dtype={'a': 'bool'}) From 0d67f22986dd74af59db612714fef06b92b61db0 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 28 Nov 2018 17:02:02 -0800 Subject: [PATCH 4/9] fix typo --- pandas/io/parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dec74c55bec10..7a2ca474bd824 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2441,7 +2441,8 @@ def _clean_mapping(mapping): if is_bool_dtype(dt) and data[col][data[col] == ''].size: raise ValueError("Bool column has NA values in " "column {column}".format(column=col)) - elif isinstance(clean_dtypes, string_types): + elif (isinstance(clean_dtypes, string_types) and + is_bool_dtype(clean_dtypes)): for col, values in data.items(): if any(isna(values)): raise ValueError("Bool column has NA values in " From b4a1780cbfb268424fef523cca1621591f667e38 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 28 Nov 2018 23:08:08 -0800 Subject: [PATCH 5/9] BUG-20591 move logic to _convert_to_ndarrays --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/io/parsers.py | 25 ++++++++++-------------- pandas/tests/io/parser/test_na_values.py | 12 +++++++++--- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 7ec6a7d371366..62f036f4fcf0f 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -330,7 +330,7 @@ Backwards incompatible API changes - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) -- :meth:`read_csv` will now throw a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) +- :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) .. _whatsnew_0240.api_breaking.deps: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7a2ca474bd824..5e8a15184075a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1669,6 +1669,16 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, # type specified in dtype param if cast_type and not is_dtype_equal(cvals, cast_type): + try: + if (is_bool_dtype(cast_type) and + not is_categorical_dtype(cast_type) + and set(values) - set(col_na_values)): + raise ValueError("Bool column has NA values in " + "column {column}" + .format(column=c)) + except (AttributeError, TypeError): + # invalid input to is_bool_dtype + pass cvals = self._cast_types(cvals, cast_type, c) result[c] = cvals @@ -2435,21 +2445,6 @@ def _clean_mapping(mapping): clean_na_values = self.na_values clean_na_fvalues = self.na_fvalues - try: - if isinstance(clean_dtypes, dict): - for col, dt in clean_dtypes.items(): - if is_bool_dtype(dt) and data[col][data[col] == ''].size: - raise ValueError("Bool column has NA values in " - "column {column}".format(column=col)) - elif (isinstance(clean_dtypes, string_types) and - is_bool_dtype(clean_dtypes)): - for col, values in data.items(): - if any(isna(values)): - raise ValueError("Bool column has NA values in " - "column {column}".format(column=col)) - except (AttributeError, TypeError): # invalid input to is_bool_dtype - pass - return self._convert_to_ndarrays(data, clean_na_values, clean_na_fvalues, self.verbose, clean_conv, clean_dtypes) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 04a4311df1476..dcb48cca64a72 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -423,10 +423,16 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -def test_cast_NA_to_bool_raises_error(all_parsers): +@pytest.mark.parametrize("data", [ + "false,1\n,1\ntrue,", + "false,1\nnull,1\ntrue,", + "false,1\nnan,1\ntrue,", +]) +def test_cast_NA_to_bool_raises_error(all_parsers, data): parser = all_parsers - data = "false,1\n,1\ntrue," - msg = "Bool column has NA values in column [0a]" + msg = ("(Bool column has NA values in column [0a])|" + "(cannot safely convert passed user dtype of " + " bool for object dtyped data in column 0)") with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=None, names=['a', 'b'], dtype={'a': 'bool'}) From b9a0f134f67f5cf50e08b08bf62060d7b890998d Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 28 Nov 2018 23:14:05 -0800 Subject: [PATCH 6/9] fix lint --- pandas/tests/io/parser/test_na_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index dcb48cca64a72..7186bb7ea7be4 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -431,8 +431,8 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): def test_cast_NA_to_bool_raises_error(all_parsers, data): parser = all_parsers msg = ("(Bool column has NA values in column [0a])|" - "(cannot safely convert passed user dtype of " - " bool for object dtyped data in column 0)") + "(cannot safely convert passed user dtype of " + " bool for object dtyped data in column 0)") with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=None, names=['a', 'b'], dtype={'a': 'bool'}) From 92b0cc3fc57e1d47b70626277e6de08d8cf78dec Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 29 Nov 2018 01:33:02 -0800 Subject: [PATCH 7/9] BUG-20591 test custom na values --- pandas/io/parsers.py | 2 +- pandas/tests/io/parser/test_na_values.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5e8a15184075a..aadca1fcb3bef 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1672,7 +1672,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try: if (is_bool_dtype(cast_type) and not is_categorical_dtype(cast_type) - and set(values) - set(col_na_values)): + and na_count > 0): raise ValueError("Bool column has NA values in " "column {column}" .format(column=c)) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 7186bb7ea7be4..1b6d2ee8a062e 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -423,16 +423,19 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data", [ - "false,1\n,1\ntrue,", - "false,1\nnull,1\ntrue,", - "false,1\nnan,1\ntrue,", +@pytest.mark.parametrize("data, na_values", [ + ("false,1\n,1\ntrue", None), + ("false,1\nnull,1\ntrue", None), + ("false,1\nnan,1\ntrue", None), + ("false,1\nfoo,1\ntrue", 'foo'), + ("false,1\nfoo,1\ntrue", ['foo']), + ("false,1\nfoo,1\ntrue", {'a': 'foo'}), ]) -def test_cast_NA_to_bool_raises_error(all_parsers, data): +def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): parser = all_parsers msg = ("(Bool column has NA values in column [0a])|" "(cannot safely convert passed user dtype of " - " bool for object dtyped data in column 0)") + "bool for object dtyped data in column 0)") with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=None, names=['a', 'b'], - dtype={'a': 'bool'}) + dtype={'a': 'bool'}, na_values=na_values) From b038137457bb9a0e20267a98bc23010f6a53a5b6 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 29 Nov 2018 01:35:14 -0800 Subject: [PATCH 8/9] fix typo --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index aadca1fcb3bef..55e39b4a7762a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1672,7 +1672,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try: if (is_bool_dtype(cast_type) and not is_categorical_dtype(cast_type) - and na_count > 0): + and na_count): raise ValueError("Bool column has NA values in " "column {column}" .format(column=c)) From b8bca36163feafda187e464bc044472adb0b90fc Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 29 Nov 2018 16:06:10 -0800 Subject: [PATCH 9/9] BUG-20591 fix na_count --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 55e39b4a7762a..aadca1fcb3bef 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1672,7 +1672,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try: if (is_bool_dtype(cast_type) and not is_categorical_dtype(cast_type) - and na_count): + and na_count > 0): raise ValueError("Bool column has NA values in " "column {column}" .format(column=c))