From 1ffe298cf3269c994742dd10c5a493abeacea000 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 7 Jan 2021 00:50:38 +0100 Subject: [PATCH 01/12] BUG: read_csv raising ValueError for true_values/false_values and boolean dtype --- pandas/io/parsers.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6e9cc18358153..b231a595e3d3a 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1691,9 +1691,11 @@ def _convert_to_ndarrays( else: is_ea = is_extension_array_dtype(cast_type) is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) - # skip inference if specified dtype is object - # or casting to an EA - try_num_bool = not (cast_type and is_str_or_ea_dtype) + # skip inference if specified dtype is object or casting to an EA, + # but cast if is_bool_dtype, categorical is handled later + try_num_bool = not (cast_type and is_str_or_ea_dtype) or ( + is_bool_dtype(cast_type) and not is_categorical_dtype(cast_type) + ) # general type inference and conversion cvals, na_count = self._infer_types( @@ -1812,11 +1814,15 @@ def _cast_types(self, values, cast_type, column): cast_type = pandas_dtype(cast_type) array_type = cast_type.construct_array_type() try: - return array_type._from_sequence_of_strings(values, dtype=cast_type) + if is_object_dtype(values.dtype): + return array_type._from_sequence_of_strings(values, dtype=cast_type) + else: + return array_type._from_sequence(values, dtype=cast_type) except NotImplementedError as err: raise NotImplementedError( f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings in order to be used in parser methods" + "_from_sequence_of_strings or _from_sequence in case of boolean " + "in order to be used in parser methods" ) from err else: From 44a9a555602ae255fd5269b83370e37fc92bd4ac Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 7 Jan 2021 00:50:50 +0100 Subject: [PATCH 02/12] Add tests --- 
.../io/parser/dtypes/test_dtypes_basic.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index ec1ccf009b8de..aee2531157d18 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -213,3 +213,21 @@ def decimal_number_check(parser, numeric_decimal, thousands, float_precision): ) val = df.iloc[0, 0] assert val == numeric_decimal[1] + + +def test_true_values_cast_to_bool(all_parsers): + # GH#34655 + text = """a,b +yes,xxx +no,yyy + """ + parser = all_parsers + result = parser.read_csv( + StringIO(text), + true_values=["yes"], + false_values=["no"], + dtype={"a": "boolean"}, + ) + expected = DataFrame({"a": [True, False], "b": ["xxx", "yyy"]}) + expected["a"] = expected["a"].astype("boolean") + tm.assert_frame_equal(result, expected) From 1428478025e4b85548f956ba23e9ad3d00c080ca Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 7 Jan 2021 00:51:02 +0100 Subject: [PATCH 03/12] Fix bug for c engine --- pandas/_libs/parsers.pyx | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 4995252d7aafd..7d5feddf21469 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1084,6 +1084,9 @@ cdef class TextReader: elif is_extension_array_dtype(dtype): result, na_count = self._string_convert(i, start, end, na_filter, na_hashset) + if is_bool_dtype(dtype): + _try_switch_true_false_values(result, self.true_values, + self.false_values) array_type = dtype.construct_array_type() try: # use _from_sequence_of_strings if the class defines it @@ -1859,6 +1862,21 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, return 0 +cdef _try_switch_true_false_values(ndarray[object] values, list true_values, + list false_values): + cdef: + int i, n = len(values) + object word + + for i in range(n): + 
word = values[i] + word = word.encode('utf-8') if isinstance(word, str) else word + if word in true_values: + values[i] = "1" + elif word in false_values: + values[i] = "0" + + cdef kh_str_starts_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: From 8404740a4df48691b41d91ef4b1937d5db3ba562 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 7 Jan 2021 00:51:10 +0100 Subject: [PATCH 04/12] Add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 886469837d184..6f6b6743d8289 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -284,6 +284,7 @@ I/O - Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`) - Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`) - :func:`pandas.read_excel` now respects :func:``pandas.set_option`` (:issue:`34252`) +- Bug in :func:`read_csv` not switching ``true_values`` and ``false_values`` for nullable ``boolean`` dtype (:issue:`34655`) - Bug in :func:``read_json`` when ``orient="split"`` does not maintan numeric string index (:issue:`28556`) Period From 0a892175088bddb5a04b3fe8bd46b5edae298ad2 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 7 Jan 2021 00:54:07 +0100 Subject: [PATCH 05/12] Fix function header --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 7d5feddf21469..90e247faab79f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1863,7 +1863,7 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, cdef _try_switch_true_false_values(ndarray[object] values, list true_values, - list false_values): + list false_values): cdef: int i, 
n = len(values) object word From 546754d0e6ae9838ad19a81d89da014fad267686 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 7 Jan 2021 22:00:57 +0100 Subject: [PATCH 06/12] Do switch in boolean --- pandas/_libs/parsers.pyx | 25 ++++--------------- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/boolean.py | 20 ++++++++++++--- pandas/core/arrays/floating.py | 2 +- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/period.py | 2 +- pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/string_arrow.py | 2 +- pandas/io/parsers.py | 23 +++++++++-------- .../io/parser/dtypes/test_dtypes_basic.py | 6 ++++- 10 files changed, 45 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 90e247faab79f..9d9e94ea471aa 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1084,14 +1084,14 @@ cdef class TextReader: elif is_extension_array_dtype(dtype): result, na_count = self._string_convert(i, start, end, na_filter, na_hashset) - if is_bool_dtype(dtype): - _try_switch_true_false_values(result, self.true_values, - self.false_values) + true_values = [x.decode() for x in self.true_values] + false_values = [x.decode() for x in self.false_values] array_type = dtype.construct_array_type() try: # use _from_sequence_of_strings if the class defines it - result = array_type._from_sequence_of_strings(result, - dtype=dtype) + result = array_type._from_sequence_of_strings( + result, dtype=dtype, + **{"true_values": true_values, "false_values": false_values}) except NotImplementedError: raise NotImplementedError( f"Extension Array: {array_type} must implement " @@ -1862,21 +1862,6 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, return 0 -cdef _try_switch_true_false_values(ndarray[object] values, list true_values, - list false_values): - cdef: - int i, n = len(values) - object word - - for i in range(n): - word = values[i] - word = word.encode('utf-8') if isinstance(word, str) else word - if word in 
true_values: - values[i] = "1" - elif word in false_values: - values[i] = "0" - - cdef kh_str_starts_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9a8b37e0785e0..10874f55511cf 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -212,7 +212,7 @@ def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy=False + cls, strings, *, dtype: Optional[Dtype] = None, copy=False, **kwargs ): """ Construct a new ExtensionArray from a sequence of strings. diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index bbbc0911b4846..43470819d12f0 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -282,14 +282,28 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings: List[str], *, dtype: Optional[Dtype] = None, copy: bool = False + cls, + strings: List[str], + *, + dtype: Optional[Dtype] = None, + copy: bool = False, + **kwargs, ) -> "BooleanArray": + true_values = kwargs.get("true_values") + false_values = kwargs.get("false_values") + true_values_union = {"True", "TRUE", "true", "1", "1.0"} + false_values_union = {"False", "FALSE", "false", "0", "0.0"} + if true_values is not None: + true_values_union.update(true_values) + if false_values is not None: + false_values_union.update(false_values) + def map_string(s): if isna(s): return s - elif s in ["True", "TRUE", "true", "1", "1.0"]: + elif s in true_values_union: return True - elif s in ["False", "FALSE", "false", "0", "0.0"]: + elif s in false_values_union: return False else: raise ValueError(f"{s} cannot be cast to bool") diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1ac23d7893fbf..b87d2a88b5427 100644 --- a/pandas/core/arrays/floating.py 
+++ b/pandas/core/arrays/floating.py @@ -250,7 +250,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype=None, copy: bool = False + cls, strings, *, dtype=None, copy: bool = False, **kwargs ) -> "FloatingArray": scalars = to_numeric(strings, errors="raise") return cls._from_sequence(scalars, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f8378fb7d1500..728648aee673f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -311,7 +311,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy: bool = False + cls, strings, *, dtype: Optional[Dtype] = None, copy: bool = False, **kwargs ) -> "IntegerArray": scalars = to_numeric(strings, errors="raise") return cls._from_sequence(scalars, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index edcc1f29a5ec2..7e56667de08d4 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -224,7 +224,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy=False + cls, strings, *, dtype: Optional[Dtype] = None, copy=False, **kwargs ) -> "PeriodArray": return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3d0ac3380ec39..21ad07d132c6b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -235,7 +235,7 @@ def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy=False + cls, strings, *, dtype: Optional[Dtype] = None, copy=False, **kwargs ): return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py 
index 274feb75e9452..1f415347e2e9b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -211,7 +211,7 @@ def _from_sequence(cls, scalars, dtype: Optional[Dtype] = None, copy=False): @classmethod def _from_sequence_of_strings( - cls, strings, dtype: Optional[Dtype] = None, copy=False + cls, strings, dtype: Optional[Dtype] = None, copy=False, **kwargs ): return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b231a595e3d3a..17ddfa289ecf7 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1691,11 +1691,9 @@ def _convert_to_ndarrays( else: is_ea = is_extension_array_dtype(cast_type) is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) - # skip inference if specified dtype is object or casting to an EA, - # but cast if is_bool_dtype, categorical is handled later - try_num_bool = not (cast_type and is_str_or_ea_dtype) or ( - is_bool_dtype(cast_type) and not is_categorical_dtype(cast_type) - ) + # skip inference if specified dtype is object + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) # general type inference and conversion cvals, na_count = self._infer_types( @@ -1814,15 +1812,18 @@ def _cast_types(self, values, cast_type, column): cast_type = pandas_dtype(cast_type) array_type = cast_type.construct_array_type() try: - if is_object_dtype(values.dtype): - return array_type._from_sequence_of_strings(values, dtype=cast_type) - else: - return array_type._from_sequence(values, dtype=cast_type) + return array_type._from_sequence_of_strings( + values, + dtype=cast_type, + **{ + "true_values": self.true_values, + "false_values": self.false_values, + }, + ) except NotImplementedError as err: raise NotImplementedError( f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings or _from_sequence in case of boolean " - "in order to be used in parser methods" + "_from_sequence_of_strings in order to be used in 
parser methods" ) from err else: diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index aee2531157d18..5ffd909d316bf 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -220,6 +220,8 @@ def test_true_values_cast_to_bool(all_parsers): text = """a,b yes,xxx no,yyy +1,zzz +0,aaa """ parser = all_parsers result = parser.read_csv( @@ -228,6 +230,8 @@ def test_true_values_cast_to_bool(all_parsers): false_values=["no"], dtype={"a": "boolean"}, ) - expected = DataFrame({"a": [True, False], "b": ["xxx", "yyy"]}) + expected = DataFrame( + {"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]} + ) expected["a"] = expected["a"].astype("boolean") tm.assert_frame_equal(result, expected) From f9ab895c981c6d9b8de831bfe9974a675513e1a5 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 7 Jan 2021 22:09:42 +0100 Subject: [PATCH 07/12] Change function signature --- pandas/core/arrays/boolean.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 43470819d12f0..b0ad9a9ad7664 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -287,10 +287,10 @@ def _from_sequence_of_strings( *, dtype: Optional[Dtype] = None, copy: bool = False, + true_values: Optional[List[str]] = None, + false_values: Optional[List[str]] = None, **kwargs, ) -> "BooleanArray": - true_values = kwargs.get("true_values") - false_values = kwargs.get("false_values") true_values_union = {"True", "TRUE", "true", "1", "1.0"} false_values_union = {"False", "FALSE", "false", "0", "0.0"} if true_values is not None: From a00807396d8c8e4ca9ac3a39afd27562eec57f07 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 8 Jan 2021 22:16:11 +0100 Subject: [PATCH 08/12] Make bool call explicit --- pandas/_libs/parsers.pyx | 14 +++++++++----- pandas/core/arrays/base.py | 2 +- 
pandas/core/arrays/boolean.py | 13 ++++++------- pandas/core/arrays/floating.py | 2 +- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/period.py | 2 +- pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/string_arrow.py | 2 +- pandas/io/parsers.py | 17 +++++++++-------- 9 files changed, 30 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 9d9e94ea471aa..a72a2ff8eaf28 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1084,14 +1084,18 @@ cdef class TextReader: elif is_extension_array_dtype(dtype): result, na_count = self._string_convert(i, start, end, na_filter, na_hashset) - true_values = [x.decode() for x in self.true_values] - false_values = [x.decode() for x in self.false_values] + array_type = dtype.construct_array_type() try: # use _from_sequence_of_strings if the class defines it - result = array_type._from_sequence_of_strings( - result, dtype=dtype, - **{"true_values": true_values, "false_values": false_values}) + if is_bool_dtype(dtype): + true_values = [x.decode() for x in self.true_values] + false_values = [x.decode() for x in self.false_values] + result = array_type._from_sequence_of_strings( + result, dtype=dtype, true_values=true_values, + false_values=false_values) + else: + result = array_type._from_sequence_of_strings(result, dtype=dtype) except NotImplementedError: raise NotImplementedError( f"Extension Array: {array_type} must implement " diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 10874f55511cf..9a8b37e0785e0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -212,7 +212,7 @@ def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy=False, **kwargs + cls, strings, *, dtype: Optional[Dtype] = None, copy=False ): """ Construct a new ExtensionArray from a sequence of strings. 
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index b0ad9a9ad7664..b73345572754d 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -257,6 +257,8 @@ class BooleanArray(BaseMaskedArray): # The value used to fill '_data' to avoid upcasting _internal_fill_value = False + TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"} + FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): @@ -289,21 +291,18 @@ def _from_sequence_of_strings( copy: bool = False, true_values: Optional[List[str]] = None, false_values: Optional[List[str]] = None, - **kwargs, ) -> "BooleanArray": - true_values_union = {"True", "TRUE", "true", "1", "1.0"} - false_values_union = {"False", "FALSE", "false", "0", "0.0"} if true_values is not None: - true_values_union.update(true_values) + cls.TRUE_VALUES.update(true_values) if false_values is not None: - false_values_union.update(false_values) + cls.FALSE_VALUES.update(false_values) def map_string(s): if isna(s): return s - elif s in true_values_union: + elif s in cls.TRUE_VALUES: return True - elif s in false_values_union: + elif s in cls.FALSE_VALUES: return False else: raise ValueError(f"{s} cannot be cast to bool") diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index b87d2a88b5427..1ac23d7893fbf 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -250,7 +250,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype=None, copy: bool = False, **kwargs + cls, strings, *, dtype=None, copy: bool = False ) -> "FloatingArray": scalars = to_numeric(strings, errors="raise") return cls._from_sequence(scalars, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 728648aee673f..f8378fb7d1500 100644 --- 
a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -311,7 +311,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy: bool = False, **kwargs + cls, strings, *, dtype: Optional[Dtype] = None, copy: bool = False ) -> "IntegerArray": scalars = to_numeric(strings, errors="raise") return cls._from_sequence(scalars, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 7e56667de08d4..edcc1f29a5ec2 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -224,7 +224,7 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy=False, **kwargs + cls, strings, *, dtype: Optional[Dtype] = None, copy=False ) -> "PeriodArray": return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 21ad07d132c6b..3d0ac3380ec39 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -235,7 +235,7 @@ def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: Optional[Dtype] = None, copy=False, **kwargs + cls, strings, *, dtype: Optional[Dtype] = None, copy=False ): return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1f415347e2e9b..274feb75e9452 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -211,7 +211,7 @@ def _from_sequence(cls, scalars, dtype: Optional[Dtype] = None, copy=False): @classmethod def _from_sequence_of_strings( - cls, strings, dtype: Optional[Dtype] = None, copy=False, **kwargs + cls, strings, dtype: Optional[Dtype] = None, copy=False ): return cls._from_sequence(strings, dtype=dtype, copy=copy) diff --git 
a/pandas/io/parsers.py b/pandas/io/parsers.py index 17ddfa289ecf7..ca817be5d2ff6 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1812,14 +1812,15 @@ def _cast_types(self, values, cast_type, column): cast_type = pandas_dtype(cast_type) array_type = cast_type.construct_array_type() try: - return array_type._from_sequence_of_strings( - values, - dtype=cast_type, - **{ - "true_values": self.true_values, - "false_values": self.false_values, - }, - ) + if is_bool_dtype(cast_type): + return array_type._from_sequence_of_strings( + values, + dtype=cast_type, + true_values=self.true_values, + false_values=self.false_values, + ) + else: + return array_type._from_sequence_of_strings(values, dtype=cast_type) except NotImplementedError as err: raise NotImplementedError( f"Extension Array: {array_type} must implement " From 114883348f0263b87e178731228d045bb3e80dc4 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 8 Jan 2021 22:33:49 +0100 Subject: [PATCH 09/12] Do not update in place --- pandas/core/arrays/boolean.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index b73345572754d..f1e0ec394a68b 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -293,16 +293,20 @@ def _from_sequence_of_strings( false_values: Optional[List[str]] = None, ) -> "BooleanArray": if true_values is not None: - cls.TRUE_VALUES.update(true_values) + true_values = cls.TRUE_VALUES.union(true_values) + else: + true_values = cls.TRUE_VALUES if false_values is not None: - cls.FALSE_VALUES.update(false_values) + false_values = cls.FALSE_VALUES.union(false_values) + else: + false_values = cls.FALSE_VALUES def map_string(s): if isna(s): return s - elif s in cls.TRUE_VALUES: + elif s in true_values: return True - elif s in cls.FALSE_VALUES: + elif s in false_values: return False else: raise ValueError(f"{s} cannot be cast to bool") From 
aaf2977b466cd7aa11b3f0b3f3c76b574605d3eb Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 8 Jan 2021 22:35:25 +0100 Subject: [PATCH 10/12] Make private --- pandas/core/arrays/boolean.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index f1e0ec394a68b..fbce256576ee5 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -257,8 +257,8 @@ class BooleanArray(BaseMaskedArray): # The value used to fill '_data' to avoid upcasting _internal_fill_value = False - TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"} - FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} + _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"} + _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): @@ -293,13 +293,13 @@ def _from_sequence_of_strings( false_values: Optional[List[str]] = None, ) -> "BooleanArray": if true_values is not None: - true_values = cls.TRUE_VALUES.union(true_values) + true_values = cls._TRUE_VALUES.union(true_values) else: - true_values = cls.TRUE_VALUES + true_values = cls._TRUE_VALUES if false_values is not None: - false_values = cls.FALSE_VALUES.union(false_values) + false_values = cls._FALSE_VALUES.union(false_values) else: - false_values = cls.FALSE_VALUES + false_values = cls._FALSE_VALUES def map_string(s): if isna(s): From be7b3b8b8246aeb4a578d0d738cedbe990aa1822 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 8 Jan 2021 23:32:28 +0100 Subject: [PATCH 11/12] Fix mypy issues --- pandas/core/arrays/boolean.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index fbce256576ee5..b7e55fe3a3852 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -293,20 +293,20 @@ def 
_from_sequence_of_strings( false_values: Optional[List[str]] = None, ) -> "BooleanArray": if true_values is not None: - true_values = cls._TRUE_VALUES.union(true_values) + true_values_union = cls._TRUE_VALUES.union(true_values) else: - true_values = cls._TRUE_VALUES + true_values_union = cls._TRUE_VALUES if false_values is not None: - false_values = cls._FALSE_VALUES.union(false_values) + false_values_union = cls._FALSE_VALUES.union(false_values) else: - false_values = cls._FALSE_VALUES + false_values_union = cls._FALSE_VALUES def map_string(s): if isna(s): return s - elif s in true_values: + elif s in true_values_union: return True - elif s in false_values: + elif s in false_values_union: return False else: raise ValueError(f"{s} cannot be cast to bool") From 4df4ee69e6e8be4dae60eca710b1273f0af73e82 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 8 Jan 2021 23:47:52 +0100 Subject: [PATCH 12/12] Simplify code --- pandas/core/arrays/boolean.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index b7e55fe3a3852..2bc908186f7f4 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -292,14 +292,8 @@ def _from_sequence_of_strings( true_values: Optional[List[str]] = None, false_values: Optional[List[str]] = None, ) -> "BooleanArray": - if true_values is not None: - true_values_union = cls._TRUE_VALUES.union(true_values) - else: - true_values_union = cls._TRUE_VALUES - if false_values is not None: - false_values_union = cls._FALSE_VALUES.union(false_values) - else: - false_values_union = cls._FALSE_VALUES + true_values_union = cls._TRUE_VALUES.union(true_values or []) + false_values_union = cls._FALSE_VALUES.union(false_values or []) def map_string(s): if isna(s):