From d1c0b5161bf694b6be996ba290a047b8770762d8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 21 Sep 2022 21:13:44 +0200 Subject: [PATCH 1/6] ENH: Add option to use nullable dtypes in read_csv --- pandas/_libs/parsers.pyx | 9 ++++--- pandas/io/parsers/base_parser.py | 46 +++++++++++++++++++++++++------- pandas/io/parsers/readers.py | 17 ++++++++++++ 3 files changed, 60 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 07bf7f69ec907..6657cf6b8b46d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -342,6 +342,7 @@ cdef class TextReader: object index_col object skiprows object dtype + bint use_nullable_dtypes object usecols set unnamed_cols # set[str] @@ -380,7 +381,8 @@ cdef class TextReader: bint mangle_dupe_cols=True, float_precision=None, bint skip_blank_lines=True, - encoding_errors=b"strict"): + encoding_errors=b"strict", + use_nullable_dtypes=False): # set encoding for native Python and C library if isinstance(encoding_errors, str): @@ -505,6 +507,7 @@ cdef class TextReader: # - DtypeObj # - dict[Any, DtypeObj] self.dtype = dtype + self.use_nullable_dtypes = use_nullable_dtypes # XXX self.noconvert = set() @@ -1053,8 +1056,8 @@ cdef class TextReader: self._free_na_set(na_hashset) # don't try to upcast EAs - if na_count > 0 and not is_extension_array_dtype(col_dtype): - col_res = _maybe_upcast(col_res) + if na_count > 0 and not is_extension_array_dtype(col_dtype) or self.use_nullable_dtypes: + col_res = _maybe_upcast(col_res, use_nullable_dtypes=self.use_nullable_dtypes) if col_res is None: raise ParserError(f'Unable to parse column {i}') diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index f90a0549a4320..f4a395fa4cc98 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -50,6 +50,7 @@ is_dict_like, is_dtype_equal, is_extension_array_dtype, + is_float_dtype, is_integer, is_integer_dtype, is_list_like, @@ -61,8 +62,14 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna +from pandas import StringDtype from pandas.core import algorithms -from pandas.core.arrays import Categorical +from pandas.core.arrays import ( + BooleanArray, + Categorical, + FloatingArray, + IntegerArray, +) from pandas.core.indexes.api import ( Index, MultiIndex, @@ -110,6 +117,7 @@ def __init__(self, kwds) -> None: self.dtype = copy(kwds.get("dtype", None)) self.converters = kwds.get("converters") + self.use_nullable_dtypes = kwds.get("use_nullable_dtypes") self.true_values = kwds.get("true_values") self.false_values = kwds.get("false_values") @@ -589,10 +597,7 @@ def _convert_to_ndarrays( ) # type specified in dtype param or cast_type is an EA - if cast_type and ( - not is_dtype_equal(cvals, cast_type) - or is_extension_array_dtype(cast_type) - ): + if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea): if not is_ea and na_count > 0: try: if is_bool_dtype(cast_type): @@ -710,14 +715,36 @@ def _infer_types(self, values, na_values, try_num_bool: bool = True): if try_num_bool and is_object_dtype(values.dtype): # exclude e.g DatetimeIndex here try: - result, _ = lib.maybe_convert_numeric(values, na_values, False) + result, result_mask = lib.maybe_convert_numeric( + values, + na_values, + False, + convert_to_masked_nullable=self.use_nullable_dtypes, + ) except (ValueError, TypeError): # e.g. encountering datetime string gets ValueError # TypeError can be raised in floatify - result = values - na_count = parsers.sanitize_objects(result, na_values) + na_count = parsers.sanitize_objects(values, na_values) + + if self.use_nullable_dtypes: + result = StringDtype().construct_array_type()._from_sequence(values) + else: + result = values else: - na_count = isna(result).sum() + if self.use_nullable_dtypes: + if result_mask is None: + result_mask = np.zeros(result.shape, dtype="bool") + + if is_integer_dtype(result): + result = IntegerArray(result, result_mask) + elif is_bool_dtype(result): + result = BooleanArray(result, result_mask) + elif is_float_dtype(result): + result = FloatingArray(result, result_mask) + + na_count = result_mask.sum() + else: + na_count = isna(result).sum() else: result = values if values.dtype == np.object_: @@ -1146,6 +1173,7 @@ def converter(*date_cols): "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR, "error_bad_lines": None, "warn_bad_lines": None, + "use_nullable_dtypes": False, } diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index eaec4c6bd5991..9f0bc153e56d5 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -427,6 +427,13 @@ .. versionadded:: 1.2 +use_nullable_dtypes: bool = False + Whether or not to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + + .. versionadded:: 2.0 + Returns ------- DataFrame or TextFileReader @@ -669,6 +676,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> TextFileReader: ... @@ -729,6 +737,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> TextFileReader: ... @@ -789,6 +798,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> DataFrame: ... @@ -849,6 +859,7 @@ def read_csv( memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> DataFrame | TextFileReader: ... @@ -928,6 +939,7 @@ def read_csv( memory_map: bool = False, float_precision: Literal["high", "legacy"] | None = None, storage_options: StorageOptions = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | TextFileReader: # locals() should never be modified kwds = locals().copy() @@ -1008,6 +1020,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> TextFileReader: ... @@ -1068,6 +1081,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> TextFileReader: ... @@ -1128,6 +1142,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> DataFrame: ... @@ -1188,6 +1203,7 @@ def read_table( memory_map: bool = ..., float_precision: str | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> DataFrame | TextFileReader: ... @@ -1267,6 +1283,7 @@ def read_table( memory_map: bool = False, float_precision: str | None = None, storage_options: StorageOptions = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | TextFileReader: # locals() should never be modified kwds = locals().copy() From d7a7eca6c1a8f541e9fcba1b2ec93a5a05809f1d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 25 Sep 2022 13:38:21 -0700 Subject: [PATCH 2/6] Finish implementation --- doc/source/whatsnew/v1.6.0.rst | 1 + pandas/_libs/parsers.pyx | 12 +++++- pandas/io/parsers/base_parser.py | 35 ++++++++++------ .../io/parser/dtypes/test_dtypes_basic.py | 42 +++++++++++++++++++ 4 files changed, 75 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index ae062ca30a9fa..ec2919e7caf8a 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -32,6 +32,7 @@ Other enhancements - :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`) - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`) - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`) +- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` to enable automatic conversion to nullable dtypes (:issue:`36712`) - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`) - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`) - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6657cf6b8b46d..694858c3322c2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -936,6 +936,7 @@ cdef class TextReader: bint na_filter = 0 int64_t num_cols dict result + bint use_nullable_dtypes start = self.parser_start @@ -1056,8 +1057,15 @@ cdef class TextReader: self._free_na_set(na_hashset) # don't try to upcast EAs - if na_count > 0 and not is_extension_array_dtype(col_dtype) or self.use_nullable_dtypes: - col_res = _maybe_upcast(col_res, use_nullable_dtypes=self.use_nullable_dtypes) + print(col_dtype) + if ( + na_count > 0 and not is_extension_array_dtype(col_dtype) + or self.use_nullable_dtypes + ): + use_nullable_dtypes = self.use_nullable_dtypes and col_dtype is None + col_res = _maybe_upcast( + col_res, use_nullable_dtypes=use_nullable_dtypes + ) if col_res is None: raise ParserError(f'Unable to parse column {i}') diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index f4a395fa4cc98..3c16b3be07bc1 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -117,7 +117,7 @@ def __init__(self, kwds) -> None: self.dtype = copy(kwds.get("dtype", None)) self.converters = kwds.get("converters") - self.use_nullable_dtypes = kwds.get("use_nullable_dtypes") + self.use_nullable_dtypes = kwds.get("use_nullable_dtypes", False) self.true_values = kwds.get("true_values") self.false_values = kwds.get("false_values") @@ -516,7 +516,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: ) arr, _ = self._infer_types( - arr, col_na_values | col_na_fvalues, try_num_bool + arr, col_na_values | col_na_fvalues, cast_type, try_num_bool ) arrays.append(arr) @@ -582,7 +582,10 @@ def _convert_to_ndarrays( values = lib.map_infer_mask(values, conv_f, mask) cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, try_num_bool=False + values, + set(col_na_values) | col_na_fvalues, + cast_type, + try_num_bool=False, ) else: is_ea = is_extension_array_dtype(cast_type) @@ -593,7 +596,7 @@ def _convert_to_ndarrays( # general type inference and conversion cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, try_num_bool + values, set(col_na_values) | col_na_fvalues, cast_type, try_num_bool ) # type specified in dtype param or cast_type is an EA @@ -684,7 +687,7 @@ def _set(x) -> int: return noconvert_columns - def _infer_types(self, values, na_values, try_num_bool: bool = True): + def _infer_types(self, values, na_values, cast_type, try_num_bool: bool = True): """ Infer types of values, possibly casting @@ -692,6 +695,7 @@ def _infer_types(self, values, na_values, try_num_bool: bool = True): ---------- values : ndarray na_values : set + cast_type: Specifies if we want to cast explicitly try_num_bool : bool, default try try to cast values to numeric (first preference) or boolean @@ -712,6 +716,8 @@ def _infer_types(self, values, na_values, try_num_bool: bool = True): np.putmask(values, mask, np.nan) return values, na_count + use_nullable_dtypes = self.use_nullable_dtypes and cast_type is None + if try_num_bool and is_object_dtype(values.dtype): # exclude e.g DatetimeIndex here try: @@ -719,19 +725,15 @@ def _infer_types(self, values, na_values, try_num_bool: bool = True): values, na_values, False, - convert_to_masked_nullable=self.use_nullable_dtypes, + convert_to_masked_nullable=use_nullable_dtypes, ) except (ValueError, TypeError): # e.g. encountering datetime string gets ValueError # TypeError can be raised in floatify na_count = parsers.sanitize_objects(values, na_values) - - if self.use_nullable_dtypes: - result = StringDtype().construct_array_type()._from_sequence(values) - else: - result = values + result = values else: - if self.use_nullable_dtypes: + if use_nullable_dtypes: if result_mask is None: result_mask = np.zeros(result.shape, dtype="bool") @@ -751,11 +753,18 @@ def _infer_types(self, values, na_values, try_num_bool: bool = True): na_count = parsers.sanitize_objects(values, na_values) if result.dtype == np.object_ and try_num_bool: - result, _ = libops.maybe_convert_bool( + result, mask = libops.maybe_convert_bool( np.asarray(values), true_values=self.true_values, false_values=self.false_values, + convert_to_masked_nullable=use_nullable_dtypes, ) + if result.dtype == np.bool_ and use_nullable_dtypes: + if mask is None: + mask = np.zeros(result.shape, dtype=np.bool_) + result = BooleanArray(result, mask) + elif result.dtype == np.object_ and use_nullable_dtypes: + result = StringDtype().construct_array_type()._from_sequence(values) return result, na_count diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 2c18d461cddf8..1312308bcaa71 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -385,3 +385,45 @@ def test_dtypes_defaultdict_invalid(all_parsers): parser = all_parsers with pytest.raises(TypeError, match="not understood"): parser.read_csv(StringIO(data), dtype=dtype) + + +def test_use_nullabla_dtypes(all_parsers): + # GH#36712 + + parser = all_parsers + + data = """a,b,c,d,e,f,g,h,i +1,2.5,True,a,,,,,12-31-2019 +3,4.5,False,b,6,7.5,True,a,12-31-2019 +""" + result = parser.read_csv( + StringIO(data), use_nullable_dtypes=True, parse_dates=["i"] + ) + expected = DataFrame( + { + "a": pd.Series([1, 3], dtype="Int64"), + "b": pd.Series([2.5, 4.5], dtype="Float64"), + "c": pd.Series([True, False], dtype="boolean"), + "d": pd.Series(["a", "b"], dtype="string"), + "e": pd.Series([pd.NA, 6], dtype="Int64"), + "f": pd.Series([pd.NA, 7.5], dtype="Float64"), + "g": pd.Series([pd.NA, True], dtype="boolean"), + "h": pd.Series([pd.NA, "a"], dtype="string"), + "i": pd.Series([Timestamp("2019-12-31")] * 2), + } + ) + tm.assert_frame_equal(result, expected) + + +def test_use_nullabla_dtypes_and_dtype(all_parsers): + # GH#36712 + + parser = all_parsers + + data = """a,b +1,2.5 +, +""" + result = parser.read_csv(StringIO(data), use_nullable_dtypes=True, dtype="float64") + expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]}) + tm.assert_frame_equal(result, expected) From 4f05540ed53303156a3a9aaed2c28244ac7b0254 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 25 Sep 2022 13:45:02 -0700 Subject: [PATCH 3/6] Update --- doc/source/user_guide/io.rst | 8 ++++++++ pandas/_libs/parsers.pyx | 1 - pandas/io/parsers/base_parser.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 15b3b894c68b6..543489194535d 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -197,6 +197,14 @@ dtype : Type name or dict of column -> type, default ``None`` Support for defaultdict was added. Specify a defaultdict as input where the default determines the dtype of the columns which are not explicitly listed. + +use_nullable_dtypes: bool = False + Whether or not to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + + .. versionadded:: 2.0 + engine : {``'c'``, ``'python'``, ``'pyarrow'``} Parser engine to use. The C and pyarrow engines are faster, while the python engine is currently more feature-complete. Multithreading is currently only supported by diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 694858c3322c2..b77c141f39502 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1057,7 +1057,6 @@ cdef class TextReader: self._free_na_set(na_hashset) # don't try to upcast EAs - print(col_dtype) if ( na_count > 0 and not is_extension_array_dtype(col_dtype) or self.use_nullable_dtypes diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 3c16b3be07bc1..ffeafd6289999 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -735,7 +735,7 @@ def _infer_types(self, values, na_values, cast_type, try_num_bool: bool = True): else: if use_nullable_dtypes: if result_mask is None: - result_mask = np.zeros(result.shape, dtype="bool") + result_mask = np.zeros(result.shape, dtype=np.bool_) if is_integer_dtype(result): result = IntegerArray(result, result_mask) From af6056bd660154935767aded2047918b5812738c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 29 Sep 2022 10:11:43 +0200 Subject: [PATCH 4/6] Fix mypy --- doc/source/user_guide/io.rst | 2 +- pandas/io/parsers/base_parser.py | 14 +++++++++----- pandas/io/parsers/readers.py | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 543489194535d..1552f2a8d257b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -198,7 +198,7 @@ dtype : Type name or dict of column -> type, default ``None`` the default determines the dtype of the columns which are not explicitly listed. -use_nullable_dtypes: bool = False +use_nullable_dtypes : bool = False Whether or not to use nullable dtypes as default when reading data. If set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index ffeafd6289999..fdf806d883151 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -15,6 +15,7 @@ Hashable, Iterable, List, + Literal, Mapping, Sequence, Tuple, @@ -716,7 +717,10 @@ def _infer_types(self, values, na_values, cast_type, try_num_bool: bool = True): np.putmask(values, mask, np.nan) return values, na_count - use_nullable_dtypes = self.use_nullable_dtypes and cast_type is None + use_nullable_dtypes: Literal[True] | Literal[False] = ( + self.use_nullable_dtypes and cast_type is None + ) + result: ArrayLike if try_num_bool and is_object_dtype(values.dtype): # exclude e.g DatetimeIndex here @@ -753,16 +757,16 @@ def _infer_types(self, values, na_values, cast_type, try_num_bool: bool = True): na_count = parsers.sanitize_objects(values, na_values) if result.dtype == np.object_ and try_num_bool: - result, mask = libops.maybe_convert_bool( + result, bool_mask = libops.maybe_convert_bool( np.asarray(values), true_values=self.true_values, false_values=self.false_values, convert_to_masked_nullable=use_nullable_dtypes, ) if result.dtype == np.bool_ and use_nullable_dtypes: - if mask is None: - mask = np.zeros(result.shape, dtype=np.bool_) - result = BooleanArray(result, mask) + if bool_mask is None: + bool_mask = np.zeros(result.shape, dtype=np.bool_) + result = BooleanArray(result, bool_mask) elif result.dtype == np.object_ and use_nullable_dtypes: result = StringDtype().construct_array_type()._from_sequence(values) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 4a59966384897..ae8d778b9138d 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -427,7 +427,7 @@ .. versionadded:: 1.2 -use_nullable_dtypes: bool = False +use_nullable_dtypes : bool = False Whether or not to use nullable dtypes as default when reading data. If set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. From 8a4d2063a5a3fc31682730c10000a98a8373ea5e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Sep 2022 13:09:52 +0200 Subject: [PATCH 5/6] Add tests and fix call --- pandas/io/parsers/base_parser.py | 23 +++++++--- .../io/parser/dtypes/test_dtypes_basic.py | 45 +++++++++++++++++-- 2 files changed, 58 insertions(+), 10 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index fdf806d883151..ae9a66eeaf353 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -517,7 +517,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: ) arr, _ = self._infer_types( - arr, col_na_values | col_na_fvalues, cast_type, try_num_bool + arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool ) arrays.append(arr) @@ -585,7 +585,7 @@ def _convert_to_ndarrays( cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, - cast_type, + cast_type is None, try_num_bool=False, ) else: @@ -597,7 +597,10 @@ def _convert_to_ndarrays( # general type inference and conversion cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, cast_type, try_num_bool + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool, ) # type specified in dtype param or cast_type is an EA @@ -688,7 +691,9 @@ def _set(x) -> int: return noconvert_columns - def _infer_types(self, values, na_values, cast_type, try_num_bool: bool = True): + def _infer_types( + self, values, na_values, no_dtype_specified, try_num_bool: bool = True + ): """ Infer types of values, possibly casting @@ -696,7 +701,7 @@ def _infer_types(self, values, na_values, cast_type, try_num_bool: bool = True): ---------- values : ndarray na_values : set - cast_type: Specifies if we want to cast explicitly + no_dtype_specified: Specifies if we want to cast explicitly try_num_bool : bool, default try try to cast values to numeric (first preference) or boolean @@ -718,7 +723,7 @@ def _infer_types(self, values, na_values, cast_type, try_num_bool: bool = True): return values, na_count use_nullable_dtypes: Literal[True] | Literal[False] = ( - self.use_nullable_dtypes and cast_type is None + self.use_nullable_dtypes and no_dtype_specified ) result: ArrayLike @@ -741,7 +746,11 @@ def _infer_types(self, values, na_values, cast_type, try_num_bool: bool = True): if result_mask is None: result_mask = np.zeros(result.shape, dtype=np.bool_) - if is_integer_dtype(result): + if result_mask.all(): + result = IntegerArray( + np.ones(result_mask.shape, dtype=np.int64), result_mask + ) + elif is_integer_dtype(result): result = IntegerArray(result, result_mask) elif is_bool_dtype(result): result = BooleanArray(result, result_mask) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 1312308bcaa71..76803591951d6 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -9,6 +9,7 @@ import pytest from pandas.errors import ParserWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -16,6 +17,10 @@ Timestamp, ) import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + StringArray, +) # TODO(1.4): Change me into xfail at release time # and xfail individual tests @@ -392,9 +397,9 @@ def test_use_nullabla_dtypes(all_parsers): parser = all_parsers - data = """a,b,c,d,e,f,g,h,i -1,2.5,True,a,,,,,12-31-2019 -3,4.5,False,b,6,7.5,True,a,12-31-2019 + data = """a,b,c,d,e,f,g,h,i,j +1,2.5,True,a,,,,,12-31-2019, +3,4.5,False,b,6,7.5,True,a,12-31-2019, """ result = parser.read_csv( StringIO(data), use_nullable_dtypes=True, parse_dates=["i"] @@ -410,6 +415,7 @@ def test_use_nullabla_dtypes(all_parsers): "g": pd.Series([pd.NA, True], dtype="boolean"), "h": pd.Series([pd.NA, "a"], dtype="string"), "i": pd.Series([Timestamp("2019-12-31")] * 2), + "j": pd.Series([pd.NA, pd.NA], dtype="Int64"), } ) tm.assert_frame_equal(result, expected) @@ -427,3 +433,36 @@ def test_use_nullabla_dtypes_and_dtype(all_parsers): result = parser.read_csv(StringIO(data), use_nullable_dtypes=True, dtype="float64") expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]}) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize("storage", ["pyarrow", "python"]) +def test_use_nullabla_dtypes_string(all_parsers, storage): + # GH#36712 + import pyarrow as pa + + with pd.option_context("mode.string_storage", storage): + + parser = all_parsers + + data = """a,b +a,x +b, +""" + result = parser.read_csv(StringIO(data), use_nullable_dtypes=True) + + if storage == "python": + expected = DataFrame( + { + "a": StringArray(np.array(["a", "b"], dtype=np.object_)), + "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)), + } + ) + else: + expected = DataFrame( + { + "a": ArrowStringArray(pa.array(["a", "b"])), + "b": ArrowStringArray(pa.array(["x", None])), + } + ) + tm.assert_frame_equal(result, expected) From 30d68a81db859867cf3e577c9e981816c20489cc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 7 Oct 2022 13:45:35 +0200 Subject: [PATCH 6/6] Fix typo --- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 76803591951d6..345da0bca8668 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -392,7 +392,7 @@ def test_dtypes_defaultdict_invalid(all_parsers): parser.read_csv(StringIO(data), dtype=dtype) -def test_use_nullabla_dtypes(all_parsers): +def test_use_nullable_dtypes(all_parsers): # GH#36712 parser = all_parsers