From 148fc9d33a5fd937cbda81bdd257085bd9c6d6b2 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 29 Mar 2021 16:51:12 -0700 Subject: [PATCH 01/26] Add nullable dtypes to read_csv --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/lib.pyx | 50 +++++++++++++--- pandas/_libs/ops.pyx | 32 ++++++----- pandas/_libs/parsers.pyx | 41 ++++++++++--- pandas/io/parsers/base_parser.py | 30 +++++++--- pandas/io/parsers/readers.py | 10 +++- pandas/tests/dtypes/test_inference.py | 12 ++++ pandas/tests/io/parser/test_na_values.py | 73 +++++++++++++++++++++--- 8 files changed, 203 insertions(+), 46 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index e245bf797d932..0c3d87d225819 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -142,6 +142,7 @@ Other enhancements - Add support for unary operators in :class:`FloatingArray` (:issue:`38749`) - :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`) - :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`) +- :meth:`pandas.read_csv` now accepts an argument ``use_nullable_dtypes`` that allows reading data directly into the nullable integer and boolean data types (:issue:`36712`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1ff481553e413..53980548ff6cf 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1984,7 +1984,8 @@ def maybe_convert_numeric( set na_values, bint convert_empty=True, bint coerce_numeric=False, -) -> ndarray: + bint convert_to_nullable_integer=False, +) -> "ArrayLike": """ Convert object array to a numeric array if possible. @@ -2008,6 +2009,9 @@ def maybe_convert_numeric( numeric array has no suitable numerical dtype to return (i.e. uint64, int32, uint8). If set to False, the original object array will be returned. Otherwise, a ValueError will be raised. + convert_to_nullable_integer : bool, default False + If an array-like object contains only integer values (and NaN) is + encountered, whether to convert and return an IntegerArray. Returns ------- @@ -2039,21 +2043,34 @@ def maybe_convert_numeric( ndarray[int64_t] ints = np.empty(n, dtype='i8') ndarray[uint64_t] uints = np.empty(n, dtype='u8') ndarray[uint8_t] bools = np.empty(n, dtype='u1') + ndarray[uint8_t] mask = np.zeros(n, dtype="u1") float64_t fval for i in range(n): val = values[i] + # We only want to disable NaNs showing as float if + # a) convert_to_nullable_integer = True + # b) no floats have been seen ( assuming an int shows up later ) + # However, if no ints present (all null array), we need to return floats + allow_nullable_dtypes = convert_to_nullable_integer and not seen.float_ if val.__hash__ is not None and val in na_values: - seen.saw_null() - floats[i] = complexes[i] = NaN + if allow_nullable_dtypes: + seen.null_ = True + mask[i] = 1 + else: + floats[i] = complexes[i] = NaN + seen.saw_null() elif util.is_float_object(val): fval = val if fval != fval: + mask[i] = 1 seen.null_ = True - + if not allow_nullable_dtypes: + seen.float_ = True + else: + seen.float_ = True floats[i] = complexes[i] = fval - seen.float_ = True elif util.is_integer_object(val): floats[i] = complexes[i] = val @@ -2076,7 +2093,11 @@ def maybe_convert_numeric( floats[i] = uints[i] = ints[i] = bools[i] = val seen.bool_ = True elif val is None or val is C_NA: - seen.saw_null() + if allow_nullable_dtypes: + seen.null_ = True + mask[i] = 1 + else: + seen.saw_null() floats[i] = complexes[i] = NaN elif hasattr(val, '__len__') and len(val) == 0: if convert_empty or seen.coerce_numeric: @@ -2100,6 +2121,7 @@ def maybe_convert_numeric( else: if fval != fval: seen.null_ = True + mask[i] = 1 floats[i] = fval @@ -2107,7 +2129,10 @@ def maybe_convert_numeric( as_int = int(val) if as_int in na_values: - seen.saw_null() + mask[i] = 1 + seen.null_ = True + if not convert_to_nullable_integer: + seen.float_ = True else: seen.saw_int(as_int) @@ -2137,11 +2162,22 @@ def maybe_convert_numeric( if seen.check_uint64_conflict(): return values + # This occurs since we disabled float nulls showing as null in anticipation + # of seeing ints that were never seen. So then, we return float + if convert_to_nullable_integer and seen.null_ and not seen.int_: + seen.float_ = True + if seen.complex_: return complexes elif seen.float_: return floats elif seen.int_: + if seen.null_ and convert_to_nullable_integer: + from pandas.core.arrays import IntegerArray + if seen.uint_: + return IntegerArray(uints, mask.view(np.bool_)) + else: + return IntegerArray(ints, mask.view(np.bool_)) if seen.uint_: return uints else: diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 1e51a578c44ea..8b17d311747ac 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -24,10 +24,7 @@ import_array() from pandas._libs.missing cimport checknull -from pandas._libs.util cimport ( - UINT8_MAX, - is_nan, -) +from pandas._libs.util cimport is_nan @cython.wraparound(False) @@ -258,17 +255,20 @@ def vec_binop(object[:] left, object[:] right, object op): def maybe_convert_bool(ndarray[object] arr, - true_values=None, false_values=None): + true_values=None, + false_values=None, + convert_to_nullable_boolean=False): cdef: Py_ssize_t i, n ndarray[uint8_t] result + ndarray[uint8_t] mask object val set true_vals, false_vals - int na_count = 0 + bint has_na = False n = len(arr) result = np.empty(n, dtype=np.uint8) - + mask = np.zeros(n, dtype=np.uint8) # the defaults true_vals = {'True', 'TRUE', 'true'} false_vals = {'False', 'FALSE', 'false'} @@ -292,15 +292,19 @@ def maybe_convert_bool(ndarray[object] arr, elif val in false_vals: result[i] = 0 elif isinstance(val, float): - result[i] = UINT8_MAX - na_count += 1 + mask[i] = 1 + result[i] = 0 # Value here doesn't matter, will be replaced w/ nan + has_na = True else: return arr - if na_count > 0: - mask = result == UINT8_MAX - arr = result.view(np.bool_).astype(object) - np.putmask(arr, mask, np.nan) - return arr + if has_na: + if convert_to_nullable_boolean: + from pandas.core.arrays import BooleanArray + return BooleanArray(result.view(np.bool_), mask.view(np.bool_)) + else: + arr = result.view(np.bool_).astype(object) + np.putmask(arr, mask, np.nan) + return arr else: return result.view(np.bool_) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index c4d98ccb88ba5..b50ab5f71cbe2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -105,6 +105,11 @@ from pandas.core.dtypes.common import ( ) from pandas.core.dtypes.concat import union_categoricals +from pandas.core.arrays import ( + BooleanArray, + IntegerArray, +) + cdef: float64_t INF = np.inf float64_t NEGINF = -INF @@ -323,6 +328,7 @@ cdef class TextReader: int64_t leading_cols, table_width, skipfooter, buffer_lines bint allow_leading_cols, mangle_dupe_cols, low_memory bint delim_whitespace + bint use_nullable_dtypes object delimiter, converters object na_values object header, orig_header, names, header_start, header_end @@ -370,7 +376,8 @@ cdef class TextReader: bint verbose=False, bint mangle_dupe_cols=True, float_precision=None, - bint skip_blank_lines=True): + bint skip_blank_lines=True, + bint use_nullable_dtypes=False): # set encoding for native Python and C library self.c_encoding = NULL @@ -430,6 +437,7 @@ cdef class TextReader: # consistent with csv module semantics, cast all to float dtype_order = dtype_order[1:] self.dtype_cast_order = [np.dtype(x) for x in dtype_order] + self.use_nullable_dtypes = use_nullable_dtypes if comment is not None: if len(comment) > 1: @@ -1021,7 +1029,8 @@ cdef class TextReader: # don't try to upcast EAs try_upcast = upcast_na and na_count > 0 if try_upcast and not is_extension_array_dtype(col_dtype): - col_res = _maybe_upcast(col_res) + col_res = _maybe_upcast(col_res, + use_nullable_dtypes=self.use_nullable_dtypes) if col_res is None: raise ParserError(f'Unable to parse column {i}') @@ -1318,18 +1327,36 @@ STR_NA_VALUES = { _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES)) -def _maybe_upcast(arr): +def _maybe_upcast(arr, use_nullable_dtypes=False): """ + Tries to upcast null values for integer and boolean data types. + If arr of boolean dtype, arr is upcast to object dtype or BooleanArray + and if arr is of integer dtype, arr is upcast to float dtype, or IntegerArray. + + Parameters + ---------- + arr : ndarray + Array to upcast. + use_nullable_dtypes: bool, default False + Whether to use nullable integer/boolean(IntegerArray/BooleanArray) + datatypes instead of upcasting. """ if issubclass(arr.dtype.type, np.integer): na_value = na_values[arr.dtype] - arr = arr.astype(float) - np.putmask(arr, arr == na_value, np.nan) + mask = arr == na_value + if use_nullable_dtypes: + arr = IntegerArray(arr, mask) + else: + arr = arr.astype(float) + np.putmask(arr, mask, np.nan) elif arr.dtype == np.bool_: mask = arr.view(np.uint8) == na_values[np.uint8] - arr = arr.astype(object) - np.putmask(arr, mask, np.nan) + if use_nullable_dtypes: + arr = BooleanArray(arr, mask) + else: + arr = arr.astype(object) + np.putmask(arr, mask, np.nan) return arr diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 2d17978b60327..404c2e2df6288 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -54,6 +54,10 @@ from pandas.core.dtypes.missing import isna from pandas.core import algorithms +from pandas.core.api import ( + NA, + array as pd_array, +) from pandas.core.arrays import Categorical from pandas.core.indexes.api import ( Index, @@ -109,6 +113,7 @@ "mangle_dupe_cols": True, "infer_datetime_format": False, "skip_blank_lines": True, + "use_nullable_dtypes": False, } @@ -199,6 +204,7 @@ def __init__(self, kwds): self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + self.use_nullable_dtypes = kwds.get("use_nullable_dtypes", False) self.handles: Optional[IOHandles] = None def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: @@ -548,10 +554,7 @@ def _convert_to_ndarrays( ) # type specified in dtype param or cast_type is an EA - if cast_type and ( - not is_dtype_equal(cvals, cast_type) - or is_extension_array_dtype(cast_type) - ): + if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea): if not is_ea and na_count > 0: try: if is_bool_dtype(cast_type): @@ -645,12 +648,12 @@ def _infer_types(self, values, na_values, try_num_bool=True): ---------- values : ndarray na_values : set - try_num_bool : bool, default try + try_num_bool : bool, default True try to cast values to numeric (first preference) or boolean Returns ------- - converted : ndarray + converted : ndarray or ExtensionArray na_count : int """ na_count = 0 @@ -659,14 +662,24 @@ def _infer_types(self, values, na_values, try_num_bool=True): na_count = mask.sum() if na_count > 0: if is_integer_dtype(values): - values = values.astype(np.float64) + if self.use_nullable_dtypes: + values = pd_array(values, dtype="Int64") + values[mask] = NA # <- This is pd.NA + return values, na_count + else: + values = values.astype(np.float64) np.putmask(values, mask, np.nan) return values, na_count if try_num_bool and is_object_dtype(values.dtype): # exclude e.g DatetimeIndex here try: - result = lib.maybe_convert_numeric(values, na_values, False) + result = lib.maybe_convert_numeric( + values, + na_values, + convert_empty=False, + convert_to_nullable_integer=self.use_nullable_dtypes, + ) except (ValueError, TypeError): # e.g. encountering datetime string gets ValueError # TypeError can be raised in floatify @@ -684,6 +697,7 @@ def _infer_types(self, values, na_values, try_num_bool=True): np.asarray(values), true_values=self.true_values, false_values=self.false_values, + convert_to_nullable_boolean=self.use_nullable_dtypes, ) return result, na_count diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index edfc7ee0b6258..a02a9d1fdf90d 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -339,6 +339,13 @@ .. versionchanged:: 1.2 +use_nullable_dtypes : bool, default False + If True, use dtypes that use pd.NA as missing value indicator for + the resulting DataFrame. Currently supports reading data into the nullable boolean + and integer array types, but not string arrays. + + .. versionadded:: 1.3 + {storage_options} .. versionadded:: 1.2 @@ -524,6 +531,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, + use_nullable_dtypes=False, storage_options: StorageOptions = None, ): kwds = locals() @@ -604,6 +612,7 @@ def read_table( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, + use_nullable_dtypes=False, ): kwds = locals() del kwds["filepath_or_buffer"] @@ -812,7 +821,6 @@ def _clean_options(self, options, engine): sep = options["delimiter"] delim_whitespace = options["delim_whitespace"] - if sep is None and not delim_whitespace: if engine == "c": fallback_reason = ( diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 78a62c832833f..f46bbeedf7ff4 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -607,6 +607,18 @@ def test_maybe_convert_objects_nullable_integer(self, exp): tm.assert_extension_array_equal(result, exp) + @pytest.mark.parametrize( + "exp", + [ + IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])), + IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])), + ], + ) + def test_maybe_convert_numeric_nullable_integer(self, exp): + arr = np.array([2, np.NaN], dtype=object) + result = lib.maybe_convert_numeric(arr, set(), convert_to_nullable_integer=True) + tm.assert_extension_array_equal(result, exp) + def test_maybe_convert_objects_bool_nan(self): # GH32146 ind = Index([True, False, np.nan], dtype=object) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index fecba8bd81404..96b43286cc854 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -10,9 +10,11 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas import ( + NA, DataFrame, Index, MultiIndex, + array as pd_array, ) import pandas._testing as tm @@ -146,20 +148,73 @@ def test_custom_na_values(all_parsers, na_values): tm.assert_frame_equal(result, expected) -def test_bool_na_values(all_parsers): +@pytest.mark.parametrize( + "use_nullable_dtypes, expected", + [ + ( + True, + DataFrame( + { + "A": pd_array([True, NA, False], dtype="boolean"), + "B": pd_array([False, True, NA], dtype="boolean"), + "C": [True, False, True], + } + ), + ), + ( + False, + DataFrame( + { + "A": np.array([True, np.nan, False], dtype=object), + "B": np.array([False, True, np.nan], dtype=object), + "C": [True, False, True], + } + ), + ), + ], +) +def test_bool_na_values(all_parsers, use_nullable_dtypes, expected): data = """A,B,C True,False,True NA,True,False False,NA,True""" parser = all_parsers - result = parser.read_csv(StringIO(data)) - expected = DataFrame( - { - "A": np.array([True, np.nan, False], dtype=object), - "B": np.array([False, True, np.nan], dtype=object), - "C": [True, False, True], - } - ) + result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "use_nullable_dtypes, expected", + [ + ( + True, + DataFrame( + { + "A": pd_array([1, NA, 2], dtype="Int64"), + "B": pd_array([3, 2, NA], dtype="Int64"), + "C": [1, 2, 3], + } + ), + ), + ( + False, + DataFrame( + { + "A": np.array([1.0, np.nan, 2.0], dtype="float64"), + "B": np.array([3.0, 2.0, np.nan], dtype="float64"), + "C": [1, 2, 3], + } + ), + ), + ], +) +def test_int_na_values(all_parsers, use_nullable_dtypes, expected): + data = """A,B,C +1,3,1 +NA,2,2 +2,NA,3""" + parser = all_parsers + result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) tm.assert_frame_equal(result, expected) From a70f3a4c3ce634e8982af59c91682783d56079e7 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 29 Mar 2021 18:41:42 -0700 Subject: [PATCH 02/26] Updates --- pandas/_libs/lib.pyx | 2 +- pandas/tests/dtypes/test_inference.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 53980548ff6cf..7d7ca4e238dd0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2015,7 +2015,7 @@ def maybe_convert_numeric( Returns ------- - np.ndarray + np.ndarray or ExtensionArray Array of converted object values to numerical ones. """ if len(values) == 0: diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index f46bbeedf7ff4..b8102cd613078 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -615,6 +615,7 @@ def test_maybe_convert_objects_nullable_integer(self, exp): ], ) def test_maybe_convert_numeric_nullable_integer(self, exp): + # GH 40687 arr = np.array([2, np.NaN], dtype=object) result = lib.maybe_convert_numeric(arr, set(), convert_to_nullable_integer=True) tm.assert_extension_array_equal(result, exp) From 2504be6ef7545983ce24e6a0de59f1fa429adbb3 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 30 Mar 2021 08:50:30 -0700 Subject: [PATCH 03/26] More thorough testing --- pandas/_libs/parsers.pyx | 9 ++++++++ pandas/tests/dtypes/test_inference.py | 29 +++++++++++++++++++++++- pandas/tests/io/parser/test_na_values.py | 28 +++++++++++++---------- 3 files changed, 53 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 7cea17327bdca..9256bbb7d018c 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1314,6 +1314,9 @@ def _maybe_upcast(arr, use_nullable_dtypes=False): If arr of boolean dtype, arr is upcast to object dtype or BooleanArray and if arr is of integer dtype, arr is upcast to float dtype, or IntegerArray. + Note: If all values are null, array will be upcast to float64 even if + use_nullable_dtypes is True + Parameters ---------- arr : ndarray @@ -1326,6 +1329,9 @@ def _maybe_upcast(arr, use_nullable_dtypes=False): if issubclass(arr.dtype.type, np.integer): na_value = na_values[arr.dtype] mask = arr == na_value + if np.count_nonzero(mask) == len(arr): + # Array of all NaN, dtype -> float64 + use_nullable_dtypes = False if use_nullable_dtypes: arr = IntegerArray(arr, mask) else: @@ -1333,6 +1339,9 @@ def _maybe_upcast(arr, use_nullable_dtypes=False): np.putmask(arr, mask, np.nan) elif arr.dtype == np.bool_: mask = arr.view(np.uint8) == na_values[np.uint8] + if np.count_nonzero(mask) == len(arr): + # Array of all NaN, dtype -> float64 + use_nullable_dtypes = False if use_nullable_dtypes: arr = BooleanArray(arr, mask) else: diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index f057415070fe1..ac5e29de41768 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -24,6 +24,7 @@ from pandas._libs import ( lib, missing as libmissing, + ops as libops, ) import pandas.util._test_decorators as td @@ -60,7 +61,10 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays import IntegerArray +from pandas.core.arrays import ( + BooleanArray, + IntegerArray, +) @pytest.fixture(params=[True, False], ids=str) @@ -415,6 +419,29 @@ def test_isneginf_scalar(self, value, expected): result = libmissing.isneginf_scalar(value) assert result is expected + @pytest.mark.parametrize( + "convert_to_nullable_boolean, exp", + [ + ( + True, + BooleanArray( + np.array([True, False], dtype="bool"), np.array([False, True]) + ), + ), + (False, np.array([True, np.nan], dtype="object")), + ], + ) + def test_maybe_convert_nullable_boolean(self, convert_to_nullable_boolean, exp): + # GH 40687 + arr = np.array([True, np.NaN], dtype=object) + result = libops.maybe_convert_bool( + arr, set(), convert_to_nullable_boolean=convert_to_nullable_boolean + ) + if convert_to_nullable_boolean: + tm.assert_extension_array_equal(result, exp) + else: + tm.assert_numpy_array_equal(result, exp) + @pytest.mark.parametrize("coerce_numeric", [True, False]) @pytest.mark.parametrize( "infinity", ["inf", "inF", "iNf", "Inf", "iNF", "InF", "INf", "INF"] diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 96b43286cc854..e38257a680e6d 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -157,7 +157,8 @@ def test_custom_na_values(all_parsers, na_values): { "A": pd_array([True, NA, False], dtype="boolean"), "B": pd_array([False, True, NA], dtype="boolean"), - "C": [True, False, True], + "C": np.array([np.nan, np.nan, np.nan], dtype="float64"), + "D": np.array([True, False, True], dtype="bool"), } ), ), @@ -167,17 +168,18 @@ def test_custom_na_values(all_parsers, na_values): { "A": np.array([True, np.nan, False], dtype=object), "B": np.array([False, True, np.nan], dtype=object), - "C": [True, False, True], + "C": np.array([np.nan, np.nan, np.nan], dtype="float64"), + "D": np.array([True, False, True], dtype="bool"), } ), ), ], ) def test_bool_na_values(all_parsers, use_nullable_dtypes, expected): - data = """A,B,C -True,False,True -NA,True,False -False,NA,True""" + data = """A,B,C,D +True,False,NA,True +NA,True,NA,False +False,NA,NA,True""" parser = all_parsers result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) tm.assert_frame_equal(result, expected) @@ -192,7 +194,8 @@ def test_bool_na_values(all_parsers, use_nullable_dtypes, expected): { "A": pd_array([1, NA, 2], dtype="Int64"), "B": pd_array([3, 2, NA], dtype="Int64"), - "C": [1, 2, 3], + "C": pd_array([NA, 1, 2], dtype="Int64"), + "D": np.array([1, 2, 3], dtype="int64"), } ), ), @@ -202,17 +205,18 @@ def test_bool_na_values(all_parsers, use_nullable_dtypes, expected): { "A": np.array([1.0, np.nan, 2.0], dtype="float64"), "B": np.array([3.0, 2.0, np.nan], dtype="float64"), - "C": [1, 2, 3], + "C": np.array([np.nan, 1.0, 2.0], dtype="float64"), + "D": np.array([1, 2, 3], dtype="int64"), } ), ), ], ) def test_int_na_values(all_parsers, use_nullable_dtypes, expected): - data = """A,B,C -1,3,1 -NA,2,2 -2,NA,3""" + data = """A,B,C,D +1,3,NA,1 +NA,2,1,2 +2,NA,2,3""" parser = all_parsers result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) tm.assert_frame_equal(result, expected) From 63733dcbc346d9ec2b0c9497d82c564493f9e870 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 31 Mar 2021 12:06:54 -0700 Subject: [PATCH 04/26] Optimizations & Found a bug! --- pandas/_libs/lib.pyx | 2 +- pandas/_libs/ops.pyi | 3 ++- pandas/_libs/parsers.pyx | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3d4df9e53e603..46bfcfd6b49e6 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2081,8 +2081,8 @@ def maybe_convert_numeric( seen.null_ = True mask[i] = 1 else: - floats[i] = complexes[i] = NaN seen.saw_null() + floats[i] = complexes[i] = NaN elif util.is_float_object(val): fval = val if fval != fval: diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi index b4f42f217a5db..aace09c50bc28 100644 --- a/pandas/_libs/ops.pyi +++ b/pandas/_libs/ops.pyi @@ -39,5 +39,6 @@ def vec_binop( def maybe_convert_bool( arr: np.ndarray, # np.ndarray[object] true_values=..., - false_values=... + false_values=..., + convert_to_nullable_boolean: bool = True, ) -> np.ndarray: ... diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 9256bbb7d018c..47d0826d8ea13 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1329,7 +1329,7 @@ def _maybe_upcast(arr, use_nullable_dtypes=False): if issubclass(arr.dtype.type, np.integer): na_value = na_values[arr.dtype] mask = arr == na_value - if np.count_nonzero(mask) == len(arr): + if mask.all(): # Array of all NaN, dtype -> float64 use_nullable_dtypes = False if use_nullable_dtypes: @@ -1339,7 +1339,7 @@ def _maybe_upcast(arr, use_nullable_dtypes=False): np.putmask(arr, mask, np.nan) elif arr.dtype == np.bool_: mask = arr.view(np.uint8) == na_values[np.uint8] - if np.count_nonzero(mask) == len(arr): + if mask.all(): # Array of all NaN, dtype -> float64 use_nullable_dtypes = False if use_nullable_dtypes: From 8baf12030e3be4872fb0c5c970f568eac80c639e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 4 Apr 2021 12:19:12 -0700 Subject: [PATCH 05/26] WIP --- pandas/_libs/lib.pyx | 25 +++++++++++++----- pandas/_libs/parsers.pyx | 33 ++++++++++++++---------- pandas/io/parsers/base_parser.py | 30 +++++++++++++++------ pandas/tests/dtypes/test_inference.py | 25 ++++++++++++++++++ pandas/tests/io/parser/test_na_values.py | 27 +++++++++++++++++++ 5 files changed, 113 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 46bfcfd6b49e6..f1d42272ba8b1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2007,6 +2007,7 @@ def maybe_convert_numeric( bint convert_empty=True, bint coerce_numeric=False, bint convert_to_nullable_integer=False, + bint convert_to_floating_array=False, ) -> "ArrayLike": """ Convert object array to a numeric array if possible. @@ -2034,7 +2035,9 @@ def maybe_convert_numeric( convert_to_nullable_integer : bool, default False If an array-like object contains only integer values (and NaN) is encountered, whether to convert and return an IntegerArray. - + convert_to_floating_array : bool, default False + If an array-like object contains only float values (and NaN) is + encountered, whether to convert and return an FloatingArray. Returns ------- np.ndarray or ExtensionArray @@ -2074,21 +2077,26 @@ def maybe_convert_numeric( # a) convert_to_nullable_integer = True # b) no floats have been seen ( assuming an int shows up later ) # However, if no ints present (all null array), we need to return floats - allow_nullable_dtypes = convert_to_nullable_integer and not seen.float_ + allow_null_in_int = convert_to_nullable_integer and not seen.float_ if val.__hash__ is not None and val in na_values: - if allow_nullable_dtypes: + if allow_null_in_int: seen.null_ = True mask[i] = 1 else: + if convert_to_floating_array: + mask[i] = 1 seen.saw_null() floats[i] = complexes[i] = NaN elif util.is_float_object(val): fval = val if fval != fval: - mask[i] = 1 seen.null_ = True - if not allow_nullable_dtypes: + if allow_null_in_int: + mask[i] = 1 + else: + if convert_to_floating_array: + mask[i] = 1 seen.float_ = True else: seen.float_ = True @@ -2115,10 +2123,12 @@ def maybe_convert_numeric( floats[i] = uints[i] = ints[i] = bools[i] = val seen.bool_ = True elif val is None or val is C_NA: - if allow_nullable_dtypes: + if allow_null_in_int: seen.null_ = True mask[i] = 1 else: + if convert_to_floating_array: + mask[i] = 1 seen.saw_null() floats[i] = complexes[i] = NaN elif hasattr(val, '__len__') and len(val) == 0: @@ -2192,6 +2202,9 @@ def maybe_convert_numeric( if seen.complex_: return complexes elif seen.float_: + if seen.null_ and convert_to_floating_array: + from pandas.core.arrays import FloatingArray + return FloatingArray(floats, mask.view(np.bool_)) return floats elif seen.int_: if seen.null_ and convert_to_nullable_integer: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 47d0826d8ea13..76fa6efd4f88d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -92,6 +92,7 @@ from pandas._libs.khash cimport ( kh_strbox_t, khiter_t, ) +from pandas._libs.missing cimport C_NA from pandas.errors import ( DtypeWarning, @@ -114,7 +115,9 @@ from pandas.core.dtypes.concat import union_categoricals from pandas.core.arrays import ( BooleanArray, + FloatingArray, IntegerArray, + StringArray, ) cdef: @@ -1017,8 +1020,11 @@ cdef class TextReader: # don't try to upcast EAs try_upcast = upcast_na and na_count > 0 if try_upcast and not is_extension_array_dtype(col_dtype): - col_res = _maybe_upcast(col_res, - use_nullable_dtypes=self.use_nullable_dtypes) + if na_count < len(col_res): + col_res = _maybe_upcast(col_res, self.use_nullable_dtypes) + else: + # All NaN -> float64 + col_res = col_res.astype("float64") if col_res is None: raise ParserError(f'Unable to parse column {i}') @@ -1314,9 +1320,6 @@ def _maybe_upcast(arr, use_nullable_dtypes=False): If arr of boolean dtype, arr is upcast to object dtype or BooleanArray and if arr is of integer dtype, arr is upcast to float dtype, or IntegerArray. - Note: If all values are null, array will be upcast to float64 even if - use_nullable_dtypes is True - Parameters ---------- arr : ndarray @@ -1326,27 +1329,31 @@ def _maybe_upcast(arr, use_nullable_dtypes=False): datatypes instead of upcasting. """ + na_value = na_values[arr.dtype] if issubclass(arr.dtype.type, np.integer): - na_value = na_values[arr.dtype] mask = arr == na_value - if mask.all(): - # Array of all NaN, dtype -> float64 - use_nullable_dtypes = False if use_nullable_dtypes: arr = IntegerArray(arr, mask) else: arr = arr.astype(float) np.putmask(arr, mask, np.nan) elif arr.dtype == np.bool_: - mask = arr.view(np.uint8) == na_values[np.uint8] - if mask.all(): - # Array of all NaN, dtype -> float64 - use_nullable_dtypes = False + mask = arr.view(np.uint8) == na_value if use_nullable_dtypes: arr = BooleanArray(arr, mask) else: arr = arr.astype(object) np.putmask(arr, mask, np.nan) + elif use_nullable_dtypes and arr.dtype == np.floating: + mask = arr == na_value + if mask.any(): + arr = FloatingArray(arr, mask) + elif use_nullable_dtypes and arr.dtype == np.object_: + # Maybe convert StringArray & catch error for non-strings + try: + arr = StringArray(arr) + except ValueError: + pass return arr diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 4667b28fce3e2..9fa1dc0f9ec8a 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -58,7 +58,10 @@ NA, array as pd_array, ) -from pandas.core.arrays import Categorical +from pandas.core.arrays import ( + Categorical, + StringArray, +) from pandas.core.indexes.api import ( Index, MultiIndex, @@ -689,6 +692,7 @@ def _infer_types(self, values, na_values, try_num_bool=True): na_values, convert_empty=False, convert_to_nullable_integer=self.use_nullable_dtypes, + convert_to_floating_array=self.use_nullable_dtypes, ) except (ValueError, TypeError): # e.g. encountering datetime string gets ValueError @@ -702,13 +706,23 @@ def _infer_types(self, values, na_values, try_num_bool=True): if values.dtype == np.object_: na_count = parsers.sanitize_objects(values, na_values, False) - if result.dtype == np.object_ and try_num_bool: - result = libops.maybe_convert_bool( - np.asarray(values), - true_values=self.true_values, - false_values=self.false_values, - convert_to_nullable_boolean=self.use_nullable_dtypes, - ) + if result.dtype == np.object_: + if try_num_bool: + result = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + convert_to_nullable_boolean=self.use_nullable_dtypes, + ) + # Maybe StringArray? Must have NA value to trigger + # Since it is called use_nullable_dtypes after all + # However, all NA -> float64 not StringArray + if self.use_nullable_dtypes and na_count > 0 and na_count < len(result): + try: + result = StringArray(result) + except ValueError as e: + print(e) + pass return result, na_count diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index ac5e29de41768..858da789f998a 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -63,7 +63,9 @@ import pandas._testing as tm from pandas.core.arrays import ( BooleanArray, + FloatingArray, IntegerArray, + StringArray, ) @@ -647,6 +649,29 @@ def test_maybe_convert_numeric_nullable_integer(self, exp): result = lib.maybe_convert_numeric(arr, set(), convert_to_nullable_integer=True) tm.assert_extension_array_equal(result, exp) + @pytest.mark.parametrize( + "convert_to_floating_array, exp", + [ + ( + True, + FloatingArray( + np.array([2.0, 0.0], dtype="float64"), np.array([False, True]) + ), + ), + (False, np.array([2.0, np.nan])), + ], + ) + def test_maybe_convert_numeric_floating_array(self, convert_to_floating_array, exp): + # GH 40687 + arr = np.array([2, np.nan], dtype=object) + result = lib.maybe_convert_numeric( + arr, set(), convert_to_floating_array=convert_to_floating_array + ) + if convert_to_floating_array: + tm.assert_extension_array_equal(result, exp) + else: + tm.assert_numpy_array_equal(result, exp) + def test_maybe_convert_objects_bool_nan(self): # GH32146 ind = Index([True, False, np.nan], dtype=object) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index e38257a680e6d..248a9472bb61a 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -222,6 +222,33 @@ def test_int_na_values(all_parsers, use_nullable_dtypes, expected): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "use_nullable_dtypes, expected", + [ + ( + True, + DataFrame( + { + "A": np.array(["hi", "hello", "hey"], dtype=object), + "B": pd_array(["hi", NA, "hello"], dtype="string"), + "C": pd_array([NA, "hi", "hey"], dtype="string"), + "D": np.array([np.nan, np.nan, np.nan], dtype="float64"), + } + ), + ) + ], +) +def test_string_na_values(all_parsers, use_nullable_dtypes, expected): + data = """A,B,C,D +hi,hi,NA,NA +hello,NA,hi,NA +hey,hello,hey,NA""" + parser = all_parsers + result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) + print(result) + tm.assert_frame_equal(result, expected) + + def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA From c3ee88320a5660a973fefff878442d59758a4ca8 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 12 Apr 2021 19:41:15 -0700 Subject: [PATCH 06/26] float support working --- pandas/_libs/parsers.pyx | 35 +++++++++++--------- pandas/io/parsers/base_parser.py | 7 ++-- pandas/tests/io/parser/test_na_values.py | 42 ++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 76fa6efd4f88d..db2c11f8d2de3 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1020,11 +1020,7 @@ cdef class TextReader: # don't try to upcast EAs try_upcast = upcast_na and na_count > 0 if try_upcast and not is_extension_array_dtype(col_dtype): - if na_count < len(col_res): - col_res = _maybe_upcast(col_res, self.use_nullable_dtypes) - else: - # All NaN -> float64 - col_res = col_res.astype("float64") + col_res = _maybe_upcast(col_res, self.use_nullable_dtypes) if col_res is None: raise ParserError(f'Unable to parse column {i}') @@ -1316,24 +1312,32 @@ _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES)) def _maybe_upcast(arr, use_nullable_dtypes=False): """ - Tries to upcast null values for integer and boolean data types. - If arr of boolean dtype, arr is upcast to object dtype or BooleanArray - and if arr is of integer dtype, arr is upcast to float dtype, or IntegerArray. + Tries to upcast null values or use nullable dtypes if set to True. + Parameters ---------- arr : ndarray Array to upcast. use_nullable_dtypes: bool, default False - Whether to use nullable integer/boolean(IntegerArray/BooleanArray) - datatypes instead of upcasting. + Whether to use nullable datatypes instead of upcasting. + If true, then: + - int w/ NaN -> IntegerArray + - bool w/ NaN -> BooleanArray + - float w/NaN -> FloatingArray + - object(strings) w/NaN -> StringArray """ na_value = na_values[arr.dtype] if issubclass(arr.dtype.type, np.integer): mask = arr == na_value if use_nullable_dtypes: - arr = IntegerArray(arr, mask) + # only convert to integer array if not all NAN + if not mask.all(): + arr = IntegerArray(arr, mask) + else: + arr = arr.astype(float) + arr = FloatingArray(arr, mask) else: arr = arr.astype(float) np.putmask(arr, mask, np.nan) @@ -1344,15 +1348,14 @@ def _maybe_upcast(arr, use_nullable_dtypes=False): else: arr = arr.astype(object) np.putmask(arr, mask, np.nan) - elif use_nullable_dtypes and arr.dtype == np.floating: - mask = arr == na_value - if mask.any(): - arr = FloatingArray(arr, mask) + elif use_nullable_dtypes and arr.dtype == np.float64: + mask = np.isnan(arr) + arr = FloatingArray(arr, mask) elif use_nullable_dtypes and arr.dtype == np.object_: # Maybe convert StringArray & catch error for non-strings try: arr = StringArray(arr) - except ValueError: + except ValueError as e: pass return arr diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 9fa1dc0f9ec8a..60075b9ee61f1 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -716,12 +716,11 @@ def _infer_types(self, values, na_values, try_num_bool=True): ) # Maybe StringArray? Must have NA value to trigger # Since it is called use_nullable_dtypes after all - # However, all NA -> float64 not StringArray - if self.use_nullable_dtypes and na_count > 0 and na_count < len(result): + # However, all NA -> Float64 not StringArray + if self.use_nullable_dtypes and 0 < na_count < len(result): try: result = StringArray(result) - except ValueError as e: - print(e) + except ValueError: pass return result, na_count diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 248a9472bb61a..a22e380076bf3 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -157,7 +157,7 @@ def test_custom_na_values(all_parsers, na_values): { "A": pd_array([True, NA, False], dtype="boolean"), "B": pd_array([False, True, NA], dtype="boolean"), - "C": np.array([np.nan, np.nan, np.nan], dtype="float64"), + "C": pd_array([np.nan, np.nan, np.nan], dtype="Float64"), "D": np.array([True, False, True], dtype="bool"), } ), @@ -222,6 +222,44 @@ def test_int_na_values(all_parsers, use_nullable_dtypes, expected): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "use_nullable_dtypes, expected", + [ + ( + True, + DataFrame( + { + "A": pd_array([1.0, NA, 2.0], dtype="Float64"), + "B": np.array([3.0, 2.0, 1.0], dtype="float64"), + "C": pd_array([NA, 1.0, 2.0], dtype="Float64"), + "D": pd_array([NA, NA, NA], dtype="Float64"), + } + ), + ), + ( + False, + DataFrame( + { + "A": np.array([1.0, np.nan, 2.0], dtype="float64"), + "B": np.array([3.0, 2.0, 1.0], dtype="float64"), + "C": np.array([np.nan, 1.0, 2.0], dtype="float64"), + "D": np.array([np.nan, np.nan, np.nan], dtype="float64"), + } + ), + ), + ], +) +def test_float_na_values(all_parsers, use_nullable_dtypes, expected): + data = """A,B,C,D +1.0,3,NA,NA +NA,2,1.0,NA +2,1.0,2.0,NA""" + parser = all_parsers + result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) + print(result) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "use_nullable_dtypes, expected", [ @@ -232,7 +270,7 @@ def test_int_na_values(all_parsers, use_nullable_dtypes, expected): "A": np.array(["hi", "hello", "hey"], dtype=object), "B": pd_array(["hi", NA, "hello"], dtype="string"), "C": pd_array([NA, "hi", "hey"], dtype="string"), - "D": np.array([np.nan, np.nan, np.nan], dtype="float64"), + "D": pd_array([np.nan, np.nan, np.nan], dtype="Float64"), } ), ) From 6d49eaf56ead0650778c6dfa87df4ea92adcb29c Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 12 Apr 2021 19:48:55 -0700 Subject: [PATCH 07/26] Fixes --- pandas/_libs/lib.pyi | 2 ++ pandas/tests/dtypes/test_inference.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 477c9fd655a4a..0bbe375d235d7 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -75,6 +75,8 @@ def maybe_convert_numeric( na_values: set, convert_empty: bool = True, coerce_numeric: bool = False, + convert_to_nullable_integer: bool = False, + convert_to_floating_array: bool = False ) -> np.ndarray: ... # TODO: restrict `arr`? diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 457f8f6c54667..497fb94b957c8 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -65,7 +65,6 @@ BooleanArray, FloatingArray, IntegerArray, - StringArray, ) From 680ffb1426264dc7cf500984dee856a8d4b9d449 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sun, 18 Apr 2021 14:12:36 -0700 Subject: [PATCH 08/26] Address code review --- pandas/_libs/lib.pyi | 5 +- pandas/_libs/lib.pyx | 41 +++++++--------- pandas/_libs/ops.pyx | 10 ++-- pandas/tests/dtypes/test_inference.py | 68 ++++++++++++++++++--------- 4 files changed, 71 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 0bbe375d235d7..bc1e4a3f28ef2 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -75,9 +75,8 @@ def maybe_convert_numeric( na_values: set, convert_empty: bool = True, coerce_numeric: bool = False, - convert_to_nullable_integer: bool = False, - convert_to_floating_array: bool = False -) -> np.ndarray: ... + convert_to_masked_nullable: bool = False, +) -> np.ndarray | (np.ndarray,np.ndarray): ... # TODO: restrict `arr`? def ensure_string_array( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e19c4ff17470d..7ed0329751863 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2007,8 +2007,7 @@ def maybe_convert_numeric( set na_values, bint convert_empty=True, bint coerce_numeric=False, - bint convert_to_nullable_integer=False, - bint convert_to_floating_array=False, + bint convert_to_masked_nullable=False, ) -> "ArrayLike": """ Convert object array to a numeric array if possible. @@ -2033,16 +2032,14 @@ def maybe_convert_numeric( numeric array has no suitable numerical dtype to return (i.e. uint64, int32, uint8). If set to False, the original object array will be returned. Otherwise, a ValueError will be raised. - convert_to_nullable_integer : bool, default False - If an array-like object contains only integer values (and NaN) is - encountered, whether to convert and return an IntegerArray. - convert_to_floating_array : bool, default False - If an array-like object contains only float values (and NaN) is - encountered, whether to convert and return an FloatingArray. + convert_to_masked_nullable : bool, default False + Whether to return a mask for the converted values. This also disables + upcasting for ints with nulls to float64. Returns ------- - np.ndarray or ExtensionArray + np.ndarray or tuple of converted values and its mask Array of converted object values to numerical ones. + Also returns mask if convert_to_masked_nullable is True. """ if len(values) == 0: return np.array([], dtype='i8') @@ -2075,17 +2072,17 @@ def maybe_convert_numeric( for i in range(n): val = values[i] # We only want to disable NaNs showing as float if - # a) convert_to_nullable_integer = True + # a) convert_to_masked_nullable = True # b) no floats have been seen ( assuming an int shows up later ) # However, if no ints present (all null array), we need to return floats - allow_null_in_int = convert_to_nullable_integer and not seen.float_ + allow_null_in_int = convert_to_masked_nullable and not seen.float_ if val.__hash__ is not None and val in na_values: if allow_null_in_int: seen.null_ = True mask[i] = 1 else: - if convert_to_floating_array: + if convert_to_masked_nullable: mask[i] = 1 seen.saw_null() floats[i] = complexes[i] = NaN @@ -2096,7 +2093,7 @@ def maybe_convert_numeric( if allow_null_in_int: mask[i] = 1 else: - if convert_to_floating_array: + if convert_to_masked_nullable: mask[i] = 1 seen.float_ = True else: @@ -2128,7 +2125,7 @@ def maybe_convert_numeric( seen.null_ = True mask[i] = 1 else: - if convert_to_floating_array: + if convert_to_masked_nullable: mask[i] = 1 seen.saw_null() floats[i] = complexes[i] = NaN @@ -2164,7 +2161,7 @@ def maybe_convert_numeric( if as_int in na_values: mask[i] = 1 seen.null_ = True - if not convert_to_nullable_integer: + if not allow_null_in_int: seen.float_ = True else: seen.saw_int(as_int) @@ -2197,23 +2194,21 @@ def maybe_convert_numeric( # This occurs since we disabled float nulls showing as null in anticipation # of seeing ints that were never seen. So then, we return float - if convert_to_nullable_integer and seen.null_ and not seen.int_: + if allow_null_in_int and seen.null_ and not seen.int_: seen.float_ = True if seen.complex_: return complexes elif seen.float_: - if seen.null_ and convert_to_floating_array: - from pandas.core.arrays import FloatingArray - return FloatingArray(floats, mask.view(np.bool_)) + if seen.null_ and convert_to_masked_nullable: + return (floats, mask.view(np.bool_)) return floats elif seen.int_: - if seen.null_ and convert_to_nullable_integer: - from pandas.core.arrays import IntegerArray + if seen.null_ and convert_to_masked_nullable: if seen.uint_: - return IntegerArray(uints, mask.view(np.bool_)) + return (uints, mask.view(np.bool_)) else: - return IntegerArray(ints, mask.view(np.bool_)) + return (ints, mask.view(np.bool_)) if seen.uint_: return uints else: diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 96d7d6d9870e6..a2b7354e61e53 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -257,7 +257,8 @@ def vec_binop(object[:] left, object[:] right, object op) -> ndarray: def maybe_convert_bool(ndarray[object] arr, true_values=None, false_values=None, - convert_to_nullable_boolean=False) -> "ArrayLike": + convert_to_masked_nullable=False + ) -> np.ndarray | tuple[np.ndarray, np.ndarray]: cdef: Py_ssize_t i, n ndarray[uint8_t] result @@ -291,7 +292,7 @@ def maybe_convert_bool(ndarray[object] arr, result[i] = 1 elif val in false_vals: result[i] = 0 - elif isinstance(val, float): + elif is_nan(val): mask[i] = 1 result[i] = 0 # Value here doesn't matter, will be replaced w/ nan has_na = True @@ -299,9 +300,8 @@ def maybe_convert_bool(ndarray[object] arr, return arr if has_na: - if convert_to_nullable_boolean: - from pandas.core.arrays import BooleanArray - return BooleanArray(result.view(np.bool_), mask.view(np.bool_)) + if convert_to_masked_nullable: + return (result.view(np.bool_), mask.view(np.bool_)) else: arr = result.view(np.bool_).astype(object) np.putmask(arr, mask, np.nan) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 497fb94b957c8..5e8be9b8b987b 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -421,7 +421,7 @@ def test_isneginf_scalar(self, value, expected): assert result is expected @pytest.mark.parametrize( - "convert_to_nullable_boolean, exp", + "convert_to_masked_nullable, exp", [ ( True, @@ -432,14 +432,14 @@ def test_isneginf_scalar(self, value, expected): (False, np.array([True, np.nan], dtype="object")), ], ) - def test_maybe_convert_nullable_boolean(self, convert_to_nullable_boolean, exp): + def test_maybe_convert_nullable_boolean(self, convert_to_masked_nullable, exp): # GH 40687 arr = np.array([True, np.NaN], dtype=object) result = libops.maybe_convert_bool( - arr, set(), convert_to_nullable_boolean=convert_to_nullable_boolean + arr, set(), convert_to_masked_nullable=convert_to_masked_nullable ) - if convert_to_nullable_boolean: - tm.assert_extension_array_equal(result, exp) + if convert_to_masked_nullable: + tm.assert_extension_array_equal(BooleanArray(*result), exp) else: tm.assert_numpy_array_equal(result, exp) @@ -546,16 +546,30 @@ def test_convert_numeric_uint64_nan_values(self, coerce): np.array([str(-1), str(2 ** 63)], dtype=object), ], ) - def test_convert_numeric_int64_uint64(self, case, coerce): + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_convert_numeric_int64_uint64( + self, case, coerce, convert_to_masked_nullable + ): expected = case.astype(float) if coerce else case.copy() - result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) + result = lib.maybe_convert_numeric( + case, + set(), + coerce_numeric=coerce, + convert_to_masked_nullable=convert_to_masked_nullable, + ) tm.assert_almost_equal(result, expected) - def test_convert_numeric_string_uint64(self): + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_convert_numeric_string_uint64(self, convert_to_masked_nullable): # GH32394 result = lib.maybe_convert_numeric( - np.array(["uint64"], dtype=object), set(), coerce_numeric=True + np.array(["uint64"], dtype=object), + set(), + coerce_numeric=True, + convert_to_masked_nullable=convert_to_masked_nullable, ) + if convert_to_masked_nullable: + result = FloatingArray(*result) assert np.isnan(result) @pytest.mark.parametrize("value", [-(2 ** 63) - 1, 2 ** 64]) @@ -636,20 +650,28 @@ def test_maybe_convert_objects_nullable_integer(self, exp): tm.assert_extension_array_equal(result, exp) @pytest.mark.parametrize( - "exp", + "convert_to_masked_nullable, exp", [ - IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])), - IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])), + (True, IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True]))), + (False, np.array([2, np.nan], dtype="float64")), ], ) - def test_maybe_convert_numeric_nullable_integer(self, exp): + def test_maybe_convert_numeric_nullable_integer( + self, convert_to_masked_nullable, exp + ): # GH 40687 arr = np.array([2, np.NaN], dtype=object) - result = lib.maybe_convert_numeric(arr, set(), convert_to_nullable_integer=True) - tm.assert_extension_array_equal(result, exp) + result = lib.maybe_convert_numeric( + arr, set(), convert_to_masked_nullable=convert_to_masked_nullable + ) + if convert_to_masked_nullable: + result = IntegerArray(*result) + tm.assert_extension_array_equal(result, exp) + else: + tm.assert_numpy_array_equal(result, exp) @pytest.mark.parametrize( - "convert_to_floating_array, exp", + "convert_to_masked_nullable, exp", [ ( True, @@ -657,17 +679,19 @@ def test_maybe_convert_numeric_nullable_integer(self, exp): np.array([2.0, 0.0], dtype="float64"), np.array([False, True]) ), ), - (False, np.array([2.0, np.nan])), + (False, np.array([2.0, np.nan], dtype="float64")), ], ) - def test_maybe_convert_numeric_floating_array(self, convert_to_floating_array, exp): + def test_maybe_convert_numeric_floating_array( + self, convert_to_masked_nullable, exp + ): # GH 40687 - arr = np.array([2, np.nan], dtype=object) + arr = np.array([2.0, np.nan], dtype=object) result = lib.maybe_convert_numeric( - arr, set(), convert_to_floating_array=convert_to_floating_array + arr, set(), convert_to_masked_nullable=convert_to_masked_nullable ) - if convert_to_floating_array: - tm.assert_extension_array_equal(result, exp) + if convert_to_masked_nullable: + tm.assert_extension_array_equal(FloatingArray(*result), exp) else: tm.assert_numpy_array_equal(result, exp) From d075bed2028d1df2278cefa729a8c9736388ac58 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 5 May 2021 16:41:05 -0700 Subject: [PATCH 09/26] Finish updating code to master --- pandas/io/parsers/base_parser.py | 51 +++++++++++++++++++------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 795076efadb12..8b41363d57d57 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -44,6 +44,7 @@ is_dict_like, is_dtype_equal, is_extension_array_dtype, + is_float_dtype, is_integer, is_integer_dtype, is_list_like, @@ -61,7 +62,10 @@ array as pd_array, ) from pandas.core.arrays import ( + BooleanArray, Categorical, + FloatingArray, + IntegerArray, StringArray, ) from pandas.core.indexes.api import ( @@ -707,10 +711,12 @@ def _infer_types(self, values, na_values, try_num_bool=True): if try_num_bool and is_object_dtype(values.dtype): # exclude e.g DatetimeIndex here try: - result, mask = lib.maybe_convert_numeric(values, - na_values, - False, - convert_to_masked_nullable=self.use_nullable_dtypes) + result, mask = lib.maybe_convert_numeric( + values, + na_values, + False, + convert_to_masked_nullable=self.use_nullable_dtypes, + ) if mask is not None: if is_integer_dtype(result): result = IntegerArray(result, mask) @@ -728,22 +734,27 @@ def _infer_types(self, values, na_values, try_num_bool=True): if values.dtype == np.object_: na_count = parsers.sanitize_objects(values, na_values, False) - if result.dtype == np.object_: - if try_num_bool: - result = libops.maybe_convert_bool( - np.asarray(values), - true_values=self.true_values, - false_values=self.false_values, - convert_to_masked_nullable=self.use_nullable_dtypes, - ) - # Maybe StringArray? Must have NA value to trigger - # Since it is called use_nullable_dtypes after all - # However, all NA -> Float64 not StringArray - if self.use_nullable_dtypes and 0 < na_count < len(result): - try: - result = StringArray(result) - except ValueError: - pass + if result.dtype == np.object_ and try_num_bool: + result, mask = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + convert_to_masked_nullable=self.use_nullable_dtypes, + ) + if mask is not None: + result = BooleanArray(result, mask) + # Maybe StringArray? Must have NA value to trigger + # Since it is called use_nullable_dtypes after all + # However, all NA -> Float64 not StringArray + if ( + result.dtype == np.object_ + and self.use_nullable_dtypes + and 0 < na_count < len(result) + ): + try: + result = StringArray(result) + except ValueError: + pass return result, na_count From dc03d0ffbdf419c7240a24490511f6f7795b0486 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 5 May 2021 18:04:34 -0700 Subject: [PATCH 10/26] Preliminary StringArray support --- pandas/_libs/lib.pyx | 13 +++++++++++-- pandas/core/arrays/string_.py | 16 ++++++++++++---- pandas/tests/io/parser/test_na_values.py | 1 - 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8d6b5cac5847c..a61fa102cfd90 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -679,11 +679,14 @@ cpdef ndarray[object] ensure_string_array( arr, object na_value=np.nan, bint convert_na_value=True, + bint coerce=True, bint copy=True, bint skipna=True, ): """ - Returns a new numpy array with object dtype and only strings and na values. + Checks that all elements in numpy are string or null and returns a new numpy array + with object dtype and only strings and na values if so. Otherwise, + raise a ValueError. Parameters ---------- @@ -693,6 +696,9 @@ cpdef ndarray[object] ensure_string_array( The value to use for na. For example, np.nan or pd.NA. convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. + coerce : bool, default True + Whether to coerce non-null non-string elements to strings. + Will raise ValueError otherwise. copy : bool, default True Whether to ensure that a new array is returned. skipna : bool, default True @@ -724,7 +730,10 @@ cpdef ndarray[object] ensure_string_array( continue if not checknull(val): - result[i] = str(val) + if coerce: + result[i] = str(val) + else: + raise ValueError("Non-string element encountered in array.") else: if convert_na_value: val = na_value diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 74ca5130ca322..55ef2dde65674 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -208,21 +208,29 @@ def __init__(self, values, copy=False): values = extract_array(values) super().__init__(values, copy=copy) + if not isinstance(values, type(self)): + self._validate() # error: Incompatible types in assignment (expression has type "StringDtype", # variable has type "PandasDtype") NDArrayBacked.__init__(self, self._ndarray, StringDtype()) - if not isinstance(values, type(self)): - self._validate() def _validate(self): """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." ) + try: + NDArrayBacked.__init__( + self, + lib.ensure_string_array( + self._ndarray, na_value=StringDtype.na_value, coerce=False + ), + StringDtype(), + ) + except ValueError: + raise ValueError("StringArray requires a sequence of strings or pandas.NA") @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index a22e380076bf3..f63476b5a4dd6 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -283,7 +283,6 @@ def test_string_na_values(all_parsers, use_nullable_dtypes, expected): hey,hello,hey,NA""" parser = all_parsers result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes) - print(result) tm.assert_frame_equal(result, expected) From 99afeb4afd361ad369c0dc42a90b3068e86a8d17 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 5 May 2021 21:20:25 -0700 Subject: [PATCH 11/26] Fix tests --- pandas/core/arrays/string_.py | 7 ++++--- pandas/tests/arrays/string_/test_string.py | 13 ++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 55ef2dde65674..875c64fa76273 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -222,11 +222,12 @@ def _validate(self): f"'{self._ndarray.dtype}' dtype instead." ) try: + lib.ensure_string_array( + self._ndarray, na_value=StringDtype.na_value, coerce=False, copy=False + ), NDArrayBacked.__init__( self, - lib.ensure_string_array( - self._ndarray, na_value=StringDtype.na_value, coerce=False - ), + self._ndarray, StringDtype(), ) except ValueError: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 17d05ebeb0fc5..7feb22f69632a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -296,14 +296,13 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.nan], dtype=object)) - - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", None], dtype=object)) - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", pd.NaT], dtype=object)) +@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA]) +def test_constructor_nan_like(na): + expected = pd.arrays.StringArray(np.array(["a", pd.NA])) + tm.assert_extension_array_equal( + pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected + ) @pytest.mark.parametrize("copy", [True, False]) From 3e1784de50a9305dbccb185d4b2830a4c4addbed Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 May 2021 16:19:47 -0700 Subject: [PATCH 12/26] API: allow nan-likes in StringArray constructor --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/lib.pyx | 24 ++++++++++++++------ pandas/core/arrays/string_.py | 26 +++++++++++++++++----- pandas/tests/arrays/string_/test_string.py | 12 +++++----- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 5adc8540e6864..fd246cb554d7f 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -612,6 +612,7 @@ Other API changes - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects. - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`) +- :class:`StringArray` now accepts nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")) in its constructor in addition to strings. Build ===== diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1cb744c7033c..fcb6d39bfc91f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -679,11 +679,14 @@ cpdef ndarray[object] ensure_string_array( arr, object na_value=np.nan, bint convert_na_value=True, + bint coerce=True, bint copy=True, bint skipna=True, ): """ - Returns a new numpy array with object dtype and only strings and na values. + Checks that all elements in numpy are string or null and returns a new numpy array + with object dtype and only strings and na values if so. Otherwise, + raise a ValueError. Parameters ---------- @@ -693,6 +696,9 @@ cpdef ndarray[object] ensure_string_array( The value to use for na. For example, np.nan or pd.NA. convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. + coerce : bool, default True + Whether to coerce non-null non-string elements to strings. + Will raise ValueError otherwise. copy : bool, default True Whether to ensure that a new array is returned. skipna : bool, default True @@ -724,7 +730,10 @@ cpdef ndarray[object] ensure_string_array( continue if not checknull(val): - result[i] = str(val) + if coerce: + result[i] = str(val) + else: + raise ValueError("Non-string element encountered in array.") else: if convert_na_value: val = na_value @@ -1835,10 +1844,6 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) - cdef bint is_valid_null(self, object value) except -1: - # We deliberately exclude None / NaN here since StringArray uses NA - return value is C_NA - cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: @@ -2059,7 +2064,7 @@ def maybe_convert_numeric( upcasting for ints with nulls to float64. Returns ------- - np.ndarray + np.ndarray or tuple of converted values and its mask Array of converted object values to numerical ones. Optional[np.ndarray] @@ -2224,6 +2229,11 @@ def maybe_convert_numeric( if allow_null_in_int and seen.null_ and not seen.int_: seen.float_ = True + # This occurs since we disabled float nulls showing as null in anticipation + # of seeing ints that were never seen. So then, we return float + if allow_null_in_int and seen.null_ and not seen.int_: + seen.float_ = True + if seen.complex_: return (complexes, None) elif seen.float_: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 74ca5130ca322..c30d4b8ba7b41 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -144,11 +144,18 @@ class StringArray(PandasArray): .. warning:: Currently, this expects an object-dtype ndarray - where the elements are Python strings or :attr:`pandas.NA`. + where the elements are Python strings + or nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")). This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of creating a `StringArray` from any sequence. + .. versionchanged:: 1.3 + + StringArray now accepts nan-likes in the constructor in addition + to strings, whereas it only accepted strings and :attr:`pandas.NA` + before. + copy : bool, default False Whether to copy the array of data. @@ -208,21 +215,30 @@ def __init__(self, values, copy=False): values = extract_array(values) super().__init__(values, copy=copy) + if not isinstance(values, type(self)): + self._validate() # error: Incompatible types in assignment (expression has type "StringDtype", # variable has type "PandasDtype") NDArrayBacked.__init__(self, self._ndarray, StringDtype()) - if not isinstance(values, type(self)): - self._validate() def _validate(self): """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." ) + try: + lib.ensure_string_array( + self._ndarray, na_value=StringDtype.na_value, coerce=False, copy=False + ), + NDArrayBacked.__init__( + self, + self._ndarray, + StringDtype(), + ) + except ValueError: + raise ValueError("StringArray requires a sequence of strings or pandas.NA") @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 17d05ebeb0fc5..722aada176c44 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -297,13 +297,15 @@ def test_constructor_raises(cls): cls(np.array([])) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.nan], dtype=object)) + cls(np.array(["a", None])) - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", None], dtype=object)) - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", pd.NaT], dtype=object)) +@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA]) +def test_constructor_nan_like(na): + expected = pd.arrays.StringArray(np.array(["a", pd.NA])) + tm.assert_extension_array_equal( + pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected + ) @pytest.mark.parametrize("copy", [True, False]) From 96ff1da535cd571cd45cb60d4cd1fdb47744f79e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 10 May 2021 19:31:47 -0700 Subject: [PATCH 13/26] Revert weird changes & Fix stuff --- pandas/_libs/lib.pyi | 1 + pandas/_libs/lib.pyx | 7 +------ pandas/tests/arrays/string_/test_string.py | 2 +- pandas/tests/dtypes/test_inference.py | 7 ++++--- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 9dbc47f1d40f7..22990361bc52e 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -138,6 +138,7 @@ def ensure_string_array( arr, na_value: object = np.nan, convert_na_value: bool = True, + coerce: bool = True, copy: bool = True, skipna: bool = True, ) -> np.ndarray: ... # np.ndarray[object] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index fcb6d39bfc91f..b1523421e59fd 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2064,7 +2064,7 @@ def maybe_convert_numeric( upcasting for ints with nulls to float64. Returns ------- - np.ndarray or tuple of converted values and its mask + np.ndarray Array of converted object values to numerical ones. Optional[np.ndarray] @@ -2229,11 +2229,6 @@ def maybe_convert_numeric( if allow_null_in_int and seen.null_ and not seen.int_: seen.float_ = True - # This occurs since we disabled float nulls showing as null in anticipation - # of seeing ints that were never seen. So then, we return float - if allow_null_in_int and seen.null_ and not seen.int_: - seen.float_ = True - if seen.complex_: return (complexes, None) elif seen.float_: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 722aada176c44..b3bc3b09e047a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -297,7 +297,7 @@ def test_constructor_raises(cls): cls(np.array([])) with pytest.raises(ValueError, match=msg): - cls(np.array(["a", None])) + cls(np.array(["a", np.nan])) @pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA]) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 076cc155f3626..73e87c75ee621 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1376,11 +1376,12 @@ def test_is_string_array(self): assert lib.is_string_array( np.array(["foo", "bar", pd.NA], dtype=object), skipna=True ) - # NaN is not valid for string array, just NA - assert not lib.is_string_array( + assert lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) - + assert not lib.is_string_array( + np.array(["foo", "bar", np.nan], dtype=object), skipna=False + ) assert not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self): From 418e1d201ad0c20b9c5119fff34567fe72158ec2 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 11 May 2021 07:01:06 -0700 Subject: [PATCH 14/26] Remove failing test --- pandas/tests/arrays/string_/test_string.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b3bc3b09e047a..7feb22f69632a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -296,9 +296,6 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) - with pytest.raises(ValueError, match=msg): - cls(np.array(["a", np.nan])) - @pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA]) def test_constructor_nan_like(na): From 25a6c4d2ec9287b5b0a341c3cdd583cc3659a276 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 19 May 2021 16:23:41 -0700 Subject: [PATCH 15/26] Changes from code review --- pandas/_libs/lib.pyi | 3 +-- pandas/_libs/lib.pyx | 24 ++++++++++++++---------- pandas/core/arrays/string_.py | 9 ++------- pandas/core/arrays/string_arrow.py | 2 +- pandas/core/dtypes/cast.py | 4 ++-- 5 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 22990361bc52e..966fd0cd4c008 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -137,8 +137,7 @@ def maybe_convert_numeric( def ensure_string_array( arr, na_value: object = np.nan, - convert_na_value: bool = True, - coerce: bool = True, + coerce: str = "all, copy: bool = True, skipna: bool = True, ) -> np.ndarray: ... # np.ndarray[object] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b1523421e59fd..fc3d73f332646 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -678,15 +678,14 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray: cpdef ndarray[object] ensure_string_array( arr, object na_value=np.nan, - bint convert_na_value=True, - bint coerce=True, + coerce="all", bint copy=True, bint skipna=True, ): """ - Checks that all elements in numpy are string or null and returns a new numpy array - with object dtype and only strings and na values if so. Otherwise, - raise a ValueError. + Checks that all elements in numpy array are string or null + and returns a new numpy array with object dtype + and only strings and na values if so. Otherwise, raise a ValueError. Parameters ---------- @@ -696,9 +695,14 @@ cpdef ndarray[object] ensure_string_array( The value to use for na. For example, np.nan or pd.NA. convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. - coerce : bool, default True - Whether to coerce non-null non-string elements to strings. - Will raise ValueError otherwise. + coerce : {{'all', 'null', 'non-null', None}}, default 'all' + Whether to coerce non-string elements to strings. + - 'all' will convert null values and non-null non-string values. + - 'null' will only convert nulls without converting other non-strings. + - 'non-null' will only convert non-null non-string elements to string. + - None will not convert anything. + If coerce is not all, a ValueError will be raised for values + that are not strings or na_value. copy : bool, default True Whether to ensure that a new array is returned. skipna : bool, default True @@ -730,12 +734,12 @@ cpdef ndarray[object] ensure_string_array( continue if not checknull(val): - if coerce: + if coerce =="all" or coerce == "non-null": result[i] = str(val) else: raise ValueError("Non-string element encountered in array.") else: - if convert_na_value: + if coerce=="all" or coerce == "null": val = na_value if skipna: result[i] = val diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c30d4b8ba7b41..289204c9aa4e5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -230,13 +230,8 @@ def _validate(self): ) try: lib.ensure_string_array( - self._ndarray, na_value=StringDtype.na_value, coerce=False, copy=False + self._ndarray, na_value=StringDtype.na_value, coerce="null", copy=False ), - NDArrayBacked.__init__( - self, - self._ndarray, - StringDtype(), - ) except ValueError: raise ValueError("StringArray requires a sequence of strings or pandas.NA") @@ -251,7 +246,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): # avoid costly conversion to object dtype na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result = lib.ensure_string_array(result, copy=copy, coerce="non-null") result[na_values] = StringDtype.na_value else: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 219a8c7ec0b82..42b7bf1a52513 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -247,7 +247,7 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) # numerical issues with Float32Dtype na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result = lib.ensure_string_array(result, copy=copy, coerce="non-null") return cls(pa.array(result, mask=na_values, type=pa.string())) # convert non-na-likes to str diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 46dc97214e2f6..1e8c09136e223 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1125,7 +1125,7 @@ def astype_nansafe( return arr.astype(dtype, copy=copy) if issubclass(dtype.type, str): - return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False) + return lib.ensure_string_array(arr, skipna=skipna, convert_na_value="non-null") elif is_datetime64_dtype(arr): if dtype == np.int64: @@ -1925,7 +1925,7 @@ def construct_1d_ndarray_preserving_na( """ if dtype is not None and dtype.kind == "U": - subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy) + subarr = lib.ensure_string_array(values, coerce="non-null", copy=copy) else: if dtype is not None: _disallow_mismatched_datetimelike(values, dtype) From 8257dbd739a4b6f12b737f89da317a24d3f8b07f Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 20 May 2021 14:32:58 -0700 Subject: [PATCH 16/26] typo --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c256f20527ad6..46af33b724d2a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1125,7 +1125,7 @@ def astype_nansafe( return arr.astype(dtype, copy=copy) if issubclass(dtype.type, str): - return lib.ensure_string_array(arr, skipna=skipna, convert_na_value="non-null") + return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null") elif is_datetime64_dtype(arr): if dtype == np.int64: From 922436a78903dfa55cd1d54d4381477cad934af5 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 21 May 2021 13:30:08 -0700 Subject: [PATCH 17/26] Update lib.pyi --- pandas/_libs/lib.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 1e49ce67f7cec..726b306e71fd5 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -146,7 +146,7 @@ def maybe_convert_numeric( def ensure_string_array( arr, na_value: object = np.nan, - coerce: str = "all, + coerce: str = "all", copy: bool = True, skipna: bool = True, ) -> np.ndarray: ... # np.ndarray[object] From 2f28086a0f23bf2b30d79ca41aaab0abb3ca370b Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 29 May 2021 11:03:33 -0700 Subject: [PATCH 18/26] Update lib.pyx --- pandas/_libs/lib.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8df50a32ae482..99872d2f9e91f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1849,7 +1849,11 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) - + + cdef bint is_valid_null(self, object value) except -1: + # Override to exclude float('Nan') and complex NaN + return value is None or value is C_NA or np.isnan(value) + cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: From 3ee219815e619fb57edeee0c295ba36e84232e0a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 29 May 2021 11:05:19 -0700 Subject: [PATCH 19/26] Update lib.pyx --- pandas/_libs/lib.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 99872d2f9e91f..ce70d15c202f5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -693,8 +693,6 @@ cpdef ndarray[object] ensure_string_array( The values to be converted to str, if needed. na_value : Any, default np.nan The value to use for na. For example, np.nan or pd.NA. - convert_na_value : bool, default True - If False, existing na values will be used unchanged in the new array. coerce : {{'all', 'null', 'non-null', None}}, default 'all' Whether to coerce non-string elements to strings. - 'all' will convert null values and non-null non-string values. @@ -1849,11 +1847,11 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) - + cdef bint is_valid_null(self, object value) except -1: # Override to exclude float('Nan') and complex NaN return value is None or value is C_NA or np.isnan(value) - + cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: From 3ee55f25a94a12da069a387a150164538394d460 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 29 May 2021 21:21:57 -0700 Subject: [PATCH 20/26] Updates --- pandas/tests/arrays/string_/test_string.py | 5 ++++- pandas/tests/dtypes/test_inference.py | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a246be938aef0..af57aff03b073 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -289,8 +289,11 @@ def test_constructor_raises(cls): with pytest.raises(ValueError, match=msg): cls(np.array([])) + with pytest.raises(ValueError, match=msg): + cls(np.array(["a", pd.NaT], dtype=object)) + -@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA]) +@pytest.mark.parametrize("na", [np.nan, None, pd.NA]) def test_constructor_nan_like(na): expected = pd.arrays.StringArray(np.array(["a", pd.NA])) tm.assert_extension_array_equal( diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 987b3accbca2e..87a1be80e3639 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1391,6 +1391,12 @@ def test_is_string_array(self): assert lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) + assert lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=True + ) + assert not lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=False + ) assert not lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=False ) From fe4981a6337cd59ae68b1ff44ca0f9b600d2ee49 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 30 May 2021 06:18:55 -0700 Subject: [PATCH 21/26] Update lib.pyx --- pandas/_libs/lib.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a1e66a575097e..08d7a68cd0dc0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -731,7 +731,10 @@ cpdef ndarray[object] ensure_string_array( if isinstance(val, str): continue - if not checknull(val): + if not (val is None or val is C_NA or np.isnan(val)): + # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid + # If they are present, they are treated like a regular Python object + # and will either cause an exception to be raised or be coerced. if coerce =="all" or coerce == "non-null": result[i] = str(val) else: From a66948aa7aa21d057c322895b59ea9f8c79480cd Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 30 May 2021 09:26:52 -0700 Subject: [PATCH 22/26] Update lib.pyx --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 08d7a68cd0dc0..a987f47533259 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -731,7 +731,7 @@ cpdef ndarray[object] ensure_string_array( if isinstance(val, str): continue - if not (val is None or val is C_NA or np.isnan(val)): + if not (val is None or val is C_NA or val != val): # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid # If they are present, they are treated like a regular Python object # and will either cause an exception to be raised or be coerced. From e8527191d33ed9c4416d265b175822c19bd5b4ae Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 31 May 2021 09:29:58 -0700 Subject: [PATCH 23/26] Update lib.pyx --- pandas/_libs/lib.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a987f47533259..f39b1fbc49cdb 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -731,7 +731,7 @@ cpdef ndarray[object] ensure_string_array( if isinstance(val, str): continue - if not (val is None or val is C_NA or val != val): + if not (val is None or val is C_NA or val is np.nan): # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid # If they are present, they are treated like a regular Python object # and will either cause an exception to be raised or be coerced. @@ -1853,7 +1853,7 @@ cdef class StringValidator(Validator): cdef bint is_valid_null(self, object value) except -1: # Override to exclude float('Nan') and complex NaN - return value is None or value is C_NA or np.isnan(value) + return value is None or value is C_NA or value is np.nan cpdef bint is_string_array(ndarray values, bint skipna=False): From 91b73bb93aad90f26040c729b57d99ec26eb3941 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 4 Jun 2021 08:28:52 -0700 Subject: [PATCH 24/26] disallow invalid nans in stringarray constructor --- pandas/_libs/lib.pyx | 23 ++++++++++++++++------- pandas/core/arrays/string_.py | 7 +++++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f39b1fbc49cdb..e3fa8eeaa9b53 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -94,6 +94,7 @@ from pandas._libs.missing cimport ( is_null_timedelta64, isnaobj, ) +from pandas._libs.missing import checknull from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -696,10 +697,12 @@ cpdef ndarray[object] ensure_string_array( coerce : {{'all', 'null', 'non-null', None}}, default 'all' Whether to coerce non-string elements to strings. - 'all' will convert null values and non-null non-string values. - - 'null' will only convert nulls without converting other non-strings. + - 'strict-null' will only convert pd.NA, np.nan, or None to na_value + without converting other non-strings. + - 'null' will convert nulls to na_value w/out converting other non-strings. - 'non-null' will only convert non-null non-string elements to string. - None will not convert anything. - If coerce is not all, a ValueError will be raised for values + If coerce is not 'all', a ValueError will be raised for values that are not strings or na_value. copy : bool, default True Whether to ensure that a new array is returned. @@ -714,6 +717,7 @@ cpdef ndarray[object] ensure_string_array( """ cdef: Py_ssize_t i = 0, n = len(arr) + set strict_na_values = {C_NA, np.nan, None} if hasattr(arr, "to_numpy"): arr = arr.to_numpy() @@ -725,22 +729,27 @@ cpdef ndarray[object] ensure_string_array( if copy and result is arr: result = result.copy() + if coerce == 'strict-null': + # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid + # If they are present, they are treated like a regular Python object + # and will either cause an exception to be raised or be coerced. + check_null = strict_na_values.__contains__ + else: + check_null = checknull + for i in range(n): val = arr[i] if isinstance(val, str): continue - if not (val is None or val is C_NA or val is np.nan): - # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid - # If they are present, they are treated like a regular Python object - # and will either cause an exception to be raised or be coerced. + if not check_null(val): if coerce =="all" or coerce == "non-null": result[i] = str(val) else: raise ValueError("Non-string element encountered in array.") else: - if coerce=="all" or coerce == "null": + if coerce=="all" or coerce == "null" or coerce == 'strict-null': val = na_value if skipna: result[i] = val diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 79ddd12476323..d0ea1aa5c5293 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -230,8 +230,11 @@ def _validate(self): ) try: lib.ensure_string_array( - self._ndarray, na_value=StringDtype.na_value, coerce="null", copy=False - ), + self._ndarray, + na_value=StringDtype.na_value, + coerce="strict-null", + copy=False, + ) except ValueError: raise ValueError("StringArray requires a sequence of strings or pandas.NA") From 41f49d21d8da2bbdcc37d33714d009ea2b862049 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 4 Jun 2021 12:40:56 -0700 Subject: [PATCH 25/26] add to _from_sequence and fixes --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/arrays/string_.py | 18 ++++++++++++++---- pandas/core/arrays/string_arrow.py | 16 +++++++++++++--- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 93e27a7318f2d..4c5175b8e1bcc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -644,7 +644,7 @@ Other API changes - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects. - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`) -- :class:`StringArray` now accepts nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")) in its constructor in addition to strings. +- :class:`StringArray` now accepts nan-likes(``None``, ``nan``, ``NA``) in its constructor in addition to strings. - Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d0ea1aa5c5293..4d97035714ba3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -145,7 +145,7 @@ class StringArray(PandasArray): Currently, this expects an object-dtype ndarray where the elements are Python strings - or nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")). + or nan-likes(``None``, ``nan``, ``NA``). This may change without warning in the future. Use :meth:`pandas.array` with ``dtype="string"`` for a stable way of creating a `StringArray` from any sequence. @@ -239,7 +239,9 @@ def _validate(self): raise ValueError("StringArray requires a sequence of strings or pandas.NA") @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): + def _from_sequence( + cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True + ): if dtype: assert dtype == "string" @@ -247,15 +249,23 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype + if coerce: + coerce = "non-null" + else: + coerce = None na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, coerce="non-null") + result = lib.ensure_string_array(result, copy=copy, coerce=coerce) result[na_values] = StringDtype.na_value else: # convert non-na-likes to str, and nan-likes to StringDtype.na_value + if coerce: + coerce = "all" + else: + coerce = "strict-null" result = lib.ensure_string_array( - scalars, na_value=StringDtype.na_value, copy=copy + scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce ) # Manually creating new array avoids the validation step in the __init__, so is diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7aeadbb4c4616..f0af7a8a43594 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -237,7 +237,9 @@ def __init__(self, values): ) @classmethod - def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence( + cls, scalars, dtype: Dtype | None = None, copy: bool = False, coerce=True + ): from pandas.core.arrays.masked import BaseMaskedArray _chk_pyarrow_available() @@ -247,11 +249,19 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) # numerical issues with Float32Dtype na_values = scalars._mask result = scalars._data - result = lib.ensure_string_array(result, copy=copy, coerce="non-null") + if coerce: + coerce = "non-null" + else: + coerce = None + result = lib.ensure_string_array(result, copy=copy, coerce=coerce) return cls(pa.array(result, mask=na_values, type=pa.string())) # convert non-na-likes to str - result = lib.ensure_string_array(scalars, copy=copy) + if coerce: + coerce = "all" + else: + coerce = "strict-null" + result = lib.ensure_string_array(scalars, copy=copy, coerce=coerce) return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod From 033580ff373e626885295528e3b3db32d39d6016 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 4 Jun 2021 15:39:34 -0700 Subject: [PATCH 26/26] Update to make work --- doc/source/whatsnew/v1.3.0.rst | 2 -- pandas/_libs/parsers.pyx | 4 ++-- pandas/io/parsers/base_parser.py | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 44db989b33903..e690d5fc85785 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -225,8 +225,6 @@ Other enhancements - Add support for parsing ``ISO 8601``-like timestamps with negative signs to :class:`Timedelta` (:issue:`37172`) - Add support for unary operators in :class:`FloatingArray` (:issue:`38749`) - :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`) -- :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`) -- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) - :meth:`pandas.read_csv` now accepts an argument ``use_nullable_dtypes`` that allows reading data directly into the nullable integer and boolean data types (:issue:`36712`) - :meth:`Series.round` and :meth:`DataFrame.round` now work with nullable integer and floating dtypes (:issue:`38844`) - :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index bde6674dbda67..4c64a0e3d8479 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -114,8 +114,8 @@ from pandas.core.arrays import ( BooleanArray, FloatingArray, IntegerArray, - StringArray, ) +from pandas.core.arrays.string_ import StringDtype cdef: float64_t INF = np.inf @@ -1382,7 +1382,7 @@ def _maybe_upcast(arr, use_nullable_dtypes=False): elif use_nullable_dtypes and arr.dtype == np.object_: # Maybe convert StringArray & catch error for non-strings try: - arr = StringArray(arr) + arr = StringDtype.construct_array_type()._from_sequence(arr) except ValueError as e: pass diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index fc8f83bd29fe3..46e0875ab61ec 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -63,8 +63,8 @@ Categorical, FloatingArray, IntegerArray, - StringArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -757,7 +757,7 @@ def _infer_types(self, values, na_values, try_num_bool=True): and 0 < na_count < len(result) ): try: - result = StringArray(result) + result = StringDtype.construct_array_type()._from_sequence(result) except ValueError: pass