From 266603fbdfe6438044815d277370ae0dcb3cf583 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 13 Nov 2019 15:32:02 -0600 Subject: [PATCH 1/5] wip --- pandas/_libs/lib.pyx | 4 +- pandas/_libs/tslibs/nattype.pyx | 3 ++ pandas/core/arrays/numpy_.py | 3 ++ pandas/core/arrays/string_.py | 43 +++++++++++----------- pandas/core/dtypes/missing.py | 4 ++ pandas/tests/arrays/string_/test_string.py | 2 +- pandas/tests/extension/test_string.py | 8 ++-- 7 files changed, 40 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7d65cb52bce1e..de44fe3666390 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -49,6 +49,7 @@ cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 cimport pandas._libs.util as util +from pandas.core.na_scalar import NA from pandas._libs.util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN from pandas._libs.tslib import array_to_datetime @@ -156,6 +157,7 @@ def is_scalar(val: object) -> bool: return (cnp.PyArray_IsAnyScalar(val) # PyArray_IsAnyScalar is always False for bytearrays on Py3 + or val is NA or PyDate_Check(val) or PyDelta_Check(val) or PyTime_Check(val) @@ -1500,7 +1502,7 @@ cdef class Validator: f'must define is_value_typed') cdef bint is_valid_null(self, object value) except -1: - return value is None or util.is_nan(value) + return value is None or value is NA or util.is_nan(value) cdef bint is_array_typed(self) except -1: return False diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index e491d6111a919..49843152fb4bf 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -12,6 +12,7 @@ cimport numpy as cnp from numpy cimport int64_t cnp.import_array() +from pandas.core.na_scalar import NA from pandas._libs.tslibs.np_datetime cimport ( get_datetime64_value, get_timedelta64_value) cimport pandas._libs.tslibs.util as util @@ -799,6 +800,8 @@ cpdef bint is_null_datetimelike(object val, bint inat_is_null=True): """ if val is None: return True + elif val is NA: + return True elif val is c_NaT: return True elif util.is_float_object(val) or util.is_complex_object(val): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 6f2bb095a014d..8ba5cd7565850 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -278,6 +278,9 @@ def fillna(self, value=None, method=None, limit=None): return new_values def take(self, indices, allow_fill=False, fill_value=None): + if fill_value is None: + # Primarily for subclasses + fill_value = self.dtype.na_value result = take( self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value ) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7c487b227de20..de2c324225025 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,12 +1,12 @@ import operator -from typing import TYPE_CHECKING, Type +from typing import Type import numpy as np from pandas._libs import lib from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.common import is_float_dtype, pandas_dtype from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like @@ -16,9 +16,7 @@ from pandas.core.arrays import PandasArray from pandas.core.construction import extract_array from pandas.core.missing import isna - -if TYPE_CHECKING: - from pandas._typing import Scalar +from pandas.core.na_scalar import NA @register_extension_dtype @@ -50,16 +48,8 @@ class StringDtype(ExtensionDtype): StringDtype """ - @property - def na_value(self) -> "Scalar": - """ - StringDtype uses :attr:`numpy.nan` as the missing NA value. - - .. warning:: - - `na_value` may change in a future release. - """ - return np.nan + #: StringDtype.na_value uses pandas.NA + na_value = NA @property def type(self) -> Type: @@ -172,10 +162,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" result = super()._from_sequence(scalars, dtype=object, copy=copy) - # convert None to np.nan + # convert None to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. - result[result.isna()] = np.nan + result[result.isna()] = StringDtype.na_value return result @classmethod @@ -192,6 +182,12 @@ def __arrow_array__(self, type=None): type = pa.string() return pa.array(self._ndarray, type=type, from_pandas=True) + def _values_for_factorize(self): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = -1 + return arr, -1 + def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): @@ -205,9 +201,14 @@ def __setitem__(self, key, value): # validate new items if scalar_value: - if scalar_value is None: - value = np.nan - elif not (isinstance(value, str) or np.isnan(value)): + if ( + value is NA + or value is None + or is_float_dtype(value) + and np.isnan(value) + ): + value = StringDtype.na_value + elif not (isinstance(value, str) or (value is NA)): raise ValueError( "Cannot set non-string value '{}' into a StringArray.".format(value) ) @@ -265,7 +266,7 @@ def method(self, other): other = other[valid] result = np.empty_like(self._ndarray, dtype="object") - result[mask] = np.nan + result[mask] = StringDtype.na_value result[valid] = op(self._ndarray[valid], other) if op.__name__ in {"add", "radd", "mul", "rmul"}: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 0a8f636b4cb2a..693ee06fc1acf 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -9,6 +9,7 @@ import pandas._libs.missing as libmissing from pandas._libs.tslibs import NaT, iNaT +from ..na_scalar import NA from .common import ( _NS_DTYPE, _TD_DTYPE, @@ -440,6 +441,9 @@ def array_equivalent(left, right, strict_nan=False): if left_value is NaT and right_value is not NaT: return False + elif left_value is NA and right_value is not NA: + return False + elif isinstance(left_value, float) and np.isnan(left_value): if not isinstance(right_value, float) or not np.isnan(right_value): return False diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index efe2b4e0b2deb..186c47423b20d 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -12,7 +12,7 @@ def test_none_to_nan(): a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) assert a[1] is not None - assert np.isnan(a[1]) + assert a[1] is pd.NA def test_setitem_validates(): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 5b872d5b72227..471a1b79d23bc 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -25,7 +25,7 @@ def data(): @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return StringArray._from_sequence([np.nan, "A"]) + return StringArray._from_sequence([pd.NA, "A"]) @pytest.fixture @@ -35,17 +35,17 @@ def data_for_sorting(): @pytest.fixture def data_missing_for_sorting(): - return StringArray._from_sequence(["B", np.nan, "A"]) + return StringArray._from_sequence(["B", pd.NA, "A"]) @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture def data_for_grouping(): - return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"]) + return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) class TestDtype(base.BaseDtypeTests): From e83a8a9a5807d5dd9c02e5bafd8a4bd18f191311 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 13 Nov 2019 15:58:34 -0600 Subject: [PATCH 2/5] update --- pandas/_libs/lib.pyx | 4 +--- pandas/_libs/testing.pyx | 8 +++++--- pandas/_libs/tslibs/nattype.pyx | 3 --- pandas/core/arrays/string_.py | 16 +++++++++------- pandas/core/dtypes/missing.py | 5 +++-- pandas/tests/arrays/string_/test_string.py | 9 +++++++++ 6 files changed, 27 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b942a8417cf01..e2fb4a3635387 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -49,7 +49,6 @@ cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 cimport pandas._libs.util as util -from pandas.core.na_scalar import NA from pandas._libs.util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN from pandas._libs.tslib import array_to_datetime @@ -156,7 +155,6 @@ def is_scalar(val: object) -> bool: return (cnp.PyArray_IsAnyScalar(val) # PyArray_IsAnyScalar is always False for bytearrays on Py3 - or val is NA or PyDate_Check(val) or PyDelta_Check(val) or PyTime_Check(val) @@ -1502,7 +1500,7 @@ cdef class Validator: f'must define is_value_typed') cdef bint is_valid_null(self, object value) except -1: - return value is None or value is NA or util.is_nan(value) + return value is None or value is C_NA or util.is_nan(value) cdef bint is_array_typed(self) except -1: return False diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index f848310d961e1..adbc25b96ea42 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -180,13 +180,15 @@ cpdef assert_almost_equal(a, b, # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) - if a == b: - # object comparison - return True if isna(a) and isna(b): # TODO: Should require same-dtype NA? # nan / None comparison return True + + if a == b: + # object comparison + return True + if is_comparable_as_number(a) and is_comparable_as_number(b): if array_equivalent(a, b, strict_nan=True): # inf comparison diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 49843152fb4bf..e491d6111a919 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -12,7 +12,6 @@ cimport numpy as cnp from numpy cimport int64_t cnp.import_array() -from pandas.core.na_scalar import NA from pandas._libs.tslibs.np_datetime cimport ( get_datetime64_value, get_timedelta64_value) cimport pandas._libs.tslibs.util as util @@ -800,8 +799,6 @@ cpdef bint is_null_datetimelike(object val, bint inat_is_null=True): """ if val is None: return True - elif val is NA: - return True elif val is c_NaT: return True elif util.is_float_object(val) or util.is_complex_object(val): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index de2c324225025..86114774b47ae 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -3,7 +3,7 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import is_float_dtype, pandas_dtype @@ -16,7 +16,6 @@ from pandas.core.arrays import PandasArray from pandas.core.construction import extract_array from pandas.core.missing import isna -from pandas.core.na_scalar import NA @register_extension_dtype @@ -49,7 +48,7 @@ class StringDtype(ExtensionDtype): """ #: StringDtype.na_value uses pandas.NA - na_value = NA + na_value = libmissing.NA @property def type(self) -> Type: @@ -202,13 +201,16 @@ def __setitem__(self, key, value): # validate new items if scalar_value: if ( - value is NA + value is libmissing.NA or value is None - or is_float_dtype(value) - and np.isnan(value) + or ( + is_float_dtype(value) + and not isinstance(value, str) + and np.isnan(value) + ) ): value = StringDtype.na_value - elif not (isinstance(value, str) or (value is NA)): + elif not (isinstance(value, str) or (value is libmissing.NA)): raise ValueError( "Cannot set non-string value '{}' into a StringArray.".format(value) ) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 693ee06fc1acf..e5234c06c9ba0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -9,7 +9,6 @@ import pandas._libs.missing as libmissing from pandas._libs.tslibs import NaT, iNaT -from ..na_scalar import NA from .common import ( _NS_DTYPE, _TD_DTYPE, @@ -441,7 +440,7 @@ def array_equivalent(left, right, strict_nan=False): if left_value is NaT and right_value is not NaT: return False - elif left_value is NA and right_value is not NA: + elif left_value is libmissing.NA and right_value is not libmissing.NA: return False elif isinstance(left_value, float) and np.isnan(left_value): @@ -455,6 +454,8 @@ def array_equivalent(left, right, strict_nan=False): if "Cannot compare tz-naive" in str(err): # tzawareness compat failure, see GH#28507 return False + elif "boolean value of NA is ambiguous" in str(err): + return False raise return True diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 186c47423b20d..3029f3007929b 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -24,6 +24,15 @@ def test_setitem_validates(): a[:] = np.array([1, 2]) +def test_setitem_with_scalar_string(): + # is_float_dtype considers some strings, like 'd', to be floats + # which can cause issues. + arr = pd.array(["a", "c"], dtype="string") + arr[0] = "d" + expected = pd.array(["d", "c"], dtype="string") + tm.assert_extension_array_equal(arr, expected) + + @pytest.mark.parametrize( "input, method", [ From 004e42f665b617116e7e01a03e863e6862149051 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 13 Nov 2019 16:17:01 -0600 Subject: [PATCH 3/5] fixup --- pandas/core/strings.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 7194d1cf08e4a..2b2604a001e52 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -123,6 +123,10 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): if na_mask: mask = isna(arr) convert = not np.all(mask) + if convert: + # NA -> nan + arr[mask] = np.nan + try: result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) except (TypeError, AttributeError) as e: From 1660769183ac4fcc64dfb1a4decc93fee107a2eb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 14 Nov 2019 07:39:29 -0600 Subject: [PATCH 4/5] simplify --- pandas/core/arrays/string_.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 86114774b47ae..bb2a9a22e601f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -6,7 +6,7 @@ from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.common import is_float_dtype, pandas_dtype +from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like @@ -161,7 +161,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" result = super()._from_sequence(scalars, dtype=object, copy=copy) - # convert None to NA + # Standardize all missing-like values to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. result[result.isna()] = StringDtype.na_value @@ -200,17 +200,9 @@ def __setitem__(self, key, value): # validate new items if scalar_value: - if ( - value is libmissing.NA - or value is None - or ( - is_float_dtype(value) - and not isinstance(value, str) - and np.isnan(value) - ) - ): + if isna(value): value = StringDtype.na_value - elif not (isinstance(value, str) or (value is libmissing.NA)): + elif not isinstance(value, str): raise ValueError( "Cannot set non-string value '{}' into a StringArray.".format(value) ) From c357f8cf201c4c6f53034babc024357e66d2a0b2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 14 Nov 2019 07:43:58 -0600 Subject: [PATCH 5/5] add note and failing test --- pandas/core/strings.py | 3 +++ pandas/tests/test_strings.py | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2b2604a001e52..0a0f1c4d2527e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -124,6 +124,9 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): mask = isna(arr) convert = not np.all(mask) if convert: + # XXX: This converts pd.NA to np.nan to match the output of + # object-dtype ops that return numeric, like str.count + # We probably want to return Int64Dtype instead. # NA -> nan arr[mask] = np.nan diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f5d28ec82d1d4..9b8ccdf4a7d9e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3507,3 +3507,11 @@ def test_string_array(any_string_method): assert all(result[columns].dtypes == "string") result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) + + +@pytest.mark.xfail(reason="not implmented yet") +def test_string_dtype_numeric(): + s = Series(["a", "aa", None], dtype="string") + result = s.str.count("a") + expected = Series([1, 2, None], dtype="Int64") + tm.assert_series_equal(result, expected)