Update to use pd.NA instead of np.nan / _ufloat_nan

MichaelTiemannOSC · MichaelTiemannOSC · commit e1d367c483e9 · 2023-07-02T09:04:48.000-04:00
To resolve the question of the proper na_value for ExtensionArray (EA) dtypes (np.nan vs. the uncertainties-based _ufloat_nan), use the type-neutral pd.NA value.

Signed-off-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com>
diff --git a/pint_pandas/pint_array.py b/pint_pandas/pint_array.py
@@ -148,9 +148,7 @@ def name(self):
 
     @property
     def na_value(self):
-        if HAS_UNCERTAINTIES:
-            return self.ureg.Quantity(_ufloat_nan, self.units)
-        return self.ureg.Quantity(np.nan, self.units)
+        return self.ureg.Quantity(pd.NA, self.units)
 
     def __hash__(self):
         # make myself hashable
@@ -383,8 +381,7 @@ def isna(self):
             if len(self._data) == 0:
                 # True or False doesn't matter--we just need the value for the type
                 return np.full((0), True)
-            elif isinstance(self._data[0], UFloat):
-                return unp.isnan(self._data)
+            return self._data.map(lambda x: pd.isna(x) or (isinstance(x, UFloat) and unp.isnan(x)))
         return self._data.isna()
 
     def astype(self, dtype, copy=True):
@@ -537,6 +534,9 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
             dtype = PintType(master_scalar.units)
 
         def quantify_nan(item, promote_to_ufloat):
+            if pd.isna(item):
+                return dtype.ureg.Quantity(item, dtype.units)
+            # FIXME: most of this code is never executed (except the final return)
             if promote_to_ufloat:
                 if type(item) is UFloat:
                     return item * dtype.units
@@ -551,28 +551,31 @@ def quantify_nan(item, promote_to_ufloat):
             return item
 
         if isinstance(master_scalar, _Quantity):
+            # A quantified master_scalar does not guarantee that we don't have NA and/or np.nan values in our scalars
             if HAS_UNCERTAINTIES:
                 promote_to_ufloat = any(
-                    [isinstance(item.m, UFloat) for item in scalars]
+                    [isinstance(item.m, UFloat) for item in scalars if pd.notna(item)]
                 )
             else:
                 promote_to_ufloat = False
-            scalars = [quantify_nan(item, promote_to_ufloat) for item in scalars]
+            scalars = [item if isinstance(item, _Quantity) else quantify_nan(item, promote_to_ufloat) for item in scalars]
             scalars = [
                 (item.to(dtype.units).magnitude if hasattr(item, "to") else item)
                 for item in scalars
             ]
-        if HAS_UNCERTAINTIES:
+        elif HAS_UNCERTAINTIES:
             promote_to_ufloat = any([isinstance(item, UFloat) for item in scalars])
-            if promote_to_ufloat:
-                scalars = [
-                    item
-                    if isinstance(item, UFloat)
-                    else _ufloat_nan
-                    if np.isnan(item)
-                    else ufloat(item, 0)
-                    for item in scalars
-                ]
+        else:
+            promote_to_ufloat = False
+        if promote_to_ufloat:
+            scalars = [
+                item
+                if isinstance(item, UFloat)
+                else _ufloat_nan
+                if pd.isna(item)
+                else ufloat(item, 0)
+                for item in scalars
+            ]
         return cls(scalars, dtype=dtype, copy=copy)
 
     @classmethod
@@ -642,7 +645,7 @@ def factorize(
 
             codes = [-1] * len(self.data)
             # Note that item is a local variable provided in the loop below
-            vf = np.vectorize(lambda x: x == item, otypes=[bool])
+            vf = np.vectorize(lambda x: True if (x_na:=pd.isna(x))*(item_na:=pd.isna(item)) else (x_na==item_na and x==item), otypes=[bool])
             for code, item in enumerate(arr):
                 code_mask = vf(self._data)
                 codes = np.where(code_mask, code, codes)
@@ -663,7 +666,7 @@ def _values_for_factorize(self):
             for item in arr:
                 if item not in unique_data:
                     unique_data.append(item)
-            return np.array(unique_data), _ufloat_nan
+            return np.array(unique_data), pd.NA
         return arr._values_for_factorize()
 
     def value_counts(self, dropna=True):
@@ -690,19 +693,16 @@ def value_counts(self, dropna=True):
 
         # compute counts on the data with no nans
         data = self._data
+        nafilt = data.isna()
+        na_value = pd.NA
+        data = data[~nafilt]
         if HAS_UNCERTAINTIES and data.dtype.kind == "O":
-            nafilt = unp.isnan(data)
-            na_value = _ufloat_nan
-            data = data[~nafilt]
             unique_data = []
             for item in data:
                 if item not in unique_data:
                     unique_data.append(item)
             index = list(unique_data)
         else:
-            nafilt = np.isnan(data)
-            na_value = np.nan
-            data = data[~nafilt]
             index = list(set(data))
 
         data_list = data.tolist()
@@ -883,7 +883,7 @@ def __array__(self, dtype=None, copy=False):
 
     def _to_array_of_quantity(self, copy=False):
         qtys = [
-            self._Q(item, self._dtype.units) if not pd.isna(item) else item
+            self._Q(item, self._dtype.units) if item is not pd.NA else item
             for item in self._data
         ]
         with warnings.catch_warnings(record=True):
diff --git a/pint_pandas/testsuite/test_pandas_extensiontests.py b/pint_pandas/testsuite/test_pandas_extensiontests.py
@@ -193,9 +193,9 @@ def data_missing(numeric_dtype):
     numeric_dtype = dtypemap.get(numeric_dtype, numeric_dtype)
     if HAS_UNCERTAINTIES:
         numeric_dtype = None
-        dm = [_ufloat_nan, ufloat(1, 0)]
+        dm = [pd.NA, ufloat(1, 0)]
     else:
-        dm = [np.nan, 1]
+        dm = [pd.NA, 1]
     return PintArray.from_1darray_quantity(
         ureg.Quantity(pd.array(dm, dtype=numeric_dtype), ureg.meter)
     )
@@ -260,9 +260,9 @@ def data_missing_for_sorting(numeric_dtype):
     numeric_dtype = dtypemap.get(numeric_dtype, numeric_dtype)
     if HAS_UNCERTAINTIES:
         numeric_dtype = None
-        dms = [ufloat(4, 0), _ufloat_nan, ufloat(-5, 0)]
+        dms = [ufloat(4, 0), pd.NA, ufloat(-5, 0)]
     else:
-        dms = [4, np.nan, -5]
+        dms = [4, pd.NA, -5]
     return PintArray.from_1darray_quantity(
         ureg.Quantity(pd.array(dms, dtype=numeric_dtype), ureg.centimeter)
     )
@@ -271,8 +271,6 @@ def data_missing_for_sorting(numeric_dtype):
 @pytest.fixture
 def na_cmp():
     """Binary operator for comparing NA values."""
-    if HAS_UNCERTAINTIES:
-        return lambda x, y: bool(unp.isnan(x.magnitude)) & bool(unp.isnan(y.magnitude))
     return lambda x, y: bool(pd.isna(x.magnitude)) & bool(pd.isna(y.magnitude))
 
 
@@ -286,12 +284,11 @@ def data_for_grouping(numeric_dtype):
     a = 1.0
     b = 2.0**32 + 1
     c = 2.0**32 + 10
-    _n = np.nan
+    _n = pd.NA
     if HAS_UNCERTAINTIES:
         a = a + ufloat(0, 0)
         b = b + ufloat(0, 0)
         c = c + ufloat(0, 0)
-        _n = _ufloat_nan
         numeric_dtype = None
     else:
         numeric_dtype = dtypemap.get(numeric_dtype, numeric_dtype)