Skip to content

Commit 9c7971b

Browse files
Update to Pandas 2.1.0.dev0+1415.g9cfedf48fa
Additional code synchronizations (and the addition of a dtype-preserving map method). These changes were initially developed to support uncertainties, but the uncertainty changes have all been stripped out to simplify merging of underlying code. Once these changes are fully synced with a release version of Pandas 2.1, we can look at adding back uncertainties. These changes also tolerate complex128 as a base type for magnitudes, with one exception (under discussion as pandas-dev/pandas#54445). Signed-off-by: Michael Tiemann <[email protected]>
1 parent bf84e37 commit 9c7971b

File tree

2 files changed

+112
-32
lines changed

2 files changed

+112
-32
lines changed

pint_pandas/pint_array.py

+94-29
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import numpy as np
77
import pandas as pd
88
import pint
9-
from pandas import DataFrame, Series
9+
from pandas import DataFrame, Series, Index
1010
from pandas.api.extensions import (
1111
ExtensionArray,
1212
ExtensionDtype,
@@ -27,6 +27,8 @@
2727
# quantify/dequantify
2828
NO_UNIT = "No Unit"
2929

30+
# from pint.facets.plain.quantity import PlainQuantity as _Quantity
31+
# from pint.facets.plain.unit import PlainUnit as _Unit
3032

3133
class PintType(ExtensionDtype):
3234
"""
@@ -65,7 +67,7 @@ def __new__(cls, units=None):
6567
if not isinstance(units, _Unit):
6668
units = cls._parse_dtype_strict(units)
6769
# ureg.unit returns a quantity with a magnitude of 1
68-
# eg 1 mm. Initialising a quantity and taking it's unit
70+
# eg 1 mm. Initialising a quantity and taking its unit
6971
# TODO: Separate units from quantities in pint
7072
# to simplify this bit
7173
units = cls.ureg.Quantity(1, units).units
@@ -195,8 +197,8 @@ def __repr__(self):
195197
float: pd.Float64Dtype(),
196198
np.float64: pd.Float64Dtype(),
197199
np.float32: pd.Float32Dtype(),
198-
np.complex128: pd.core.dtypes.dtypes.PandasDtype("complex128"),
199-
np.complex64: pd.core.dtypes.dtypes.PandasDtype("complex64"),
200+
np.complex128: pd.core.dtypes.dtypes.NumpyEADtype("complex128"),
201+
np.complex64: pd.core.dtypes.dtypes.NumpyEADtype("complex64"),
200202
# np.float16: pd.Float16Dtype(),
201203
}
202204
dtypeunmap = {v: k for k, v in dtypemap.items()}
@@ -250,7 +252,6 @@ def __init__(self, values, dtype=None, copy=False):
250252
copy = False
251253
elif not isinstance(values, pd.core.arrays.numeric.NumericArray):
252254
values = pd.array(values, copy=copy)
253-
copy = False
254255
if copy:
255256
values = values.copy()
256257
self._data = values
@@ -309,12 +310,22 @@ def __setitem__(self, key, value):
309310
# doing nothing here seems to be ok
310311
return
311312

313+
master_scalar = None
314+
try:
315+
master_scalar = next(i for i in self._data if pd.notna(i))
316+
except StopIteration:
317+
pass
318+
312319
if isinstance(value, _Quantity):
313320
value = value.to(self.units).magnitude
314-
elif is_list_like(value) and len(value) > 0 and isinstance(value[0], _Quantity):
315-
value = [item.to(self.units).magnitude for item in value]
321+
elif is_list_like(value) and len(value) > 0:
322+
if isinstance(value[0], _Quantity):
323+
value = [item.to(self.units).magnitude for item in value]
324+
if len(value) == 1:
325+
value = value[0]
316326

317327
key = check_array_indexer(self, key)
328+
# Filter out invalid values for our array type(s)
318329
try:
319330
self._data[key] = value
320331
except IndexError as e:
@@ -458,7 +469,8 @@ def take(self, indices, allow_fill=False, fill_value=None):
458469
Examples
459470
--------
460471
"""
461-
from pandas.core.algorithms import take, is_scalar
472+
from pandas.core.algorithms import take
473+
from pandas.core.dtypes.common import is_scalar
462474

463475
data = self._data
464476
if allow_fill and fill_value is None:
@@ -470,7 +482,10 @@ def take(self, indices, allow_fill=False, fill_value=None):
470482
# magnitude is in fact an array scalar, which will get rejected by pandas.
471483
fill_value = fill_value[()]
472484

473-
result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
485+
with warnings.catch_warnings():
486+
warnings.simplefilter("ignore")
487+
# Turn off warning that PandasArray is deprecated for ``take``
488+
result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
474489

475490
return PintArray(result, dtype=self.dtype)
476491

@@ -512,22 +527,17 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
512527
raise ValueError(
513528
"Cannot infer dtype. No dtype specified and empty array"
514529
)
515-
if dtype is None and not isinstance(master_scalar, _Quantity):
516-
raise ValueError("No dtype specified and not a sequence of quantities")
517-
if dtype is None and isinstance(master_scalar, _Quantity):
530+
if dtype is None:
531+
if not isinstance(master_scalar, _Quantity):
532+
raise ValueError("No dtype specified and not a sequence of quantities")
518533
dtype = PintType(master_scalar.units)
519534

520-
def quantify_nan(item):
521-
if type(item) is float:
522-
return item * dtype.units
523-
return item
524-
525535
if isinstance(master_scalar, _Quantity):
526-
scalars = [quantify_nan(item) for item in scalars]
527536
scalars = [
528537
(item.to(dtype.units).magnitude if hasattr(item, "to") else item)
529538
for item in scalars
530539
]
540+
# When creating empty arrays, make them large enough to hold UFloats in case we need to do so later
531541
return cls(scalars, dtype=dtype, copy=copy)
532542

533543
@classmethod
@@ -538,10 +548,21 @@ def _from_sequence_of_strings(cls, scalars, dtype=None, copy=False):
538548

539549
@classmethod
540550
def _from_factorized(cls, values, original):
551+
from pandas._libs.lib import infer_dtype
552+
553+
if infer_dtype(values) != "object":
554+
values = pd.array(values, copy=False)
541555
return cls(values, dtype=original.dtype)
542556

543557
def _values_for_factorize(self):
544-
return self._data._values_for_factorize()
558+
# factorize can now handle differentiating various types of null values.
559+
# These can only occur when the array has object dtype.
560+
# However, for backwards compatibility we only use the null for the
561+
# provided dtype. This may be revisited in the future, see GH#48476.
562+
arr = self._data
563+
if arr.dtype.kind == "O":
564+
return np.array(arr, copy=False), self.dtype.na_value.m
565+
return arr._values_for_factorize()
545566

546567
def value_counts(self, dropna=True):
547568
"""
@@ -567,16 +588,17 @@ def value_counts(self, dropna=True):
567588

568589
# compute counts on the data with no nans
569590
data = self._data
570-
nafilt = np.isnan(data)
591+
nafilt = pd.isna(data)
592+
na_value = self.dtype.na_value.m
571593
data = data[~nafilt]
594+
index = list(set(data))
572595

573596
data_list = data.tolist()
574-
index = list(set(data))
575597
array = [data_list.count(item) for item in index]
576598

577599
if not dropna:
578-
index.append(np.nan)
579-
array.append(nafilt.sum())
600+
index.append(na_value)
601+
array.append(len(nafilt))
580602

581603
return Series(array, index=index)
582604

@@ -589,7 +611,8 @@ def unique(self):
589611
"""
590612
from pandas import unique
591613

592-
return self._from_sequence(unique(self._data), dtype=self.dtype)
614+
data = self._data
615+
return self._from_sequence(unique(data), dtype=self.dtype)
593616

594617
def __contains__(self, item) -> bool:
595618
if not isinstance(item, _Quantity):
@@ -691,7 +714,7 @@ def convert_values(param):
691714
else:
692715
return param
693716

694-
if isinstance(other, (Series, DataFrame)):
717+
if isinstance(other, (Series, DataFrame, Index)):
695718
return NotImplemented
696719
lvalues = self.quantity
697720
validate_length(lvalues, other)
@@ -740,7 +763,9 @@ def __array__(self, dtype=None, copy=False):
740763

741764
def _to_array_of_quantity(self, copy=False):
742765
qtys = [
743-
self._Q(item, self._dtype.units) if not pd.isna(item) else item
766+
self._Q(item, self._dtype.units)
767+
if item is not self.dtype.na_value.m
768+
else self.dtype.na_value
744769
for item in self._data
745770
]
746771
with warnings.catch_warnings(record=True):
@@ -798,7 +823,42 @@ def searchsorted(self, value, side="left", sorter=None):
798823
value = [item.to(self.units).magnitude for item in value]
799824
return arr.searchsorted(value, side=side, sorter=sorter)
800825

801-
def _reduce(self, name, **kwds):
826+
def map(self, mapper, na_action=None):
827+
"""
828+
Map values using an input mapping or function.
829+
830+
Parameters
831+
----------
832+
mapper : function, dict, or Series
833+
Mapping correspondence.
834+
na_action : {None, 'ignore'}, default None
835+
If 'ignore', propagate NA values, without passing them to the
836+
mapping correspondence. If 'ignore' is not supported, a
837+
``NotImplementedError`` should be raised.
838+
839+
Returns
840+
-------
841+
If mapper is a function, operate on the magnitudes of the array and
842+
843+
"""
844+
if callable(mapper) and len(self):
845+
from pandas._libs import lib
846+
847+
# This converts PintArray into array of Quantities
848+
values = self.astype(object, copy=False)
849+
# Using _from_sequence allows for possibility that mapper changes units
850+
if na_action is None:
851+
arr = lib.map_infer(values, mapper, convert=True)
852+
else:
853+
arr = lib.map_infer_mask(
854+
values, mapper, mask=pd.isna(values).view(np.uint8), convert=True
855+
)
856+
# If mapper doesn't return a Quantity, this will raise a ValueError
857+
return PintArray._from_sequence(arr)
858+
else:
859+
return super().map(mapper, na_action=na_action)
860+
861+
def _reduce(self, name, *, skipna: bool = True, keepdims: bool = False, **kwds):
802862
"""
803863
Return a scalar result of performing the reduction operation.
804864
@@ -842,14 +902,20 @@ def _reduce(self, name, **kwds):
842902

843903
if isinstance(self._data, ExtensionArray):
844904
try:
845-
result = self._data._reduce(name, **kwds)
905+
result = self._data._reduce(
906+
name, skipna=skipna, keepdims=keepdims, **kwds
907+
)
846908
except NotImplementedError:
847909
result = functions[name](self.numpy_data, **kwds)
848910

849911
if name in {"all", "any", "kurt", "skew"}:
850912
return result
851913
if name == "var":
914+
if keepdims:
915+
return PintArray(result, f"pint[({self.units})**2]")
852916
return self._Q(result, self.units**2)
917+
if keepdims:
918+
return PintArray(result, self.dtype)
853919
return self._Q(result, self.units)
854920

855921
def _accumulate(self, name: str, *, skipna: bool = True, **kwds):
@@ -866,7 +932,6 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwds):
866932
result = self._data._accumulate(name, **kwds)
867933
except NotImplementedError:
868934
result = functions[name](self.numpy_data, **kwds)
869-
print(result)
870935

871936
return self._from_sequence(result, self.units)
872937

pint_pandas/testsuite/test_issues.py

+18-3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import numpy as np
55
import pandas as pd
6+
import pandas._testing as tm
67
import pytest
78
import pint
89
from pandas.tests.extension.base.base import BaseExtensionTests
@@ -41,7 +42,7 @@ def test_force_ndarray_like(self):
4142
expected = pd.DataFrame(
4243
{0: PintArray(q_a_), 1: PintArray(q_b)}, dtype="pint[degC]"
4344
)
44-
self.assert_equal(result, expected)
45+
tm.assert_equal(result, expected)
4546

4647
finally:
4748
# restore registry
@@ -64,7 +65,7 @@ def test_offset_concat(self):
6465
expected = pd.DataFrame(
6566
{0: PintArray(q_a_), 1: PintArray(q_b)}, dtype="pint[degC]"
6667
)
67-
self.assert_equal(result, expected)
68+
tm.assert_equal(result, expected)
6869

6970
# issue #141
7071
print(PintArray(q_a))
@@ -80,7 +81,7 @@ def test_assignment_add_empty(self):
8081
result = pd.Series(data)
8182
result[[]] += data[0]
8283
expected = pd.Series(data)
83-
self.assert_series_equal(result, expected)
84+
tm.assert_series_equal(result, expected)
8485

8586

8687
class TestIssue80:
@@ -167,3 +168,17 @@ def test_issue_127():
167168
a = PintType.construct_from_string("pint[dimensionless]")
168169
b = PintType.construct_from_string("pint[]")
169170
assert a == b
171+
172+
173+
class TestIssue174(BaseExtensionTests):
174+
def test_sum(self):
175+
a = pd.DataFrame([[0, 1, 2], [3, 4, 5]]).astype("pint[m]")
176+
row_sum = a.sum(axis=0)
177+
expected_1 = pd.Series([3, 5, 7], dtype="pint[m]")
178+
179+
tm.assert_series_equal(row_sum, expected_1)
180+
181+
col_sum = a.sum(axis=1)
182+
expected_2 = pd.Series([3, 12], dtype="pint[m]")
183+
184+
tm.assert_series_equal(col_sum, expected_2)

0 commit comments

Comments
 (0)