Skip to content

Commit 5270a46

Browse files
Fix numerous regressions in test_pandas_extensiontests
Fixes include:

* Factorization
* NaN handling (several more issues still need to be resolved)
* Proper unit declarations in test_offset_concat
* Integration of new `numeric_dtype` parameter

A major outstanding issue (presently being discussed as pandas-dev/pandas#53904) concerns whether we can make AffineScalarFunc hashable and/or whether other legacy Pandas code (which has been deprecated) can be further removed.

Signed-off-by: Michael Tiemann <[email protected]>
1 parent 959570f commit 5270a46

File tree

3 files changed

+123
-74
lines changed

3 files changed

+123
-74
lines changed

pint_pandas/pint_array.py

+114-72
Original file line numberDiff line numberDiff line change
@@ -262,64 +262,6 @@ def __init__(self, values, dtype=None, copy=False):
262262
copy = False
263263
elif not isinstance(values, pd.core.arrays.numeric.NumericArray):
264264
values = pd.array(values, copy=copy)
265-
else: # not isinstance(values, np.ndarray):
266-
if HAS_UNCERTAINTIES and dtype.kind == "O":
267-
values = np.array(values, dtype=object, copy=copy)
268-
else:
269-
values = np.array(values, copy=copy)
270-
copy = False
271-
if HAS_UNCERTAINTIES:
272-
if np.issubdtype(values.dtype, np.floating) or len(values) == 0:
273-
pass
274-
else:
275-
value_notna = [
276-
isinstance(v, UFloat)
277-
for v in values
278-
if not (pd.isna(v) or unp.isnan(v))
279-
]
280-
if value_notna == []:
281-
# all NaNs, either from our own data, or from Pint/Pandas internals
282-
pa_nan = _ufloat_nan if dtype.kind == "O" else np.nan
283-
for i in range(len(values)):
284-
# Promote/demote NaNs to match non-NaN magnitudes
285-
values[i] = pa_nan
286-
copy = False
287-
else:
288-
any_UFloats = any(value_notna)
289-
all_UFloats = all(value_notna)
290-
if any_UFloats != all_UFloats:
291-
# warnings.warn(
292-
# f"pint-pandas does not support certain magnitudes of {values.dtype}. Converting magnitudes to ufloat.",
293-
# category=RuntimeWarning,
294-
# )
295-
for i, v in enumerate(values):
296-
# List comprehensions are great, but they are not np.arrays!
297-
if not isinstance(v, UFloat):
298-
if pd.isna(v):
299-
values[i] = _ufloat_nan
300-
else:
301-
values[i] = ufloat(v, 0)
302-
elif unp.isnan(v):
303-
# Do we need to canonicalize our NaNs?
304-
values[i] = _ufloat_nan
305-
copy = False
306-
else:
307-
pa_nan = _ufloat_nan if any_UFloats else np.nan
308-
for i, v in enumerate(values):
309-
# Promote/demote NaNs to match non-NaN magnitudes
310-
if pd.isna(v) or unp.isnan(v):
311-
values[i] = pa_nan
312-
copy = False
313-
if not any_UFloats:
314-
values = values.astype(float)
315-
copy = False
316-
elif not np.issubdtype(values.dtype, np.floating):
317-
warnings.warn(
318-
f"pint-pandas does not support magnitudes of {values.dtype}. Converting magnitudes to float.",
319-
category=RuntimeWarning,
320-
)
321-
values = values.astype(float)
322-
copy = False
323265
if copy:
324266
values = values.copy()
325267
self._data = values
@@ -438,10 +380,11 @@ def isna(self):
438380
"""
439381
if HAS_UNCERTAINTIES:
440382
# GH https://github.com/lebigot/uncertainties/issues/164
441-
if isinstance(self._data, np.ndarray) and len(self._data) == 0:
383+
if len(self._data) == 0:
442384
# True or False doesn't matter--we just need the value for the type
443385
return np.full((0), True)
444-
return unp.isnan(self._data)
386+
elif isinstance(self._data[0], UFloat):
387+
return unp.isnan(self._data)
445388
return self._data.isna()
446389

447390
def astype(self, dtype, copy=True):
@@ -533,7 +476,8 @@ def take(self, indices, allow_fill=False, fill_value=None):
533476
Examples
534477
--------
535478
"""
536-
from pandas.core.algorithms import take, is_scalar
479+
from pandas.core.algorithms import take
480+
from pandas.core.dtypes.common import is_scalar
537481

538482
data = self._data
539483
if allow_fill and fill_value is None:
@@ -592,8 +536,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
592536
if dtype is None and isinstance(master_scalar, _Quantity):
593537
dtype = PintType(master_scalar.units)
594538

595-
def quantify_nan(item):
596-
if HAS_UNCERTAINTIES:
539+
def quantify_nan(item, promote_to_ufloat):
540+
if promote_to_ufloat:
597541
if type(item) is UFloat:
598542
return item * dtype.units
599543
if type(item) is float:
@@ -607,11 +551,19 @@ def quantify_nan(item):
607551
return item
608552

609553
if isinstance(master_scalar, _Quantity):
610-
scalars = [quantify_nan(item) for item in scalars]
554+
if HAS_UNCERTAINTIES:
555+
promote_to_ufloat = any([isinstance(item.m, UFloat) for item in scalars])
556+
else:
557+
promote_to_ufloat = False
558+
scalars = [quantify_nan(item, promote_to_ufloat) for item in scalars]
611559
scalars = [
612560
(item.to(dtype.units).magnitude if hasattr(item, "to") else item)
613561
for item in scalars
614562
]
563+
if HAS_UNCERTAINTIES:
564+
promote_to_ufloat = any([isinstance(item, UFloat) for item in scalars])
565+
if promote_to_ufloat:
566+
scalars = [item if isinstance(item, UFloat) else _ufloat_nan if np.isnan(item) else ufloat(item, 0) for item in scalars]
615567
return cls(scalars, dtype=dtype, copy=copy)
616568

617569
@classmethod
@@ -620,15 +572,90 @@ def _from_sequence_of_strings(cls, scalars, dtype=None, copy=False):
620572
dtype = PintType.construct_from_quantity_string(scalars[0])
621573
return cls._from_sequence([dtype.ureg.Quantity(x) for x in scalars])
622574

575+
def factorize(
576+
self,
577+
use_na_sentinel: bool = True,
578+
) -> tuple[np.ndarray, ExtensionArray]:
579+
"""
580+
Encode the extension array as an enumerated type.
581+
582+
Parameters
583+
----------
584+
use_na_sentinel : bool, default True
585+
If True, the sentinel -1 will be used for NaN values. If False,
586+
NaN values will be encoded as non-negative integers and will not drop the
587+
NaN from the uniques of the values.
588+
589+
.. versionadded:: 1.5.0
590+
591+
Returns
592+
-------
593+
codes : ndarray
594+
An integer NumPy array that's an indexer into the original
595+
ExtensionArray.
596+
uniques : ExtensionArray
597+
An ExtensionArray containing the unique values of `self`.
598+
599+
.. note::
600+
601+
uniques will *not* contain an entry for the NA value of
602+
the ExtensionArray if there are any missing values present
603+
in `self`.
604+
605+
See Also
606+
--------
607+
factorize : Top-level factorize method that dispatches here.
608+
609+
Notes
610+
-----
611+
:meth:`pandas.factorize` offers a `sort` keyword as well.
612+
"""
613+
# Implementer note: There are two ways to override the behavior of
614+
# pandas.factorize
615+
# 1. _values_for_factorize and _from_factorize.
616+
# Specify the values passed to pandas' internal factorization
617+
# routines, and how to convert from those values back to the
618+
# original ExtensionArray.
619+
# 2. ExtensionArray.factorize.
620+
# Complete control over factorization.
621+
if HAS_UNCERTAINTIES and self._data.dtype.kind == 'O':
622+
arr, na_value = self._values_for_factorize()
623+
624+
if not use_na_sentinel:
625+
# factorize can now handle differentiating various types of null values.
626+
# These can only occur when the array has object dtype.
627+
# However, for backwards compatibility we only use the null for the
628+
# provided dtype. This may be revisited in the future, see GH#48476.
629+
null_mask = isna(arr)
630+
if null_mask.any():
631+
# Don't modify (potentially user-provided) array
632+
arr = np.where(null_mask, na_value, arr)
633+
634+
codes = [-1] * len(self.data)
635+
# Note that item is a local variable provided in the loop below
636+
vf = np.vectorize(lambda x: x == item, otypes=[bool])
637+
for code, item in enumerate(arr):
638+
code_mask = vf(self._data)
639+
codes = np.where(code_mask, code, codes)
640+
641+
uniques_ea = self._from_factorized(arr, self)
642+
return codes, uniques_ea
643+
else:
644+
return super(PintArray, self).factorize(self, use_na_sentinel)
645+
623646
@classmethod
624647
def _from_factorized(cls, values, original):
625648
return cls(values, dtype=original.dtype)
626649

627650
def _values_for_factorize(self):
628651
arr = self._data
629-
if HAS_UNCERTAINTIES:
630-
return arr, _ufloat_nan
631-
return self._data._values_for_factorize()
652+
if HAS_UNCERTAINTIES and arr.dtype.kind == 'O':
653+
unique_data = []
654+
for item in arr:
655+
if item not in unique_data:
656+
unique_data.append(item)
657+
return np.array(unique_data), _ufloat_nan
658+
return arr._values_for_factorize()
632659

633660
def value_counts(self, dropna=True):
634661
"""
@@ -654,18 +681,26 @@ def value_counts(self, dropna=True):
654681

655682
# compute counts on the data with no nans
656683
data = self._data
657-
if HAS_UNCERTAINTIES:
684+
if HAS_UNCERTAINTIES and data.dtype.kind == 'O':
658685
nafilt = unp.isnan(data)
686+
na_value = _ufloat_nan
687+
data = data[~nafilt]
688+
unique_data = []
689+
for item in data:
690+
if item not in unique_data:
691+
unique_data.append(item)
692+
index = list(unique_data)
659693
else:
660694
nafilt = np.isnan(data)
661-
data = data[~nafilt]
695+
na_value = np.nan
696+
data = data[~nafilt]
697+
index = list(set(data))
662698

663699
data_list = data.tolist()
664-
index = list(set(data))
665700
array = [data_list.count(item) for item in index]
666701

667702
if not dropna:
668-
index.append(np.nan)
703+
index.append(na_value)
669704
array.append(nafilt.sum())
670705

671706
return Series(array, index=index)
@@ -679,7 +714,14 @@ def unique(self):
679714
"""
680715
from pandas import unique
681716

682-
return self._from_sequence(unique(self._data), dtype=self.dtype)
717+
data = self._data
718+
if HAS_UNCERTAINTIES and data.dtype.kind == 'O':
719+
unique_data = []
720+
for item in data:
721+
if item not in unique_data:
722+
unique_data.append(item)
723+
return self._from_sequence(pd.array(unique_data, dtype=data.dtype), dtype=self.dtype)
724+
return self._from_sequence(unique(data), dtype=self.dtype)
683725

684726
def __contains__(self, item) -> bool:
685727
if not isinstance(item, _Quantity):

pint_pandas/testsuite/test_issues.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ class TestIssue21(BaseExtensionTests):
6565
def test_offset_concat(self):
6666
q_a = ureg.Quantity(np.arange(5) + ufloat(0, 0), ureg.Unit("degC"))
6767
q_b = ureg.Quantity(np.arange(6) + ufloat(0, 0), ureg.Unit("degC"))
68-
q_a_ = np.append(q_a, ufloat(np.nan, 0))
68+
q_a_ = np.append(q_a, ureg.Quantity(ufloat(np.nan, 0), ureg.Unit("degC")))
6969

7070
a = pd.Series(PintArray(q_a))
7171
b = pd.Series(PintArray(q_b))

pint_pandas/testsuite/test_pandas_extensiontests.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ def numeric_dtype(request):
179179
@pytest.fixture
180180
def data(request, numeric_dtype):
181181
if HAS_UNCERTAINTIES:
182+
numeric_dtype = None
182183
d = (
183184
np.arange(start=1.0, stop=101.0, dtype=numeric_dtype) + ufloat(0, 0)
184185
) * ureg.nm
@@ -191,6 +192,7 @@ def data(request, numeric_dtype):
191192
def data_missing(numeric_dtype):
192193
numeric_dtype = dtypemap.get(numeric_dtype, numeric_dtype)
193194
if HAS_UNCERTAINTIES:
195+
numeric_dtype = None
194196
dm = [_ufloat_nan, ufloat(1, 0)]
195197
else:
196198
dm = [np.nan, 1]
@@ -202,6 +204,7 @@ def data_missing(numeric_dtype):
202204
@pytest.fixture
203205
def data_for_twos(numeric_dtype):
204206
if HAS_UNCERTAINTIES:
207+
numeric_dtype = None
205208
x = [ufloat(2.0, 0)] * 100
206209
else:
207210
x = [
@@ -243,6 +246,7 @@ def sort_by_key(request):
243246
@pytest.fixture
244247
def data_for_sorting(numeric_dtype):
245248
if HAS_UNCERTAINTIES:
249+
numeric_dtype = None
246250
ds = [ufloat(0.3, 0), ufloat(10, 0), ufloat(-50, 0)]
247251
else:
248252
ds = [0.3, 10, -50]
@@ -255,6 +259,7 @@ def data_for_sorting(numeric_dtype):
255259
def data_missing_for_sorting(numeric_dtype):
256260
numeric_dtype = dtypemap.get(numeric_dtype, numeric_dtype)
257261
if HAS_UNCERTAINTIES:
262+
numeric_dtype = None
258263
dms = [ufloat(4, 0), _ufloat_nan, ufloat(-5, 0)]
259264
else:
260265
dms = [4, np.nan, -5]
@@ -287,7 +292,9 @@ def data_for_grouping(numeric_dtype):
287292
b = b + ufloat(0, 0)
288293
c = c + ufloat(0, 0)
289294
_n = _ufloat_nan
290-
numeric_dtype = dtypemap.get(numeric_dtype, numeric_dtype)
295+
numeric_dtype = None
296+
else:
297+
numeric_dtype = dtypemap.get(numeric_dtype, numeric_dtype)
291298
return PintArray.from_1darray_quantity(
292299
ureg.Quantity(pd.array([b, b, _n, _n, a, a, b, c], dtype=numeric_dtype), ureg.m)
293300
)

0 commit comments

Comments (0)