Skip to content

Commit 9c7971b

Browse files
Update to Pandas 2.1.0.dev0+1415.g9cfedf48fa
Additional code synchronizations (and the addition of a dtype-preserving map method). These changes were initially developed to support uncertainties, but the uncertainty changes have all been stripped out to simplify merging of underlying code. Once these changes are fully synced with a release version of Pandas 2.1, we can look at adding back uncertainties. These changes also tolerate complex128 as a base type for magnitudes, with one exception (under discussion as pandas-dev/pandas#54445). Signed-off-by: Michael Tiemann <[email protected]>
1 parent bf84e37 commit 9c7971b

File tree

2 files changed

+112
-32
lines changed

2 files changed

+112
-32
lines changed

pint_pandas/pint_array.py

+94-29
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import numpy as np
77
import pandas as pd
88
import pint
9-
from pandas import DataFrame, Series
9+
from pandas import DataFrame, Series, Index
1010
from pandas.api.extensions import (
1111
ExtensionArray,
1212
ExtensionDtype,
@@ -27,6 +27,8 @@
2727
# quantify/dequantify
2828
NO_UNIT = "No Unit"
2929

30+
# from pint.facets.plain.quantity import PlainQuantity as _Quantity
31+
# from pint.facets.plain.unit import PlainUnit as _Unit
3032

3133
class PintType(ExtensionDtype):
3234
"""
@@ -65,7 +67,7 @@ def __new__(cls, units=None):
6567
if not isinstance(units, _Unit):
6668
units = cls._parse_dtype_strict(units)
6769
# ureg.unit returns a quantity with a magnitude of 1
68-
# eg 1 mm. Initialising a quantity and taking it's unit
70+
# eg 1 mm. Initialising a quantity and taking its unit
6971
# TODO: Separate units from quantities in pint
7072
# to simplify this bit
7173
units = cls.ureg.Quantity(1, units).units
@@ -195,8 +197,8 @@ def __repr__(self):
195197
float: pd.Float64Dtype(),
196198
np.float64: pd.Float64Dtype(),
197199
np.float32: pd.Float32Dtype(),
198-
np.complex128: pd.core.dtypes.dtypes.PandasDtype("complex128"),
199-
np.complex64: pd.core.dtypes.dtypes.PandasDtype("complex64"),
200+
np.complex128: pd.core.dtypes.dtypes.NumpyEADtype("complex128"),
201+
np.complex64: pd.core.dtypes.dtypes.NumpyEADtype("complex64"),
200202
# np.float16: pd.Float16Dtype(),
201203
}
202204
dtypeunmap = {v: k for k, v in dtypemap.items()}
@@ -250,7 +252,6 @@ def __init__(self, values, dtype=None, copy=False):
250252
copy = False
251253
elif not isinstance(values, pd.core.arrays.numeric.NumericArray):
252254
values = pd.array(values, copy=copy)
253-
copy = False
254255
if copy:
255256
values = values.copy()
256257
self._data = values
@@ -309,12 +310,22 @@ def __setitem__(self, key, value):
309310
# doing nothing here seems to be ok
310311
return
311312

313+
master_scalar = None
314+
try:
315+
master_scalar = next(i for i in self._data if pd.notna(i))
316+
except StopIteration:
317+
pass
318+
312319
if isinstance(value, _Quantity):
313320
value = value.to(self.units).magnitude
314-
elif is_list_like(value) and len(value) > 0 and isinstance(value[0], _Quantity):
315-
value = [item.to(self.units).magnitude for item in value]
321+
elif is_list_like(value) and len(value) > 0:
322+
if isinstance(value[0], _Quantity):
323+
value = [item.to(self.units).magnitude for item in value]
324+
if len(value) == 1:
325+
value = value[0]
316326

317327
key = check_array_indexer(self, key)
328+
# Filter out invalid values for our array type(s)
318329
try:
319330
self._data[key] = value
320331
except IndexError as e:
@@ -458,7 +469,8 @@ def take(self, indices, allow_fill=False, fill_value=None):
458469
Examples
459470
--------
460471
"""
461-
from pandas.core.algorithms import take, is_scalar
472+
from pandas.core.algorithms import take
473+
from pandas.core.dtypes.common import is_scalar
462474

463475
data = self._data
464476
if allow_fill and fill_value is None:
@@ -470,7 +482,10 @@ def take(self, indices, allow_fill=False, fill_value=None):
470482
# magnitude is in fact an array scalar, which will get rejected by pandas.
471483
fill_value = fill_value[()]
472484

473-
result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
485+
with warnings.catch_warnings():
486+
warnings.simplefilter("ignore")
487+
# Turn off warning that PandasArray is deprecated for ``take``
488+
result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
474489

475490
return PintArray(result, dtype=self.dtype)
476491

@@ -512,22 +527,17 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
512527
raise ValueError(
513528
"Cannot infer dtype. No dtype specified and empty array"
514529
)
515-
if dtype is None and not isinstance(master_scalar, _Quantity):
516-
raise ValueError("No dtype specified and not a sequence of quantities")
517-
if dtype is None and isinstance(master_scalar, _Quantity):
530+
if dtype is None:
531+
if not isinstance(master_scalar, _Quantity):
532+
raise ValueError("No dtype specified and not a sequence of quantities")
518533
dtype = PintType(master_scalar.units)
519534

520-
def quantify_nan(item):
521-
if type(item) is float:
522-
return item * dtype.units
523-
return item
524-
525535
if isinstance(master_scalar, _Quantity):
526-
scalars = [quantify_nan(item) for item in scalars]
527536
scalars = [
528537
(item.to(dtype.units).magnitude if hasattr(item, "to") else item)
529538
for item in scalars
530539
]
540+
# When creating empty arrays, make them large enough to hold UFloats in case we need to do so later
531541
return cls(scalars, dtype=dtype, copy=copy)
532542

533543
@classmethod
@@ -538,10 +548,21 @@ def _from_sequence_of_strings(cls, scalars, dtype=None, copy=False):
538548

539549
@classmethod
540550
def _from_factorized(cls, values, original):
551+
from pandas._libs.lib import infer_dtype
552+
553+
if infer_dtype(values) != "object":
554+
values = pd.array(values, copy=False)
541555
return cls(values, dtype=original.dtype)
542556

543557
def _values_for_factorize(self):
544-
return self._data._values_for_factorize()
558+
# factorize can now handle differentiating various types of null values.
559+
# These can only occur when the array has object dtype.
560+
# However, for backwards compatibility we only use the null for the
561+
# provided dtype. This may be revisited in the future, see GH#48476.
562+
arr = self._data
563+
if arr.dtype.kind == "O":
564+
return np.array(arr, copy=False), self.dtype.na_value.m
565+
return arr._values_for_factorize()
545566

546567
def value_counts(self, dropna=True):
547568
"""
@@ -567,16 +588,17 @@ def value_counts(self, dropna=True):
567588

568589
# compute counts on the data with no nans
569590
data = self._data
570-
nafilt = np.isnan(data)
591+
nafilt = pd.isna(data)
592+
na_value = self.dtype.na_value.m
571593
data = data[~nafilt]
594+
index = list(set(data))
572595

573596
data_list = data.tolist()
574-
index = list(set(data))
575597
array = [data_list.count(item) for item in index]
576598

577599
if not dropna:
578-
index.append(np.nan)
579-
array.append(nafilt.sum())
600+
index.append(na_value)
601+
array.append(len(nafilt))
580602

581603
return Series(array, index=index)
582604

@@ -589,7 +611,8 @@ def unique(self):
589611
"""
590612
from pandas import unique
591613

592-
return self._from_sequence(unique(self._data), dtype=self.dtype)
614+
data = self._data
615+
return self._from_sequence(unique(data), dtype=self.dtype)
593616

594617
def __contains__(self, item) -> bool:
595618
if not isinstance(item, _Quantity):
@@ -691,7 +714,7 @@ def convert_values(param):
691714
else:
692715
return param
693716

694-
if isinstance(other, (Series, DataFrame)):
717+
if isinstance(other, (Series, DataFrame, Index)):
695718
return NotImplemented
696719
lvalues = self.quantity
697720
validate_length(lvalues, other)
@@ -740,7 +763,9 @@ def __array__(self, dtype=None, copy=False):
740763

741764
def _to_array_of_quantity(self, copy=False):
742765
qtys = [
743-
self._Q(item, self._dtype.units) if not pd.isna(item) else item
766+
self._Q(item, self._dtype.units)
767+
if item is not self.dtype.na_value.m
768+
else self.dtype.na_value
744769
for item in self._data
745770
]
746771
with warnings.catch_warnings(record=True):
@@ -798,7 +823,42 @@ def searchsorted(self, value, side="left", sorter=None):
798823
value = [item.to(self.units).magnitude for item in value]
799824
return arr.searchsorted(value, side=side, sorter=sorter)
800825

801-
def _reduce(self, name, **kwds):
826+
def map(self, mapper, na_action=None):
827+
"""
828+
Map values using an input mapping or function.
829+
830+
Parameters
831+
----------
832+
mapper : function, dict, or Series
833+
Mapping correspondence.
834+
na_action : {None, 'ignore'}, default None
835+
If 'ignore', propagate NA values, without passing them to the
836+
mapping correspondence. If 'ignore' is not supported, a
837+
``NotImplementedError`` should be raised.
838+
839+
Returns
840+
-------
841+
If mapper is a function, operate on the magnitudes of the array and
842+
843+
"""
844+
if callable(mapper) and len(self):
845+
from pandas._libs import lib
846+
847+
# This converts PintArray into array of Quantities
848+
values = self.astype(object, copy=False)
849+
# Using _from_sequence allows for possibility that mapper changes units
850+
if na_action is None:
851+
arr = lib.map_infer(values, mapper, convert=True)
852+
else:
853+
arr = lib.map_infer_mask(
854+
values, mapper, mask=pd.isna(values).view(np.uint8), convert=True
855+
)
856+
# If mapper doesn't return a Quantity, this will raise a ValueError
857+
return PintArray._from_sequence(arr)
858+
else:
859+
return super().map(mapper, na_action=na_action)
860+
861+
def _reduce(self, name, *, skipna: bool = True, keepdims: bool = False, **kwds):
802862
"""
803863
Return a scalar result of performing the reduction operation.
804864
@@ -842,14 +902,20 @@ def _reduce(self, name, **kwds):
842902

843903
if isinstance(self._data, ExtensionArray):
844904
try:
845-
result = self._data._reduce(name, **kwds)
905+
result = self._data._reduce(
906+
name, skipna=skipna, keepdims=keepdims, **kwds
907+
)
846908
except NotImplementedError:
847909
result = functions[name](self.numpy_data, **kwds)
848910

849911
if name in {"all", "any", "kurt", "skew"}:
850912
return result
851913
if name == "var":
914+
if keepdims:
915+
return PintArray(result, f"pint[({self.units})**2]")
852916
return self._Q(result, self.units**2)
917+
if keepdims:
918+
return PintArray(result, self.dtype)
853919
return self._Q(result, self.units)
854920

855921
def _accumulate(self, name: str, *, skipna: bool = True, **kwds):
@@ -866,7 +932,6 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwds):
866932
result = self._data._accumulate(name, **kwds)
867933
except NotImplementedError:
868934
result = functions[name](self.numpy_data, **kwds)
869-
print(result)
870935

871936
return self._from_sequence(result, self.units)
872937

pint_pandas/testsuite/test_issues.py

+18-3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import numpy as np
55
import pandas as pd
6+
import pandas._testing as tm
67
import pytest
78
import pint
89
from pandas.tests.extension.base.base import BaseExtensionTests
@@ -41,7 +42,7 @@ def test_force_ndarray_like(self):
4142
expected = pd.DataFrame(
4243
{0: PintArray(q_a_), 1: PintArray(q_b)}, dtype="pint[degC]"
4344
)
44-
self.assert_equal(result, expected)
45+
tm.assert_equal(result, expected)
4546

4647
finally:
4748
# restore registry
@@ -64,7 +65,7 @@ def test_offset_concat(self):
6465
expected = pd.DataFrame(
6566
{0: PintArray(q_a_), 1: PintArray(q_b)}, dtype="pint[degC]"
6667
)
67-
self.assert_equal(result, expected)
68+
tm.assert_equal(result, expected)
6869

6970
# issue #141
7071
print(PintArray(q_a))
@@ -80,7 +81,7 @@ def test_assignment_add_empty(self):
8081
result = pd.Series(data)
8182
result[[]] += data[0]
8283
expected = pd.Series(data)
83-
self.assert_series_equal(result, expected)
84+
tm.assert_series_equal(result, expected)
8485

8586

8687
class TestIssue80:
@@ -167,3 +168,17 @@ def test_issue_127():
167168
a = PintType.construct_from_string("pint[dimensionless]")
168169
b = PintType.construct_from_string("pint[]")
169170
assert a == b
171+
172+
173+
class TestIssue174(BaseExtensionTests):
174+
def test_sum(self):
175+
a = pd.DataFrame([[0, 1, 2], [3, 4, 5]]).astype("pint[m]")
176+
row_sum = a.sum(axis=0)
177+
expected_1 = pd.Series([3, 5, 7], dtype="pint[m]")
178+
179+
tm.assert_series_equal(row_sum, expected_1)
180+
181+
col_sum = a.sum(axis=1)
182+
expected_2 = pd.Series([3, 12], dtype="pint[m]")
183+
184+
tm.assert_series_equal(col_sum, expected_2)

0 commit comments

Comments
 (0)