Skip to content

ENH: Use IntegerArray to avoid forced conversion from integer to float #27335

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Nov 13, 2019
Merged
53 changes: 48 additions & 5 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,7 @@ cdef class Seen:
bint nat_ # seen nat
bint bool_ # seen_bool
bint null_ # seen_null
bint nan_ # seen_np.nan
bint uint_ # seen_uint (unsigned integer)
bint sint_ # seen_sint (signed integer)
bint float_ # seen_float
Expand All @@ -995,6 +996,7 @@ cdef class Seen:
self.nat_ = 0
self.bool_ = 0
self.null_ = 0
self.nan_ = 0
self.uint_ = 0
self.sint_ = 0
self.float_ = 0
Expand Down Expand Up @@ -1956,10 +1958,37 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
@cython.wraparound(False)
def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
bint safe=0, bint convert_datetime=0,
bint convert_timedelta=0):
bint convert_timedelta=0,
bint convert_to_nullable_integer=0):
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you update the docstring here, e.g. add Parameters / Returns sections?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doc-string added

Type inference function-- convert object array to proper dtype

Parameters
----------
objects : ndarray
Array of object elements to convert.
try_float : bool, default False
If the array contains only float or NaN values, whether to convert it
and return an array of float dtype.
safe : bool, default False
Whether to upcast numeric type (e.g. int cast to float). If set to
True, no upcasting will be performed.
convert_datetime : bool, default False
If the array contains only datetime or NaT values, whether to convert it
and return an array of M8[ns] dtype.
convert_timedelta : bool, default False
If the array contains only timedelta or NaT values, whether to convert it
and return an array of m8[ns] dtype.
convert_to_nullable_integer : bool, default False
If the array contains only integer values (and NaN), whether to convert it
and return an IntegerArray.

Returns
-------
array : array of converted object values to more specific dtypes if
applicable
"""

cdef:
Py_ssize_t i, n
ndarray[float64_t] floats
Expand All @@ -1980,6 +2009,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
ints = np.empty(n, dtype='i8')
uints = np.empty(n, dtype='u8')
bools = np.empty(n, dtype=np.uint8)
mask = np.full(n, False)

if convert_datetime:
datetimes = np.empty(n, dtype='M8[ns]')
Expand All @@ -1997,6 +2027,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
if val is None:
seen.null_ = 1
floats[i] = complexes[i] = fnan
mask[i] = True
elif val is NaT:
seen.nat_ = 1
if convert_datetime:
Expand All @@ -2006,6 +2037,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
if not (convert_datetime or convert_timedelta):
seen.object_ = 1
break
elif val is np.nan:
seen.nan_ = 1
mask[i] = True
floats[i] = complexes[i] = val
elif util.is_bool_object(val):
seen.bool_ = 1
bools[i] = val
Expand Down Expand Up @@ -2087,11 +2122,19 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,

if not seen.object_:
if not safe:
if seen.null_:
if seen.null_ or seen.nan_:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A separate issue is that we should change line 2083 to use a DatetimeArray (this can be a separate PR, or done here if it works out).

Can you also update the docstring (well, add it really :->)? Thanks for working on this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will leave line 2083 out as it is a separate issue.
Which version of the doc-string shall I modify?

if seen.is_float_or_complex:
if seen.complex_:
return complexes
elif seen.float_ or seen.int_:
elif seen.float_:
return floats
elif seen.int_:
if convert_to_nullable_integer:
from pandas.core.arrays import IntegerArray
return IntegerArray(ints, mask)
else:
return floats
elif seen.nan_:
return floats
else:
if not seen.bool_:
Expand Down Expand Up @@ -2130,7 +2173,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
if seen.complex_:
if not seen.int_:
return complexes
elif seen.float_:
elif seen.float_ or seen.nan_:
if not seen.int_:
return floats
else:
Expand All @@ -2154,7 +2197,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
if seen.complex_:
if not seen.int_:
return complexes
elif seen.float_:
elif seen.float_ or seen.nan_:
if not seen.int_:
return floats
elif seen.int_:
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,9 +857,9 @@ def style(self):
... index=['panda', 'polar', 'koala'])
>>> df
species population
panda bear 1864
polar bear 22000
koala marsupial 80000
panda bear 1864
polar bear 22000
koala marsupial 80000
>>> for label, content in df.items():
... print('label:', label)
... print('content:', content, sep='\n')
Expand Down
1 change: 1 addition & 0 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,7 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
tuple
arrays, columns
"""

if columns is None:
gen = (list(x.keys()) for x in data)
types = (dict, OrderedDict) if PY36 else OrderedDict
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
Timestamp,
isna,
)
from pandas.core.arrays import IntegerArray
import pandas.util.testing as tm


Expand Down Expand Up @@ -552,6 +553,20 @@ def test_maybe_convert_objects_datetime(self):
out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1)
tm.assert_numpy_array_equal(out, exp)

@pytest.mark.parametrize(
    "exp",
    [
        IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])),
        IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])),
    ],
)
def test_maybe_convert_objects_nullable_integer(self, exp):
    """Check that an int/NaN object array converts to an IntegerArray.

    With ``convert_to_nullable_integer`` enabled, the result is compared
    against ``exp`` as an extension array instead of a float ndarray.
    """
    # GH27335
    # Use np.nan rather than the np.NaN alias: the alias was removed in
    # NumPy 2.0, while np.nan is available on every NumPy version.
    arr = np.array([2, np.nan], dtype=object)
    result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=1)

    tm.assert_extension_array_equal(result, exp)

def test_mixed_dtypes_remain_object_array(self):
# GH14956
array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object)
Expand Down