Skip to content

Commit f3a26d8

Browse files
jiangyue12392 authored and proost committed
ENH: Use IntegerArray to avoid forced conversion from integer to float (pandas-dev#27335)
1 parent 5d21074 commit f3a26d8

File tree

4 files changed

+67
-8
lines changed

4 files changed

+67
-8
lines changed

pandas/_libs/lib.pyx

+48-5
Original file line numberDiff line numberDiff line change
@@ -971,6 +971,7 @@ cdef class Seen:
971971
bint nat_ # seen nat
972972
bint bool_ # seen_bool
973973
bint null_ # seen_null
974+
bint nan_ # seen_np.nan
974975
bint uint_ # seen_uint (unsigned integer)
975976
bint sint_ # seen_sint (signed integer)
976977
bint float_ # seen_float
@@ -995,6 +996,7 @@ cdef class Seen:
995996
self.nat_ = 0
996997
self.bool_ = 0
997998
self.null_ = 0
999+
self.nan_ = 0
9981000
self.uint_ = 0
9991001
self.sint_ = 0
10001002
self.float_ = 0
@@ -1953,10 +1955,37 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
19531955
@cython.wraparound(False)
19541956
def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
19551957
bint safe=0, bint convert_datetime=0,
1956-
bint convert_timedelta=0):
1958+
bint convert_timedelta=0,
1959+
bint convert_to_nullable_integer=0):
19571960
"""
19581961
Type inference function-- convert object array to proper dtype
1962+
1963+
Parameters
1964+
----------
1965+
objects : ndarray
1966+
Array of object elements to convert.
1967+
try_float : bool, default False
1968+
If an array-like object containing only float or NaN values is
1969+
encountered, whether to convert and return an array of float dtype.
1970+
safe : bool, default False
1971+
Whether to upcast numeric type (e.g. int cast to float). If set to
1972+
True, no upcasting will be performed.
1973+
convert_datetime : bool, default False
1974+
If an array-like object containing only datetime values or NaT is
1975+
encountered, whether to convert and return an array of M8[ns] dtype.
1976+
convert_timedelta : bool, default False
1977+
If an array-like object containing only timedelta values or NaT is
1978+
encountered, whether to convert and return an array of m8[ns] dtype.
1979+
convert_to_nullable_integer : bool, default False
1980+
If an array-like object containing only integer values (and NaN) is
1981+
encountered, whether to convert and return an IntegerArray.
1982+
1983+
Returns
1984+
-------
1985+
array : array of converted object values to more specific dtypes if
1986+
applicable
19591987
"""
1988+
19601989
cdef:
19611990
Py_ssize_t i, n
19621991
ndarray[float64_t] floats
@@ -1977,6 +2006,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
19772006
ints = np.empty(n, dtype='i8')
19782007
uints = np.empty(n, dtype='u8')
19792008
bools = np.empty(n, dtype=np.uint8)
2009+
mask = np.full(n, False)
19802010

19812011
if convert_datetime:
19822012
datetimes = np.empty(n, dtype='M8[ns]')
@@ -1994,6 +2024,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
19942024
if val is None:
19952025
seen.null_ = 1
19962026
floats[i] = complexes[i] = fnan
2027+
mask[i] = True
19972028
elif val is NaT:
19982029
seen.nat_ = 1
19992030
if convert_datetime:
@@ -2003,6 +2034,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
20032034
if not (convert_datetime or convert_timedelta):
20042035
seen.object_ = 1
20052036
break
2037+
elif val is np.nan:
2038+
seen.nan_ = 1
2039+
mask[i] = True
2040+
floats[i] = complexes[i] = val
20062041
elif util.is_bool_object(val):
20072042
seen.bool_ = 1
20082043
bools[i] = val
@@ -2084,11 +2119,19 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
20842119

20852120
if not seen.object_:
20862121
if not safe:
2087-
if seen.null_:
2122+
if seen.null_ or seen.nan_:
20882123
if seen.is_float_or_complex:
20892124
if seen.complex_:
20902125
return complexes
2091-
elif seen.float_ or seen.int_:
2126+
elif seen.float_:
2127+
return floats
2128+
elif seen.int_:
2129+
if convert_to_nullable_integer:
2130+
from pandas.core.arrays import IntegerArray
2131+
return IntegerArray(ints, mask)
2132+
else:
2133+
return floats
2134+
elif seen.nan_:
20922135
return floats
20932136
else:
20942137
if not seen.bool_:
@@ -2127,7 +2170,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
21272170
if seen.complex_:
21282171
if not seen.int_:
21292172
return complexes
2130-
elif seen.float_:
2173+
elif seen.float_ or seen.nan_:
21312174
if not seen.int_:
21322175
return floats
21332176
else:
@@ -2151,7 +2194,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
21512194
if seen.complex_:
21522195
if not seen.int_:
21532196
return complexes
2154-
elif seen.float_:
2197+
elif seen.float_ or seen.nan_:
21552198
if not seen.int_:
21562199
return floats
21572200
elif seen.int_:

pandas/core/frame.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -857,9 +857,9 @@ def style(self):
857857
... index=['panda', 'polar', 'koala'])
858858
>>> df
859859
species population
860-
panda bear 1864
861-
polar bear 22000
862-
koala marsupial 80000
860+
panda bear 1864
861+
polar bear 22000
862+
koala marsupial 80000
863863
>>> for label, content in df.items():
864864
... print('label:', label)
865865
... print('content:', content, sep='\n')

pandas/core/internals/construction.py

+1
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,7 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
548548
tuple
549549
arrays, columns
550550
"""
551+
551552
if columns is None:
552553
gen = (list(x.keys()) for x in data)
553554
types = (dict, OrderedDict) if PY36 else OrderedDict

pandas/tests/dtypes/test_inference.py

+15
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
Timestamp,
5252
isna,
5353
)
54+
from pandas.core.arrays import IntegerArray
5455
import pandas.util.testing as tm
5556

5657

@@ -552,6 +553,20 @@ def test_maybe_convert_objects_datetime(self):
552553
out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1)
553554
tm.assert_numpy_array_equal(out, exp)
554555

556+
@pytest.mark.parametrize(
557+
"exp",
558+
[
559+
IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])),
560+
IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])),
561+
],
562+
)
563+
def test_maybe_convert_objects_nullable_integer(self, exp):
564+
# GH27335
565+
arr = np.array([2, np.NaN], dtype=object)
566+
result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=1)
567+
568+
tm.assert_extension_array_equal(result, exp)
569+
555570
def test_mixed_dtypes_remain_object_array(self):
556571
# GH14956
557572
array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object)

0 commit comments

Comments
 (0)