Skip to content

Commit dbc647d

Browse files
author
Jiang Yue
committed
Use IntegerArray for integer arrays with null
1 parent c74a853 commit dbc647d

File tree

4 files changed

+40
-15
lines changed

4 files changed

+40
-15
lines changed

pandas/_libs/lib.pyx

+21-5
Original file line numberDiff line numberDiff line change
@@ -944,6 +944,7 @@ cdef class Seen:
944944
bint int_ # seen_int
945945
bint bool_ # seen_bool
946946
bint null_ # seen_null
947+
bint nan_ # seen_np.nan
947948
bint uint_ # seen_uint (unsigned integer)
948949
bint sint_ # seen_sint (signed integer)
949950
bint float_ # seen_float
@@ -967,6 +968,7 @@ cdef class Seen:
967968
self.int_ = 0
968969
self.bool_ = 0
969970
self.null_ = 0
971+
self.nan_ = 0
970972
self.uint_ = 0
971973
self.sint_ = 0
972974
self.float_ = 0
@@ -1905,7 +1907,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
19051907
@cython.wraparound(False)
19061908
def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
19071909
bint safe=0, bint convert_datetime=0,
1908-
bint convert_timedelta=0):
1910+
bint convert_timedelta=0, to_integer_array=False):
19091911
"""
19101912
Type inference function-- convert object array to proper dtype
19111913
"""
@@ -1929,6 +1931,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
19291931
ints = np.empty(n, dtype='i8')
19301932
uints = np.empty(n, dtype='u8')
19311933
bools = np.empty(n, dtype=np.uint8)
1934+
mask = np.full(n, False)
19321935

19331936
if convert_datetime:
19341937
datetimes = np.empty(n, dtype='M8[ns]')
@@ -1946,6 +1949,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
19461949
if val is None:
19471950
seen.null_ = 1
19481951
floats[i] = complexes[i] = fnan
1952+
mask[i] = True
19491953
elif val is NaT:
19501954
if convert_datetime:
19511955
idatetimes[i] = NPY_NAT
@@ -1956,6 +1960,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
19561960
if not (convert_datetime or convert_timedelta):
19571961
seen.object_ = 1
19581962
break
1963+
elif val is np.nan:
1964+
seen.nan_ = 1
1965+
mask[i] = True
1966+
floats[i] = complexes[i] = val
19591967
elif util.is_bool_object(val):
19601968
seen.bool_ = 1
19611969
bools[i] = val
@@ -2037,11 +2045,19 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
20372045

20382046
if not seen.object_:
20392047
if not safe:
2040-
if seen.null_:
2048+
if seen.null_ or seen.nan_:
20412049
if seen.is_float_or_complex:
20422050
if seen.complex_:
20432051
return complexes
2044-
elif seen.float_ or seen.int_:
2052+
elif seen.float_:
2053+
return floats
2054+
elif seen.int_:
2055+
if to_integer_array:
2056+
from pandas.core.arrays import IntegerArray
2057+
return IntegerArray(ints, mask)
2058+
else:
2059+
return floats
2060+
elif seen.nan_:
20452061
return floats
20462062
else:
20472063
if not seen.bool_:
@@ -2071,7 +2087,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
20712087
if seen.complex_:
20722088
if not seen.int_:
20732089
return complexes
2074-
elif seen.float_:
2090+
elif seen.float_ or seen.nan_:
20752091
if not seen.int_:
20762092
return floats
20772093
else:
@@ -2086,7 +2102,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
20862102
if seen.complex_:
20872103
if not seen.int_:
20882104
return complexes
2089-
elif seen.float_:
2105+
elif seen.float_ or seen.nan_:
20902106
if not seen.int_:
20912107
return floats
20922108
elif seen.int_:

pandas/core/frame.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,8 @@ def _constructor_expanddim(self):
389389
# ----------------------------------------------------------------------
390390
# Constructors
391391

392-
def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
392+
def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False,
393+
to_integer_array=False):
393394
if data is None:
394395
data = {}
395396
if dtype is not None:
@@ -442,7 +443,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
442443
if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
443444
if is_named_tuple(data[0]) and columns is None:
444445
columns = data[0]._fields
445-
arrays, columns = to_arrays(data, columns, dtype=dtype)
446+
arrays, columns = to_arrays(data, columns, dtype=dtype,
447+
to_integer_array=to_integer_array)
446448
columns = ensure_index(columns)
447449

448450
# set the index

pandas/core/internals/construction.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,8 @@ def _get_axes(N, K, index, columns):
437437
# Conversion of Inputs to Arrays
438438

439439

440-
def to_arrays(data, columns, coerce_float=False, dtype=None):
440+
def to_arrays(data, columns, coerce_float=False, dtype=None,
441+
to_integer_array=False):
441442
"""
442443
Return list of arrays, columns.
443444
"""
@@ -464,7 +465,8 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
464465
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
465466
elif isinstance(data[0], abc.Mapping):
466467
return _list_of_dict_to_arrays(
467-
data, columns, coerce_float=coerce_float, dtype=dtype
468+
data, columns, coerce_float=coerce_float, dtype=dtype,
469+
to_integer_array=to_integer_array
468470
)
469471
elif isinstance(data[0], ABCSeries):
470472
return _list_of_series_to_arrays(
@@ -535,7 +537,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
535537
return values.T, columns
536538

537539

538-
def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
540+
def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None,
541+
to_integer_array=False):
539542
if columns is None:
540543
gen = (list(x.keys()) for x in data)
541544
sort = not any(isinstance(d, OrderedDict) for d in data)
@@ -547,11 +550,13 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
547550

548551
content = list(lib.dicts_to_array(data, list(columns)).T)
549552
return _convert_object_array(
550-
content, columns, dtype=dtype, coerce_float=coerce_float
553+
content, columns, dtype=dtype, coerce_float=coerce_float,
554+
to_integer_array=to_integer_array
551555
)
552556

553557

554-
def _convert_object_array(content, columns, coerce_float=False, dtype=None):
558+
def _convert_object_array(content, columns, coerce_float=False, dtype=None,
559+
to_integer_array=False):
555560
if columns is None:
556561
columns = ibase.default_index(len(content))
557562
else:
@@ -565,7 +570,8 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None):
565570
# provide soft conversion of object dtypes
566571
def convert(arr):
567572
if dtype != object and dtype != np.object:
568-
arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
573+
arr = lib.maybe_convert_objects(arr, try_float=coerce_float,
574+
to_integer_array=to_integer_array)
569575
arr = maybe_cast_to_datetime(arr, dtype)
570576
return arr
571577

pandas/io/json/_normalize.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def json_normalize(
120120
errors: Optional[str] = "raise",
121121
sep: str = ".",
122122
max_level: Optional[int] = None,
123+
to_integer_array: Optional[bool] = False
123124
):
124125
"""
125126
Normalize semi-structured JSON data into a flat table.
@@ -264,7 +265,7 @@ def _pull_field(js, spec):
264265
# TODO: handle record value which are lists, at least error
265266
# reasonably
266267
data = nested_to_record(data, sep=sep, max_level=max_level)
267-
return DataFrame(data)
268+
return DataFrame(data, to_integer_array=to_integer_array)
268269
elif not isinstance(record_path, list):
269270
record_path = [record_path]
270271

@@ -326,7 +327,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
326327

327328
_recursive_extract(data, record_path, {}, level=0)
328329

329-
result = DataFrame(records)
330+
result = DataFrame(records, to_integer_array=to_integer_array)
330331

331332
if record_prefix is not None:
332333
result = result.rename(columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))

0 commit comments

Comments
 (0)