Skip to content

Commit e7e2f1c

Browse files
jbrockmendel authored and mroeschke committed
REF: use maybe_convert_objects in pd.array (pandas-dev#56484)
* REF: use maybe_convert_objects in pd.array
* lint fixups
* Update pandas/_libs/lib.pyx

Co-authored-by: Matthew Roeschke <[email protected]>

---------

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 0dd37c5 commit e7e2f1c

File tree

4 files changed

+109
-54
lines changed

4 files changed

+109
-54
lines changed

pandas/_libs/lib.pyx

+34-9
Original file line number | Diff line number | Diff line change
@@ -2628,7 +2628,11 @@ def maybe_convert_objects(ndarray[object] objects,
26282628
seen.object_ = True
26292629
break
26302630
elif val is C_NA:
2631-
seen.object_ = True
2631+
if convert_to_nullable_dtype:
2632+
seen.null_ = True
2633+
mask[i] = True
2634+
else:
2635+
seen.object_ = True
26322636
continue
26332637
else:
26342638
seen.object_ = True
@@ -2691,6 +2695,12 @@ def maybe_convert_objects(ndarray[object] objects,
26912695
dtype = StringDtype(storage="pyarrow_numpy")
26922696
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
26932697

2698+
elif convert_to_nullable_dtype and is_string_array(objects, skipna=True):
2699+
from pandas.core.arrays.string_ import StringDtype
2700+
2701+
dtype = StringDtype()
2702+
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
2703+
26942704
seen.object_ = True
26952705
elif seen.interval_:
26962706
if is_interval_array(objects):
@@ -2734,12 +2744,12 @@ def maybe_convert_objects(ndarray[object] objects,
27342744
return objects
27352745

27362746
if seen.bool_:
2737-
if seen.is_bool:
2738-
# is_bool property rules out everything else
2739-
return bools.view(np.bool_)
2740-
elif convert_to_nullable_dtype and seen.is_bool_or_na:
2747+
if convert_to_nullable_dtype and seen.is_bool_or_na:
27412748
from pandas.core.arrays import BooleanArray
27422749
return BooleanArray(bools.view(np.bool_), mask)
2750+
elif seen.is_bool:
2751+
# is_bool property rules out everything else
2752+
return bools.view(np.bool_)
27432753
seen.object_ = True
27442754

27452755
if not seen.object_:
@@ -2752,11 +2762,11 @@ def maybe_convert_objects(ndarray[object] objects,
27522762
result = floats
27532763
elif seen.int_ or seen.uint_:
27542764
if convert_to_nullable_dtype:
2755-
from pandas.core.arrays import IntegerArray
2765+
# Below we will wrap in IntegerArray
27562766
if seen.uint_:
2757-
result = IntegerArray(uints, mask)
2767+
result = uints
27582768
else:
2759-
result = IntegerArray(ints, mask)
2769+
result = ints
27602770
else:
27612771
result = floats
27622772
elif seen.nan_:
@@ -2771,7 +2781,6 @@ def maybe_convert_objects(ndarray[object] objects,
27712781
result = uints
27722782
else:
27732783
result = ints
2774-
27752784
else:
27762785
# don't cast int to float, etc.
27772786
if seen.null_:
@@ -2794,6 +2803,22 @@ def maybe_convert_objects(ndarray[object] objects,
27942803
else:
27952804
result = ints
27962805

2806+
# TODO: do these after the itemsize check?
2807+
if (result is ints or result is uints) and convert_to_nullable_dtype:
2808+
from pandas.core.arrays import IntegerArray
2809+
2810+
# Set these values to 1 to be deterministic, match
2811+
# IntegerArray._internal_fill_value
2812+
result[mask] = 1
2813+
result = IntegerArray(result, mask)
2814+
elif result is floats and convert_to_nullable_dtype:
2815+
from pandas.core.arrays import FloatingArray
2816+
2817+
# Set these values to 1.0 to be deterministic, match
2818+
# FloatingArray._internal_fill_value
2819+
result[mask] = 1.0
2820+
result = FloatingArray(result, mask)
2821+
27972822
if result is uints or result is ints or result is floats or result is complexes:
27982823
# cast to the largest itemsize when all values are NumPy scalars
27992824
if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:

pandas/core/construction.py

+57-43
Original file line number | Diff line number | Diff line change
@@ -7,11 +7,8 @@
77

88
from __future__ import annotations
99

10-
from collections.abc import Sequence
1110
from typing import (
1211
TYPE_CHECKING,
13-
Optional,
14-
Union,
1512
cast,
1613
overload,
1714
)
@@ -23,17 +20,9 @@
2320

2421
from pandas._libs import lib
2522
from pandas._libs.tslibs import (
26-
Period,
2723
get_supported_dtype,
2824
is_supported_dtype,
2925
)
30-
from pandas._typing import (
31-
AnyArrayLike,
32-
ArrayLike,
33-
Dtype,
34-
DtypeObj,
35-
T,
36-
)
3726

3827
from pandas.core.dtypes.base import ExtensionDtype
3928
from pandas.core.dtypes.cast import (
@@ -46,6 +35,7 @@
4635
maybe_promote,
4736
)
4837
from pandas.core.dtypes.common import (
38+
ensure_object,
4939
is_list_like,
5040
is_object_dtype,
5141
is_string_dtype,
@@ -63,11 +53,25 @@
6353
import pandas.core.common as com
6454

6555
if TYPE_CHECKING:
56+
from collections.abc import Sequence
57+
58+
from pandas._typing import (
59+
AnyArrayLike,
60+
ArrayLike,
61+
Dtype,
62+
DtypeObj,
63+
T,
64+
)
65+
6666
from pandas import (
6767
Index,
6868
Series,
6969
)
70-
from pandas.core.arrays.base import ExtensionArray
70+
from pandas.core.arrays import (
71+
DatetimeArray,
72+
ExtensionArray,
73+
TimedeltaArray,
74+
)
7175

7276

7377
def array(
@@ -286,9 +290,7 @@ def array(
286290
ExtensionArray,
287291
FloatingArray,
288292
IntegerArray,
289-
IntervalArray,
290293
NumpyExtensionArray,
291-
PeriodArray,
292294
TimedeltaArray,
293295
)
294296
from pandas.core.arrays.string_ import StringDtype
@@ -320,46 +322,58 @@ def array(
320322
return cls._from_sequence(data, dtype=dtype, copy=copy)
321323

322324
if dtype is None:
323-
inferred_dtype = lib.infer_dtype(data, skipna=True)
324-
if inferred_dtype == "period":
325-
period_data = cast(Union[Sequence[Optional[Period]], AnyArrayLike], data)
326-
return PeriodArray._from_sequence(period_data, copy=copy)
327-
328-
elif inferred_dtype == "interval":
329-
return IntervalArray(data, copy=copy)
330-
331-
elif inferred_dtype.startswith("datetime"):
332-
# datetime, datetime64
333-
try:
334-
return DatetimeArray._from_sequence(data, copy=copy)
335-
except ValueError:
336-
# Mixture of timezones, fall back to NumpyExtensionArray
337-
pass
338-
339-
elif inferred_dtype.startswith("timedelta"):
340-
# timedelta, timedelta64
341-
return TimedeltaArray._from_sequence(data, copy=copy)
342-
343-
elif inferred_dtype == "string":
325+
was_ndarray = isinstance(data, np.ndarray)
326+
# error: Item "Sequence[object]" of "Sequence[object] | ExtensionArray |
327+
# ndarray[Any, Any]" has no attribute "dtype"
328+
if not was_ndarray or data.dtype == object: # type: ignore[union-attr]
329+
result = lib.maybe_convert_objects(
330+
ensure_object(data),
331+
convert_non_numeric=True,
332+
convert_to_nullable_dtype=True,
333+
dtype_if_all_nat=None,
334+
)
335+
result = ensure_wrapped_if_datetimelike(result)
336+
if isinstance(result, np.ndarray):
337+
if len(result) == 0 and not was_ndarray:
338+
# e.g. empty list
339+
return FloatingArray._from_sequence(data, dtype="Float64")
340+
return NumpyExtensionArray._from_sequence(
341+
data, dtype=result.dtype, copy=copy
342+
)
343+
if result is data and copy:
344+
return result.copy()
345+
return result
346+
347+
data = cast(np.ndarray, data)
348+
result = ensure_wrapped_if_datetimelike(data)
349+
if result is not data:
350+
result = cast("DatetimeArray | TimedeltaArray", result)
351+
if copy and result.dtype == data.dtype:
352+
return result.copy()
353+
return result
354+
355+
if data.dtype.kind in "SU":
344356
# StringArray/ArrowStringArray depending on pd.options.mode.string_storage
345357
dtype = StringDtype()
346358
cls = dtype.construct_array_type()
347359
return cls._from_sequence(data, dtype=dtype, copy=copy)
348360

349-
elif inferred_dtype == "integer":
361+
elif data.dtype.kind in "iu":
350362
return IntegerArray._from_sequence(data, copy=copy)
351-
elif inferred_dtype == "empty" and not hasattr(data, "dtype") and not len(data):
352-
return FloatingArray._from_sequence(data, copy=copy)
353-
elif (
354-
inferred_dtype in ("floating", "mixed-integer-float")
355-
and getattr(data, "dtype", None) != np.float16
356-
):
363+
elif data.dtype.kind == "f":
357364
# GH#44715 Exclude np.float16 bc FloatingArray does not support it;
358365
# we will fall back to NumpyExtensionArray.
366+
if data.dtype == np.float16:
367+
return NumpyExtensionArray._from_sequence(
368+
data, dtype=data.dtype, copy=copy
369+
)
359370
return FloatingArray._from_sequence(data, copy=copy)
360371

361-
elif inferred_dtype == "boolean":
372+
elif data.dtype.kind == "b":
362373
return BooleanArray._from_sequence(data, dtype="boolean", copy=copy)
374+
else:
375+
# e.g. complex
376+
return NumpyExtensionArray._from_sequence(data, dtype=data.dtype, copy=copy)
363377

364378
# Pandas overrides NumPy for
365379
# 1. datetime64[ns,us,ms,s]

pandas/tests/arrays/test_array.py

+16
Original file line number | Diff line number | Diff line change
@@ -220,6 +220,14 @@ def test_dt64_array(dtype_unit):
220220
.construct_array_type()
221221
._from_sequence(["a", None], dtype=pd.StringDtype()),
222222
),
223+
(
224+
# numpy array with string dtype
225+
np.array(["a", "b"], dtype=str),
226+
None,
227+
pd.StringDtype()
228+
.construct_array_type()
229+
._from_sequence(["a", "b"], dtype=pd.StringDtype()),
230+
),
223231
# Boolean
224232
(
225233
[True, None],
@@ -247,6 +255,14 @@ def test_dt64_array(dtype_unit):
247255
"category",
248256
pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
249257
),
258+
# Complex
259+
(
260+
np.array([complex(1), complex(2)], dtype=np.complex128),
261+
None,
262+
NumpyExtensionArray(
263+
np.array([complex(1), complex(2)], dtype=np.complex128)
264+
),
265+
),
250266
],
251267
)
252268
def test_array(data, dtype, expected):

pandas/tests/dtypes/test_inference.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -936,9 +936,9 @@ def test_maybe_convert_objects_bool_nan(self):
936936
def test_maybe_convert_objects_nullable_boolean(self):
937937
# GH50047
938938
arr = np.array([True, False], dtype=object)
939-
exp = np.array([True, False])
939+
exp = BooleanArray._from_sequence([True, False], dtype="boolean")
940940
out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)
941-
tm.assert_numpy_array_equal(out, exp)
941+
tm.assert_extension_array_equal(out, exp)
942942

943943
arr = np.array([True, False, pd.NaT], dtype=object)
944944
exp = np.array([True, False, pd.NaT], dtype=object)

0 commit comments

Comments
 (0)