Skip to content

Commit 65b2f4a

Browse files
rhshadrach authored and yeshsurya committed
ENH: Make maybe_convert_object respect dtype itemsize (pandas-dev#40908)
1 parent 32e1612 commit 65b2f4a

File tree

10 files changed

+142
-45
lines changed

10 files changed

+142
-45
lines changed

doc/source/whatsnew/v1.3.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ Other enhancements
221221
- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
222222
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
223223
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
224+
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable of NumPy scalars that is *not* a NumPy ``ndarray`` will now result in a dtype whose precision is the maximum precision of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
224225
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
225226
-
226227

@@ -691,7 +692,7 @@ Numeric
691692
- Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
692693
- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
693694
- Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`)
694-
-
695+
- Bug in :meth:`Series.count` where the result was ``int32`` instead of ``int64`` on 32-bit platforms when ``level=None`` (:issue:`40908`)
695696

696697
Conversion
697698
^^^^^^^^^^

pandas/_libs/lib.pyx

+61-27
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ cdef extern from "numpy/arrayobject.h":
6868
object fields
6969
tuple names
7070

71+
cdef extern from "numpy/ndarrayobject.h":
72+
bint PyArray_CheckScalar(obj) nogil
73+
7174

7275
cdef extern from "src/parse_helper.h":
7376
int floatify(object, float64_t *result, int *maybe_int) except -1
@@ -209,6 +212,24 @@ def is_scalar(val: object) -> bool:
209212
or is_offset_object(val))
210213

211214

215+
cdef inline int64_t get_itemsize(object val):
216+
"""
217+
Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.
218+
219+
Parameters
220+
----------
221+
val : object
222+
223+
Returns
224+
-------
225+
itemsize : int64_t
226+
"""
227+
if PyArray_CheckScalar(val):
228+
return cnp.PyArray_DescrFromScalar(val).itemsize
229+
else:
230+
return -1
231+
232+
212233
def is_iterator(obj: object) -> bool:
213234
"""
214235
Check if the object is an iterator.
@@ -2188,7 +2209,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
21882209

21892210
Parameters
21902211
----------
2191-
values : ndarray[object]
2212+
objects : ndarray[object]
21922213
Array of object elements to convert.
21932214
try_float : bool, default False
21942215
If an array-like object contains only float or NaN values is
@@ -2212,7 +2233,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
22122233
Array of converted object values to more specific dtypes if applicable.
22132234
"""
22142235
cdef:
2215-
Py_ssize_t i, n
2236+
Py_ssize_t i, n, itemsize_max = 0
22162237
ndarray[float64_t] floats
22172238
ndarray[complex128_t] complexes
22182239
ndarray[int64_t] ints
@@ -2245,6 +2266,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
22452266

22462267
for i in range(n):
22472268
val = objects[i]
2269+
if itemsize_max != -1:
2270+
itemsize = get_itemsize(val)
2271+
if itemsize > itemsize_max or itemsize == -1:
2272+
itemsize_max = itemsize
22482273

22492274
if val is None:
22502275
seen.null_ = True
@@ -2346,92 +2371,101 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
23462371
seen.object_ = True
23472372

23482373
if not seen.object_:
2374+
result = None
23492375
if not safe:
23502376
if seen.null_ or seen.nan_:
23512377
if seen.is_float_or_complex:
23522378
if seen.complex_:
2353-
return complexes
2379+
result = complexes
23542380
elif seen.float_:
2355-
return floats
2381+
result = floats
23562382
elif seen.int_:
23572383
if convert_to_nullable_integer:
23582384
from pandas.core.arrays import IntegerArray
2359-
return IntegerArray(ints, mask)
2385+
result = IntegerArray(ints, mask)
23602386
else:
2361-
return floats
2387+
result = floats
23622388
elif seen.nan_:
2363-
return floats
2389+
result = floats
23642390
else:
23652391
if not seen.bool_:
23662392
if seen.datetime_:
23672393
if not seen.numeric_ and not seen.timedelta_:
2368-
return datetimes
2394+
result = datetimes
23692395
elif seen.timedelta_:
23702396
if not seen.numeric_:
2371-
return timedeltas
2397+
result = timedeltas
23722398
elif seen.nat_:
23732399
if not seen.numeric_:
23742400
if convert_datetime and convert_timedelta:
23752401
# TODO: need to resolve the ambiguity of an array full of NaT here
23762402
pass
23772403
elif convert_datetime:
2378-
return datetimes
2404+
result = datetimes
23792405
elif convert_timedelta:
2380-
return timedeltas
2406+
result = timedeltas
23812407
else:
23822408
if seen.complex_:
2383-
return complexes
2409+
result = complexes
23842410
elif seen.float_:
2385-
return floats
2411+
result = floats
23862412
elif seen.int_:
23872413
if seen.uint_:
2388-
return uints
2414+
result = uints
23892415
else:
2390-
return ints
2416+
result = ints
23912417
elif seen.is_bool:
2392-
return bools.view(np.bool_)
2418+
result = bools.view(np.bool_)
23932419

23942420
else:
23952421
# don't cast int to float, etc.
23962422
if seen.null_:
23972423
if seen.is_float_or_complex:
23982424
if seen.complex_:
23992425
if not seen.int_:
2400-
return complexes
2426+
result = complexes
24012427
elif seen.float_ or seen.nan_:
24022428
if not seen.int_:
2403-
return floats
2429+
result = floats
24042430
else:
24052431
if not seen.bool_:
24062432
if seen.datetime_:
24072433
if not seen.numeric_ and not seen.timedelta_:
2408-
return datetimes
2434+
result = datetimes
24092435
elif seen.timedelta_:
24102436
if not seen.numeric_:
2411-
return timedeltas
2437+
result = timedeltas
24122438
elif seen.nat_:
24132439
if not seen.numeric_:
24142440
if convert_datetime and convert_timedelta:
24152441
# TODO: need to resolve the ambiguity of an array full of NaT here
24162442
pass
24172443
elif convert_datetime:
2418-
return datetimes
2444+
result = datetimes
24192445
elif convert_timedelta:
2420-
return timedeltas
2446+
result = timedeltas
24212447
else:
24222448
if seen.complex_:
24232449
if not seen.int_:
2424-
return complexes
2450+
result = complexes
24252451
elif seen.float_ or seen.nan_:
24262452
if not seen.int_:
2427-
return floats
2453+
result = floats
24282454
elif seen.int_:
24292455
if seen.uint_:
2430-
return uints
2456+
result = uints
24312457
else:
2432-
return ints
2458+
result = ints
24332459
elif seen.is_bool and not seen.nan_:
2434-
return bools.view(np.bool_)
2460+
result = bools.view(np.bool_)
2461+
2462+
if result is uints or result is ints or result is floats or result is complexes:
2463+
# cast to the largest itemsize when all values are NumPy scalars
2464+
if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:
2465+
result = result.astype(result.dtype.kind + str(itemsize_max))
2466+
return result
2467+
elif result is not None:
2468+
return result
24352469

24362470
return objects
24372471

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1891,7 +1891,7 @@ def count(self, level=None):
18911891
2
18921892
"""
18931893
if level is None:
1894-
return notna(self._values).sum()
1894+
return notna(self._values).sum().astype("int64")
18951895
else:
18961896
warnings.warn(
18971897
"Using the level keyword in DataFrame and Series aggregations is "

pandas/tests/dtypes/test_inference.py

+64
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from pandas.core.dtypes.common import (
3232
ensure_int32,
3333
is_bool,
34+
is_complex,
3435
is_datetime64_any_dtype,
3536
is_datetime64_dtype,
3637
is_datetime64_ns_dtype,
@@ -614,6 +615,69 @@ def test_maybe_convert_objects_bool_nan(self):
614615
out = lib.maybe_convert_objects(ind.values, safe=1)
615616
tm.assert_numpy_array_equal(out, exp)
616617

618+
@pytest.mark.parametrize(
619+
"data0",
620+
[
621+
True,
622+
1,
623+
1.0,
624+
1.0 + 1.0j,
625+
np.int8(1),
626+
np.int16(1),
627+
np.int32(1),
628+
np.int64(1),
629+
np.float16(1),
630+
np.float32(1),
631+
np.float64(1),
632+
np.complex64(1),
633+
np.complex128(1),
634+
],
635+
)
636+
@pytest.mark.parametrize(
637+
"data1",
638+
[
639+
True,
640+
1,
641+
1.0,
642+
1.0 + 1.0j,
643+
np.int8(1),
644+
np.int16(1),
645+
np.int32(1),
646+
np.int64(1),
647+
np.float16(1),
648+
np.float32(1),
649+
np.float64(1),
650+
np.complex64(1),
651+
np.complex128(1),
652+
],
653+
)
654+
def test_maybe_convert_objects_itemsize(self, data0, data1):
655+
# GH 40908
656+
data = [data0, data1]
657+
arr = np.array(data, dtype="object")
658+
659+
common_kind = np.find_common_type(
660+
[type(data0), type(data1)], scalar_types=[]
661+
).kind
662+
kind0 = "python" if not hasattr(data0, "dtype") else data0.dtype.kind
663+
kind1 = "python" if not hasattr(data1, "dtype") else data1.dtype.kind
664+
if kind0 != "python" and kind1 != "python":
665+
kind = common_kind
666+
itemsize = max(data0.dtype.itemsize, data1.dtype.itemsize)
667+
elif is_bool(data0) or is_bool(data1):
668+
kind = "bool" if (is_bool(data0) and is_bool(data1)) else "object"
669+
itemsize = ""
670+
elif is_complex(data0) or is_complex(data1):
671+
kind = common_kind
672+
itemsize = 16
673+
else:
674+
kind = common_kind
675+
itemsize = 8
676+
677+
expected = np.array(data, dtype=f"{kind}{itemsize}")
678+
result = lib.maybe_convert_objects(arr)
679+
tm.assert_numpy_array_equal(result, expected)
680+
617681
def test_mixed_dtypes_remain_object_array(self):
618682
# GH14956
619683
arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object)

pandas/tests/extension/test_sparse.py

-7
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,6 @@
1616
import numpy as np
1717
import pytest
1818

19-
from pandas.compat import (
20-
IS64,
21-
is_platform_windows,
22-
)
2319
from pandas.errors import PerformanceWarning
2420

2521
from pandas.core.dtypes.common import is_object_dtype
@@ -428,9 +424,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
428424
]:
429425
mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch")
430426
request.node.add_marker(mark)
431-
elif is_platform_windows() or not IS64:
432-
mark = pytest.mark.xfail(reason="results are int32, expected int64")
433-
request.node.add_marker(mark)
434427
super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
435428

436429

pandas/tests/frame/constructors/test_from_records.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def test_from_records_sequencelike(self):
117117
result = DataFrame.from_records(tuples, exclude=exclude)
118118
result.columns = [columns[i] for i in sorted(columns_to_test)]
119119
tm.assert_series_equal(result["C"], df["C"])
120-
tm.assert_series_equal(result["E1"], df["E1"].astype("float64"))
120+
tm.assert_series_equal(result["E1"], df["E1"])
121121

122122
def test_from_records_sequencelike_empty(self):
123123
# empty case

pandas/tests/frame/methods/test_replace.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import numpy as np
1111
import pytest
1212

13+
from pandas.compat import np_version_under1p20
14+
1315
import pandas as pd
1416
from pandas import (
1517
DataFrame,
@@ -1514,8 +1516,14 @@ def test_replace_commutative(self, df, to_replace, exp):
15141516
np.float64(1),
15151517
],
15161518
)
1517-
def test_replace_replacer_dtype(self, replacer):
1519+
def test_replace_replacer_dtype(self, request, replacer):
15181520
# GH26632
1521+
if np.isscalar(replacer) and replacer.dtype.itemsize < 8:
1522+
request.node.add_marker(
1523+
pytest.mark.xfail(
1524+
np_version_under1p20, reason="np.putmask doesn't coerce dtype"
1525+
)
1526+
)
15191527
df = DataFrame(["a"])
15201528
result = df.replace({"a": replacer, "b": replacer})
15211529
expected = DataFrame([replacer])

pandas/tests/frame/test_constructors.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1924,12 +1924,12 @@ def test_constructor_for_list_with_dtypes(self):
19241924
# test list of lists/ndarrays
19251925
df = DataFrame([np.arange(5) for x in range(5)])
19261926
result = df.dtypes
1927-
expected = Series([np.dtype("int64")] * 5)
1927+
expected = Series([np.dtype("int")] * 5)
19281928
tm.assert_series_equal(result, expected)
19291929

19301930
df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)])
19311931
result = df.dtypes
1932-
expected = Series([np.dtype("int64")] * 5)
1932+
expected = Series([np.dtype("int32")] * 5)
19331933
tm.assert_series_equal(result, expected)
19341934

19351935
# overflow issue? (we always expected int64 upcasting here)

pandas/tests/groupby/test_groupby.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,7 @@ def max_value(group):
9999

100100
applied = df.groupby("A").apply(max_value)
101101
result = applied.dtypes
102-
expected = Series(
103-
[np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")],
104-
index=["A", "B", "C", "D", "value"],
105-
)
102+
expected = df.dtypes
106103
tm.assert_series_equal(result, expected)
107104

108105

pandas/tests/indexing/test_coercion.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -641,7 +641,7 @@ def test_where_series_complex128(self, fill_val, exp_dtype):
641641
values = klass([True, False, True, True])
642642
else:
643643
values = klass(x * fill_val for x in [5, 6, 7, 8])
644-
exp = klass([1 + 1j, values[1], 3 + 3j, values[3]])
644+
exp = klass([1 + 1j, values[1], 3 + 3j, values[3]], dtype=exp_dtype)
645645
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
646646

647647
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)