ENH: Make maybe_convert_objects respect dtype itemsize (pandas-dev#40908)

rhshadrach · yeshsurya · commit 65b2f4a8734b · 2021-05-06T14:24:40.000+05:30
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -221,6 +221,7 @@ Other enhancements
 - :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
 - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
 - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
+- Constructing a :class:`DataFrame` or :class:`Series` from a Python iterable of NumPy scalars (i.e. ``data`` that is *not* a NumPy ``ndarray``) now results in a dtype whose precision is the maximum of the precisions of those NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
 - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
 -
 
@@ -691,7 +692,7 @@ Numeric
 - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
 - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
 - Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`)
--
+- Bug in :meth:`Series.count` returning an ``int32`` result on 32-bit platforms when ``level=None``; the result is now cast to ``int64`` (:issue:`40908`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -68,6 +68,9 @@ cdef extern from "numpy/arrayobject.h":
             object fields
             tuple names
 
+cdef extern from "numpy/ndarrayobject.h":
+    bint PyArray_CheckScalar(obj) nogil
+
 
 cdef extern from "src/parse_helper.h":
     int floatify(object, float64_t *result, int *maybe_int) except -1
@@ -209,6 +212,24 @@ def is_scalar(val: object) -> bool:
             or is_offset_object(val))
 
 
+cdef inline int64_t get_itemsize(object val):
+    """
+    Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.
+
+    Parameters
+    ----------
+    val : object
+
+    Returns
+    -------
+    itemsize : int64_t
+    """
+    if PyArray_CheckScalar(val):
+        return cnp.PyArray_DescrFromScalar(val).itemsize
+    else:
+        return -1
+
+
 def is_iterator(obj: object) -> bool:
     """
     Check if the object is an iterator.
@@ -2188,7 +2209,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
 
     Parameters
     ----------
-    values : ndarray[object]
+    objects : ndarray[object]
         Array of object elements to convert.
     try_float : bool, default False
         If an array-like object contains only float or NaN values is
@@ -2212,7 +2233,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
         Array of converted object values to more specific dtypes if applicable.
     """
     cdef:
-        Py_ssize_t i, n
+        Py_ssize_t i, n, itemsize_max = 0
         ndarray[float64_t] floats
         ndarray[complex128_t] complexes
         ndarray[int64_t] ints
@@ -2245,6 +2266,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
 
     for i in range(n):
         val = objects[i]
+        if itemsize_max != -1:
+            itemsize = get_itemsize(val)
+            if itemsize > itemsize_max or itemsize == -1:
+                itemsize_max = itemsize
 
         if val is None:
             seen.null_ = True
@@ -2346,92 +2371,101 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
         seen.object_ = True
 
     if not seen.object_:
+        result = None
         if not safe:
             if seen.null_ or seen.nan_:
                 if seen.is_float_or_complex:
                     if seen.complex_:
-                        return complexes
+                        result = complexes
                     elif seen.float_:
-                        return floats
+                        result = floats
                     elif seen.int_:
                         if convert_to_nullable_integer:
                             from pandas.core.arrays import IntegerArray
-                            return IntegerArray(ints, mask)
+                            result = IntegerArray(ints, mask)
                         else:
-                            return floats
+                            result = floats
                     elif seen.nan_:
-                        return floats
+                        result = floats
             else:
                 if not seen.bool_:
                     if seen.datetime_:
                         if not seen.numeric_ and not seen.timedelta_:
-                            return datetimes
+                            result = datetimes
                     elif seen.timedelta_:
                         if not seen.numeric_:
-                            return timedeltas
+                            result = timedeltas
                     elif seen.nat_:
                         if not seen.numeric_:
                             if convert_datetime and convert_timedelta:
                                 # TODO: array full of NaT ambiguity resolve here needed
                                 pass
                             elif convert_datetime:
-                                return datetimes
+                                result = datetimes
                             elif convert_timedelta:
-                                return timedeltas
+                                result = timedeltas
                     else:
                         if seen.complex_:
-                            return complexes
+                            result = complexes
                         elif seen.float_:
-                            return floats
+                            result = floats
                         elif seen.int_:
                             if seen.uint_:
-                                return uints
+                                result = uints
                             else:
-                                return ints
+                                result = ints
                 elif seen.is_bool:
-                    return bools.view(np.bool_)
+                    result = bools.view(np.bool_)
 
         else:
             # don't cast int to float, etc.
             if seen.null_:
                 if seen.is_float_or_complex:
                     if seen.complex_:
                         if not seen.int_:
-                            return complexes
+                            result = complexes
                     elif seen.float_ or seen.nan_:
                         if not seen.int_:
-                            return floats
+                            result = floats
             else:
                 if not seen.bool_:
                     if seen.datetime_:
                         if not seen.numeric_ and not seen.timedelta_:
-                            return datetimes
+                            result = datetimes
                     elif seen.timedelta_:
                         if not seen.numeric_:
-                            return timedeltas
+                            result = timedeltas
                     elif seen.nat_:
                         if not seen.numeric_:
                             if convert_datetime and convert_timedelta:
                                 # TODO: array full of NaT ambiguity resolve here needed
                                 pass
                             elif convert_datetime:
-                                return datetimes
+                                result = datetimes
                             elif convert_timedelta:
-                                return timedeltas
+                                result = timedeltas
                     else:
                         if seen.complex_:
                             if not seen.int_:
-                                return complexes
+                                result = complexes
                         elif seen.float_ or seen.nan_:
                             if not seen.int_:
-                                return floats
+                                result = floats
                         elif seen.int_:
                             if seen.uint_:
-                                return uints
+                                result = uints
                             else:
-                                return ints
+                                result = ints
                 elif seen.is_bool and not seen.nan_:
-                    return bools.view(np.bool_)
+                    result = bools.view(np.bool_)
+
+        if result is uints or result is ints or result is floats or result is complexes:
+            # cast to the largest itemsize when all values are NumPy scalars
+            if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:
+                result = result.astype(result.dtype.kind + str(itemsize_max))
+            return result
+        elif result is not None:
+            return result
 
     return objects
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1891,7 +1891,7 @@ def count(self, level=None):
         2
         """
         if level is None:
-            return notna(self._values).sum()
+            return notna(self._values).sum().astype("int64")
         else:
             warnings.warn(
                 "Using the level keyword in DataFrame and Series aggregations is "
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -31,6 +31,7 @@
 from pandas.core.dtypes.common import (
     ensure_int32,
     is_bool,
+    is_complex,
     is_datetime64_any_dtype,
     is_datetime64_dtype,
     is_datetime64_ns_dtype,
@@ -614,6 +615,69 @@ def test_maybe_convert_objects_bool_nan(self):
         out = lib.maybe_convert_objects(ind.values, safe=1)
         tm.assert_numpy_array_equal(out, exp)
 
+    @pytest.mark.parametrize(
+        "data0",
+        [
+            True,
+            1,
+            1.0,
+            1.0 + 1.0j,
+            np.int8(1),
+            np.int16(1),
+            np.int32(1),
+            np.int64(1),
+            np.float16(1),
+            np.float32(1),
+            np.float64(1),
+            np.complex64(1),
+            np.complex128(1),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "data1",
+        [
+            True,
+            1,
+            1.0,
+            1.0 + 1.0j,
+            np.int8(1),
+            np.int16(1),
+            np.int32(1),
+            np.int64(1),
+            np.float16(1),
+            np.float32(1),
+            np.float64(1),
+            np.complex64(1),
+            np.complex128(1),
+        ],
+    )
+    def test_maybe_convert_objects_itemsize(self, data0, data1):
+        # GH 40908: expected dtype is built from a kind plus an itemsize suffix
+        data = [data0, data1]
+        arr = np.array(data, dtype="object")
+
+        common_kind = np.find_common_type(
+            [type(data0), type(data1)], scalar_types=[]
+        ).kind
+        kind0 = "python" if not hasattr(data0, "dtype") else data0.dtype.kind
+        kind1 = "python" if not hasattr(data1, "dtype") else data1.dtype.kind
+        if kind0 != "python" and kind1 != "python":  # both values have a dtype
+            kind = common_kind
+            itemsize = max(data0.dtype.itemsize, data1.dtype.itemsize)
+        elif is_bool(data0) or is_bool(data1):
+            kind = "bool" if (is_bool(data0) and is_bool(data1)) else "object"
+            itemsize = ""  # "bool" / "object" are used without a size suffix
+        elif is_complex(data0) or is_complex(data1):
+            kind = common_kind
+            itemsize = 16  # 16-byte suffix for the complex case
+        else:
+            kind = common_kind
+            itemsize = 8  # 8-byte suffix, e.g. "i8" / "f8"
+
+        expected = np.array(data, dtype=f"{kind}{itemsize}")
+        result = lib.maybe_convert_objects(arr)
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_mixed_dtypes_remain_object_array(self):
         # GH14956
         arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object)
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
@@ -16,10 +16,6 @@
 import numpy as np
 import pytest
 
-from pandas.compat import (
-    IS64,
-    is_platform_windows,
-)
 from pandas.errors import PerformanceWarning
 
 from pandas.core.dtypes.common import is_object_dtype
@@ -428,9 +424,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
         ]:
             mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch")
             request.node.add_marker(mark)
-        elif is_platform_windows() or not IS64:
-            mark = pytest.mark.xfail(reason="results are int32, expected int64")
-            request.node.add_marker(mark)
         super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
 
 
diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py
@@ -117,7 +117,7 @@ def test_from_records_sequencelike(self):
         result = DataFrame.from_records(tuples, exclude=exclude)
         result.columns = [columns[i] for i in sorted(columns_to_test)]
         tm.assert_series_equal(result["C"], df["C"])
-        tm.assert_series_equal(result["E1"], df["E1"].astype("float64"))
+        tm.assert_series_equal(result["E1"], df["E1"])
 
     def test_from_records_sequencelike_empty(self):
         # empty case
diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py
@@ -10,6 +10,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import np_version_under1p20
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -1514,8 +1516,14 @@ def test_replace_commutative(self, df, to_replace, exp):
             np.float64(1),
         ],
     )
-    def test_replace_replacer_dtype(self, replacer):
+    def test_replace_replacer_dtype(self, request, replacer):
         # GH26632
+        if np.isscalar(replacer) and replacer.dtype.itemsize < 8:
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    np_version_under1p20, reason="np.putmask doesn't coerce dtype"
+                )
+            )
         df = DataFrame(["a"])
         result = df.replace({"a": replacer, "b": replacer})
         expected = DataFrame([replacer])
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -1924,12 +1924,12 @@ def test_constructor_for_list_with_dtypes(self):
         # test list of lists/ndarrays
         df = DataFrame([np.arange(5) for x in range(5)])
         result = df.dtypes
-        expected = Series([np.dtype("int64")] * 5)
+        expected = Series([np.dtype("int")] * 5)
         tm.assert_series_equal(result, expected)
 
         df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)])
         result = df.dtypes
-        expected = Series([np.dtype("int64")] * 5)
+        expected = Series([np.dtype("int32")] * 5)
         tm.assert_series_equal(result, expected)
 
         # overflow issue? (we always expected int64 upcasting here)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -99,10 +99,7 @@ def max_value(group):
 
     applied = df.groupby("A").apply(max_value)
     result = applied.dtypes
-    expected = Series(
-        [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")],
-        index=["A", "B", "C", "D", "value"],
-    )
+    expected = df.dtypes
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py
@@ -641,7 +641,7 @@ def test_where_series_complex128(self, fill_val, exp_dtype):
             values = klass([True, False, True, True])
         else:
             values = klass(x * fill_val for x in [5, 6, 7, 8])
-        exp = klass([1 + 1j, values[1], 3 + 3j, values[3]])
+        exp = klass([1 + 1j, values[1], 3 + 3j, values[3]], dtype=exp_dtype)
         self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
 
     @pytest.mark.parametrize(

Original file line number	Diff line number	Diff line change
`@@ -1891,7 +1891,7 @@ def count(self, level=None):`
`1891`	`1891`	`2`
`1892`	`1892`	`"""`
`1893`	`1893`	`if level is None:`
`1894`		`- return notna(self._values).sum()`
	`1894`	`+ return notna(self._values).sum().astype("int64")`
`1895`	`1895`	`else:`
`1896`	`1896`	`warnings.warn(`
`1897`	`1897`	`"Using the level keyword in DataFrame and Series aggregations is "`