diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 44a3fefb1689a..fd8536e38eee7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,6 +6,7 @@ from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT +from pandas.util._validators import validate_bool_kwarg from .common import ( _INT64_DTYPE, @@ -696,9 +697,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer): if not np.isfinite(arr).all(): - raise ValueError( - "Cannot convert non-finite values (NA or inf) to " "integer" - ) + raise ValueError("Cannot convert non-finite values (NA or inf) to integer") elif is_object_dtype(arr): @@ -719,9 +718,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy) if dtype.name in ("datetime64", "timedelta64"): - msg = ( - "The '{dtype}' dtype has no unit. " "Please pass in '{dtype}[ns]' instead." - ) + msg = "The '{dtype}' dtype has no unit. Please pass in '{dtype}[ns]' instead." raise ValueError(msg.format(dtype=dtype.name)) if copy or is_object_dtype(arr) or is_object_dtype(dtype): @@ -731,50 +728,33 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): return arr.view(dtype) -def maybe_convert_objects( - values, convert_dates=True, convert_numeric=True, convert_timedeltas=True, copy=True -): - """ if we have an object dtype, try to coerce dates and/or numbers """ - - # if we have passed in a list or scalar - if isinstance(values, (list, tuple)): - values = np.array(values, dtype=np.object_) - if not hasattr(values, "dtype"): - values = np.array([values], dtype=np.object_) +def maybe_convert_objects(values: np.ndarray, convert_numeric: bool = True): + """ + If we have an object dtype array, try to coerce dates and/or numbers. - # convert dates - if convert_dates and values.dtype == np.object_: + Parameters + ---------- + values : ndarray + convert_numeric : bool, default True - # we take an aggressive stance and convert to datetime64[ns] - if convert_dates == "coerce": - new_values = maybe_cast_to_datetime(values, "M8[ns]", errors="coerce") + Returns + ------- + ndarray or DatetimeIndex + """ + validate_bool_kwarg(convert_numeric, "convert_numeric") - # if we are all nans then leave me alone - if not isna(new_values).all(): - values = new_values + orig_values = values - else: - values = lib.maybe_convert_objects(values, convert_datetime=convert_dates) + # convert dates + if is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, convert_datetime=True) # convert timedeltas - if convert_timedeltas and values.dtype == np.object_: - - if convert_timedeltas == "coerce": - from pandas.core.tools.timedeltas import to_timedelta - - new_values = to_timedelta(values, errors="coerce") - - # if we are all nans then leave me alone - if not isna(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects( - values, convert_timedelta=convert_timedeltas - ) + if is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, convert_timedelta=True) # convert to numeric - if values.dtype == np.object_: + if is_object_dtype(values.dtype): if convert_numeric: try: new_values = lib.maybe_convert_numeric( @@ -791,33 +771,38 @@ def maybe_convert_objects( # soft-conversion values = lib.maybe_convert_objects(values) - values = values.copy() if copy else values + if values is orig_values: + values = values.copy() return values def soft_convert_objects( - values, datetime=True, numeric=True, timedelta=True, coerce=False, copy=True + values: np.ndarray, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + copy: bool = True, ): """ if we have an object dtype, try to coerce dates and/or numbers """ + validate_bool_kwarg(datetime, "datetime") + validate_bool_kwarg(numeric, "numeric") + validate_bool_kwarg(timedelta, "timedelta") + validate_bool_kwarg(coerce, "coerce") + validate_bool_kwarg(copy, "copy") + conversion_count = sum((datetime, numeric, timedelta)) if conversion_count == 0: - raise ValueError( - "At least one of datetime, numeric or timedelta must " "be True." - ) + raise ValueError("At least one of datetime, numeric or timedelta must be True.") elif conversion_count > 1 and coerce: raise ValueError( "Only one of 'datetime', 'numeric' or " "'timedelta' can be True when when coerce=True." ) - if isinstance(values, (list, tuple)): - # List or scalar - values = np.array(values, dtype=np.object_) - elif not hasattr(values, "dtype"): - values = np.array([values], dtype=np.object_) - elif not is_object_dtype(values.dtype): + if not is_object_dtype(values.dtype): # If not object, do not attempt conversion values = values.copy() if copy else values return values @@ -843,13 +828,13 @@ def soft_convert_objects( # GH 20380, when datetime is beyond year 2262, hence outside # bound of nanosecond-resolution 64-bit integers. try: - values = lib.maybe_convert_objects(values, convert_datetime=datetime) + values = lib.maybe_convert_objects(values, convert_datetime=True) except OutOfBoundsDatetime: pass if timedelta and is_object_dtype(values.dtype): # Object check to ensure only run if previous did not convert - values = lib.maybe_convert_objects(values, convert_timedelta=timedelta) + values = lib.maybe_convert_objects(values, convert_timedelta=True) if numeric and is_object_dtype(values.dtype): try: @@ -1368,7 +1353,7 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False): arr = np.asarray(arr) if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): - raise OverflowError("Trying to coerce negative values " "to unsigned integers") + raise OverflowError("Trying to coerce negative values to unsigned integers") if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): raise ValueError("Trying to coerce float values to integers") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0229f36cedff8..ecc421df3fbb1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6033,6 +6033,11 @@ def _convert( ------- converted : same as input object """ + validate_bool_kwarg(datetime, "datetime") + validate_bool_kwarg(numeric, "numeric") + validate_bool_kwarg(timedelta, "timedelta") + validate_bool_kwarg(coerce, "coerce") + validate_bool_kwarg(copy, "copy") return self._constructor( self._data.convert( datetime=datetime, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 26aca34f20594..ace57938f948c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -18,7 +18,6 @@ find_common_type, infer_dtype_from, infer_dtype_from_scalar, - maybe_convert_objects, maybe_downcast_to_dtype, maybe_infer_dtype_type, maybe_promote, @@ -669,7 +668,14 @@ def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs): ) return newb - def convert(self, copy=True, **kwargs): + def convert( + self, + copy: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + ): """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we are not an ObjectBlock here! @@ -827,9 +833,7 @@ def replace( convert=convert, ) if convert: - blocks = [ - b.convert(by_item=True, numeric=False, copy=not inplace) for b in blocks - ] + blocks = [b.convert(numeric=False, copy=not inplace) for b in blocks] return blocks def _replace_single(self, *args, **kwargs): @@ -2779,37 +2783,31 @@ def is_bool(self): """ return lib.is_bool_array(self.values.ravel()) - # TODO: Refactor when convert_objects is removed since there will be 1 path - def convert(self, *args, **kwargs): + def convert( + self, + copy: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + ): """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we ARE an ObjectBlock!!!!! can return multiple blocks! """ - if args: - raise NotImplementedError - by_item = kwargs.get("by_item", True) - - new_inputs = ["coerce", "datetime", "numeric", "timedelta"] - new_style = False - for kw in new_inputs: - new_style |= kw in kwargs - - if new_style: - fn = soft_convert_objects - fn_inputs = new_inputs - else: - fn = maybe_convert_objects - fn_inputs = ["convert_dates", "convert_numeric", "convert_timedeltas"] - fn_inputs += ["copy"] - - fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs} - # operate column-by-column def f(m, v, i): shape = v.shape - values = fn(v.ravel(), **fn_kwargs) + values = soft_convert_objects( + v.ravel(), + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, + copy=copy, + ) if isinstance(values, np.ndarray): # TODO: allow EA once reshape is supported values = values.reshape(shape) @@ -2817,7 +2815,7 @@ def f(m, v, i): values = _block_shape(values, ndim=self.ndim) return values - if by_item and not self._is_single_block: + if self.ndim == 2: blocks = self.split_and_operate(None, f, False) else: values = f(None, self.values.ravel(), None) @@ -3041,7 +3039,7 @@ def re_replacer(s): # convert block = self.make_block(new_values) if convert: - block = block.convert(by_item=True, numeric=False) + block = block.convert(numeric=False) return block def _replace_coerce( @@ -3080,9 +3078,7 @@ def _replace_coerce( mask=mask, ) if convert: - block = [ - b.convert(by_item=True, numeric=False, copy=True) for b in block - ] + block = [b.convert(numeric=False, copy=True) for b in block] return block return self diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2e7280eeae0e2..394c0773409f2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1551,7 +1551,6 @@ def index(self): def convert(self, **kwargs): """ convert the whole block as one """ - kwargs["by_item"] = False return self.apply("convert", **kwargs) @property diff --git a/pandas/tests/dtypes/cast/test_convert_objects.py b/pandas/tests/dtypes/cast/test_convert_objects.py index 45980dbd82736..a28d554acd312 100644 --- a/pandas/tests/dtypes/cast/test_convert_objects.py +++ b/pandas/tests/dtypes/cast/test_convert_objects.py @@ -5,9 +5,8 @@ @pytest.mark.parametrize("data", [[1, 2], ["apply", "banana"]]) -@pytest.mark.parametrize("copy", [True, False]) -def test_maybe_convert_objects_copy(data, copy): +def test_maybe_convert_objects_copy(data): arr = np.array(data) - out = maybe_convert_objects(arr, copy=copy) + out = maybe_convert_objects(arr) - assert (arr is out) is (not copy) + assert arr is not out diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 655e484bc34d1..b56251aae884e 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -584,10 +584,6 @@ def _compare(old_mgr, new_mgr): new_mgr = mgr.convert() _compare(mgr, new_mgr) - mgr = create_mgr("a, b: object; f: i8; g: f8") - new_mgr = mgr.convert() - _compare(mgr, new_mgr) - # convert mgr = create_mgr("a,b,foo: object; f: i8; g: f8") mgr.set("a", np.array(["1"] * N, dtype=np.object_))