Skip to content

CLN: simplify maybe_convert_objects, soft_convert_objects #27444

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jul 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 41 additions & 56 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from pandas._libs import lib, tslib, tslibs
from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT
from pandas.util._validators import validate_bool_kwarg

from .common import (
_INT64_DTYPE,
Expand Down Expand Up @@ -696,9 +697,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):
elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer):

if not np.isfinite(arr).all():
raise ValueError(
"Cannot convert non-finite values (NA or inf) to " "integer"
)
raise ValueError("Cannot convert non-finite values (NA or inf) to integer")

elif is_object_dtype(arr):

Expand All @@ -719,9 +718,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):
return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy)

if dtype.name in ("datetime64", "timedelta64"):
msg = (
"The '{dtype}' dtype has no unit. " "Please pass in '{dtype}[ns]' instead."
)
msg = "The '{dtype}' dtype has no unit. Please pass in '{dtype}[ns]' instead."
raise ValueError(msg.format(dtype=dtype.name))

if copy or is_object_dtype(arr) or is_object_dtype(dtype):
Expand All @@ -731,50 +728,33 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):
return arr.view(dtype)


def maybe_convert_objects(
values, convert_dates=True, convert_numeric=True, convert_timedeltas=True, copy=True
):
""" if we have an object dtype, try to coerce dates and/or numbers """

# if we have passed in a list or scalar
if isinstance(values, (list, tuple)):
values = np.array(values, dtype=np.object_)
if not hasattr(values, "dtype"):
values = np.array([values], dtype=np.object_)
def maybe_convert_objects(values: np.ndarray, convert_numeric: bool = True):
"""
If we have an object dtype array, try to coerce dates and/or numbers.
# convert dates
if convert_dates and values.dtype == np.object_:
Parameters
----------
values : ndarray
convert_numeric : bool, default True
# we take an aggressive stance and convert to datetime64[ns]
if convert_dates == "coerce":
new_values = maybe_cast_to_datetime(values, "M8[ns]", errors="coerce")
Returns
-------
ndarray or DatetimeIndex
"""
validate_bool_kwarg(convert_numeric, "convert_numeric")

# if we are all nans then leave me alone
if not isna(new_values).all():
values = new_values
orig_values = values

else:
values = lib.maybe_convert_objects(values, convert_datetime=convert_dates)
# convert dates
if is_object_dtype(values.dtype):
values = lib.maybe_convert_objects(values, convert_datetime=True)

# convert timedeltas
if convert_timedeltas and values.dtype == np.object_:

if convert_timedeltas == "coerce":
from pandas.core.tools.timedeltas import to_timedelta

new_values = to_timedelta(values, errors="coerce")

# if we are all nans then leave me alone
if not isna(new_values).all():
values = new_values

else:
values = lib.maybe_convert_objects(
values, convert_timedelta=convert_timedeltas
)
if is_object_dtype(values.dtype):
values = lib.maybe_convert_objects(values, convert_timedelta=True)

# convert to numeric
if values.dtype == np.object_:
if is_object_dtype(values.dtype):
if convert_numeric:
try:
new_values = lib.maybe_convert_numeric(
Expand All @@ -791,33 +771,38 @@ def maybe_convert_objects(
# soft-conversion
values = lib.maybe_convert_objects(values)

values = values.copy() if copy else values
if values is orig_values:
values = values.copy()

return values


def soft_convert_objects(
values, datetime=True, numeric=True, timedelta=True, coerce=False, copy=True
values: np.ndarray,
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
coerce: bool = False,
copy: bool = True,
):
""" if we have an object dtype, try to coerce dates and/or numbers """

validate_bool_kwarg(datetime, "datetime")
validate_bool_kwarg(numeric, "numeric")
validate_bool_kwarg(timedelta, "timedelta")
validate_bool_kwarg(coerce, "coerce")
validate_bool_kwarg(copy, "copy")

conversion_count = sum((datetime, numeric, timedelta))
if conversion_count == 0:
raise ValueError(
"At least one of datetime, numeric or timedelta must " "be True."
)
raise ValueError("At least one of datetime, numeric or timedelta must be True.")
elif conversion_count > 1 and coerce:
raise ValueError(
"Only one of 'datetime', 'numeric' or "
"'timedelta' can be True when when coerce=True."
)

if isinstance(values, (list, tuple)):
# List or scalar
values = np.array(values, dtype=np.object_)
elif not hasattr(values, "dtype"):
values = np.array([values], dtype=np.object_)
elif not is_object_dtype(values.dtype):
if not is_object_dtype(values.dtype):
# If not object, do not attempt conversion
values = values.copy() if copy else values
return values
Expand All @@ -843,13 +828,13 @@ def soft_convert_objects(
# GH 20380, when datetime is beyond year 2262, hence outside
# bound of nanosecond-resolution 64-bit integers.
try:
values = lib.maybe_convert_objects(values, convert_datetime=datetime)
values = lib.maybe_convert_objects(values, convert_datetime=True)
except OutOfBoundsDatetime:
pass

if timedelta and is_object_dtype(values.dtype):
# Object check to ensure only run if previous did not convert
values = lib.maybe_convert_objects(values, convert_timedelta=timedelta)
values = lib.maybe_convert_objects(values, convert_timedelta=True)

if numeric and is_object_dtype(values.dtype):
try:
Expand Down Expand Up @@ -1368,7 +1353,7 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False):
arr = np.asarray(arr)

if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
raise OverflowError("Trying to coerce negative values " "to unsigned integers")
raise OverflowError("Trying to coerce negative values to unsigned integers")

if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)):
raise ValueError("Trying to coerce float values to integers")
5 changes: 5 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6033,6 +6033,11 @@ def _convert(
-------
converted : same as input object
"""
validate_bool_kwarg(datetime, "datetime")
validate_bool_kwarg(numeric, "numeric")
validate_bool_kwarg(timedelta, "timedelta")
validate_bool_kwarg(coerce, "coerce")
validate_bool_kwarg(copy, "copy")
return self._constructor(
self._data.convert(
datetime=datetime,
Expand Down
60 changes: 28 additions & 32 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
find_common_type,
infer_dtype_from,
infer_dtype_from_scalar,
maybe_convert_objects,
maybe_downcast_to_dtype,
maybe_infer_dtype_type,
maybe_promote,
Expand Down Expand Up @@ -669,7 +668,14 @@ def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
)
return newb

def convert(self, copy=True, **kwargs):
def convert(
self,
copy: bool = True,
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
coerce: bool = False,
):
""" attempt to coerce any object types to better types return a copy
of the block (if copy = True) by definition we are not an ObjectBlock
here!
Expand Down Expand Up @@ -827,9 +833,7 @@ def replace(
convert=convert,
)
if convert:
blocks = [
b.convert(by_item=True, numeric=False, copy=not inplace) for b in blocks
]
blocks = [b.convert(numeric=False, copy=not inplace) for b in blocks]
return blocks

def _replace_single(self, *args, **kwargs):
Expand Down Expand Up @@ -2779,45 +2783,39 @@ def is_bool(self):
"""
return lib.is_bool_array(self.values.ravel())

# TODO: Refactor when convert_objects is removed since there will be 1 path
def convert(self, *args, **kwargs):
def convert(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

side issue is that a lot of this can be cleaned up now that convert_objects is gone

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that deprecation was before I got here, so it isn't clear to me what else can be cleaned up

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe just remove the comment, you have cleaned up reasonably. you can try looking in history but if not ok.

self,
copy: bool = True,
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
coerce: bool = False,
):
""" attempt to coerce any object types to better types return a copy of
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add typing to the returns would be good here (and maybe Paramters / Returns to the doc-string), but can be in the future. I think adding doc-strings on routines that we are most likely going to keep is worth the effort (so use your judgement on this).

the block (if copy = True) by definition we ARE an ObjectBlock!!!!!

can return multiple blocks!
"""

if args:
raise NotImplementedError
by_item = kwargs.get("by_item", True)

new_inputs = ["coerce", "datetime", "numeric", "timedelta"]
new_style = False
for kw in new_inputs:
new_style |= kw in kwargs

if new_style:
fn = soft_convert_objects
fn_inputs = new_inputs
else:
fn = maybe_convert_objects
fn_inputs = ["convert_dates", "convert_numeric", "convert_timedeltas"]
fn_inputs += ["copy"]

fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs}

# operate column-by-column
def f(m, v, i):
shape = v.shape
values = fn(v.ravel(), **fn_kwargs)
values = soft_convert_objects(
v.ravel(),
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
coerce=coerce,
copy=copy,
)
if isinstance(values, np.ndarray):
# TODO: allow EA once reshape is supported
values = values.reshape(shape)

values = _block_shape(values, ndim=self.ndim)
return values

if by_item and not self._is_single_block:
if self.ndim == 2:
blocks = self.split_and_operate(None, f, False)
else:
values = f(None, self.values.ravel(), None)
Expand Down Expand Up @@ -3041,7 +3039,7 @@ def re_replacer(s):
# convert
block = self.make_block(new_values)
if convert:
block = block.convert(by_item=True, numeric=False)
block = block.convert(numeric=False)
return block

def _replace_coerce(
Expand Down Expand Up @@ -3080,9 +3078,7 @@ def _replace_coerce(
mask=mask,
)
if convert:
block = [
b.convert(by_item=True, numeric=False, copy=True) for b in block
]
block = [b.convert(numeric=False, copy=True) for b in block]
return block
return self

Expand Down
1 change: 0 additions & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1551,7 +1551,6 @@ def index(self):

def convert(self, **kwargs):
""" convert the whole block as one """
kwargs["by_item"] = False
return self.apply("convert", **kwargs)

@property
Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/dtypes/cast/test_convert_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@


@pytest.mark.parametrize("data", [[1, 2], ["apply", "banana"]])
@pytest.mark.parametrize("copy", [True, False])
def test_maybe_convert_objects_copy(data, copy):
def test_maybe_convert_objects_copy(data):
arr = np.array(data)
out = maybe_convert_objects(arr, copy=copy)
out = maybe_convert_objects(arr)

assert (arr is out) is (not copy)
assert arr is not out
4 changes: 0 additions & 4 deletions pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,10 +584,6 @@ def _compare(old_mgr, new_mgr):
new_mgr = mgr.convert()
_compare(mgr, new_mgr)

mgr = create_mgr("a, b: object; f: i8; g: f8")
new_mgr = mgr.convert()
_compare(mgr, new_mgr)

# convert
mgr = create_mgr("a,b,foo: object; f: i8; g: f8")
mgr.set("a", np.array(["1"] * N, dtype=np.object_))
Expand Down