From bc1c21ac9ff3c343f004c151ecfe3fdae7befb3a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 17 Jul 2019 15:30:27 -0700 Subject: [PATCH 01/10] validate args for maybe_convert_objects, soft_convert_objects; remove unreachable branches --- pandas/core/dtypes/cast.py | 83 +++++++++++++++----------------------- pandas/core/generic.py | 5 +++ 2 files changed, 37 insertions(+), 51 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f483cf520754b..f4f0f34764272 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,6 +6,7 @@ from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT +from pandas.util._validators import validate_bool_kwarg from .common import ( _INT64_DTYPE, @@ -708,9 +709,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer): if not np.isfinite(arr).all(): - raise ValueError( - "Cannot convert non-finite values (NA or inf) to " "integer" - ) + raise ValueError("Cannot convert non-finite values (NA or inf) to integer") elif is_object_dtype(arr): @@ -731,9 +730,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy) if dtype.name in ("datetime64", "timedelta64"): - msg = ( - "The '{dtype}' dtype has no unit. " "Please pass in '{dtype}[ns]' instead." - ) + msg = "The '{dtype}' dtype has no unit. Please pass in '{dtype}[ns]' instead." raise ValueError(msg.format(dtype=dtype.name)) if copy or is_object_dtype(arr) or is_object_dtype(dtype): @@ -744,46 +741,25 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): def maybe_convert_objects( - values, convert_dates=True, convert_numeric=True, convert_timedeltas=True, copy=True + values: np.ndarray, + convert_dates: bool = True, + convert_numeric: bool = True, + convert_timedeltas: bool = True, + copy: bool = True, ): """ if we have an object dtype, try to coerce dates and/or numbers """ - - # if we have passed in a list or scalar - if isinstance(values, (list, tuple)): - values = np.array(values, dtype=np.object_) - if not hasattr(values, "dtype"): - values = np.array([values], dtype=np.object_) + validate_bool_kwarg(convert_dates, "convert_dates") + validate_bool_kwarg(convert_numeric, "convert_numeric") + validate_bool_kwarg(convert_timedeltas, "convert_timedeltas") + validate_bool_kwarg(copy, "copy") # convert dates if convert_dates and values.dtype == np.object_: - - # we take an aggressive stance and convert to datetime64[ns] - if convert_dates == "coerce": - new_values = maybe_cast_to_datetime(values, "M8[ns]", errors="coerce") - - # if we are all nans then leave me alone - if not isna(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects(values, convert_datetime=convert_dates) + values = lib.maybe_convert_objects(values, convert_datetime=True) # convert timedeltas if convert_timedeltas and values.dtype == np.object_: - - if convert_timedeltas == "coerce": - from pandas.core.tools.timedeltas import to_timedelta - - new_values = to_timedelta(values, errors="coerce") - - # if we are all nans then leave me alone - if not isna(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects( - values, convert_timedelta=convert_timedeltas - ) + values = lib.maybe_convert_objects(values, convert_timedelta=True) # convert to numeric if values.dtype == np.object_: @@ -809,27 +785,32 @@ def maybe_convert_objects( def soft_convert_objects( - values, datetime=True, numeric=True, timedelta=True, coerce=False, copy=True + values: np.ndarray, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + copy: bool = True, ): """ if we have an object dtype, try to coerce dates and/or numbers """ + validate_bool_kwarg(datetime, "datetime") + validate_bool_kwarg(numeric, "numeric") + validate_bool_kwarg(timedelta, "timedelta") + validate_bool_kwarg(coerce, "coerce") + validate_bool_kwarg(copy, "copy") + assert not coerce # we're only called from one place in internals.blocks + conversion_count = sum((datetime, numeric, timedelta)) if conversion_count == 0: - raise ValueError( - "At least one of datetime, numeric or timedelta must " "be True." - ) + raise ValueError("At least one of datetime, numeric or timedelta must be True.") elif conversion_count > 1 and coerce: raise ValueError( "Only one of 'datetime', 'numeric' or " "'timedelta' can be True when when coerce=True." ) - if isinstance(values, (list, tuple)): - # List or scalar - values = np.array(values, dtype=np.object_) - elif not hasattr(values, "dtype"): - values = np.array([values], dtype=np.object_) - elif not is_object_dtype(values.dtype): + if not is_object_dtype(values.dtype): # If not object, do not attempt conversion values = values.copy() if copy else values return values @@ -855,13 +836,13 @@ def soft_convert_objects( # GH 20380, when datetime is beyond year 2262, hence outside # bound of nanosecond-resolution 64-bit integers. try: - values = lib.maybe_convert_objects(values, convert_datetime=datetime) + values = lib.maybe_convert_objects(values, convert_datetime=True) except OutOfBoundsDatetime: pass if timedelta and is_object_dtype(values.dtype): # Object check to ensure only run if previous did not convert - values = lib.maybe_convert_objects(values, convert_timedelta=timedelta) + values = lib.maybe_convert_objects(values, convert_timedelta=True) if numeric and is_object_dtype(values.dtype): try: @@ -1380,7 +1361,7 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False): arr = np.asarray(arr) if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): - raise OverflowError("Trying to coerce negative values " "to unsigned integers") + raise OverflowError("Trying to coerce negative values to unsigned integers") if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): raise ValueError("Trying to coerce float values to integers") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f28f58b070368..a6cc2562bc8d6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6038,6 +6038,11 @@ def _convert( ------- converted : same as input object """ + validate_bool_kwarg(datetime, "datetime") + validate_bool_kwarg(numeric, "numeric") + validate_bool_kwarg(timedelta, "timedelta") + validate_bool_kwarg(coerce, "coerce") + validate_bool_kwarg(copy, "copy") return self._constructor( self._data.convert( datetime=datetime, From bb7cb11509b1d4a613a7b31994a57729a84ede38 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 17 Jul 2019 17:40:05 -0700 Subject: [PATCH 02/10] simplify rarely-used maybe_convert_objects, soft_convert_objects, Block.convert --- pandas/core/dtypes/cast.py | 21 +++++++------------ pandas/core/internals/blocks.py | 19 ++++++----------- .../tests/dtypes/cast/test_convert_objects.py | 7 +++---- pandas/tests/internals/test_internals.py | 4 ---- 4 files changed, 16 insertions(+), 35 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f4f0f34764272..c91ad0cec406c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -740,25 +740,18 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): return arr.view(dtype) -def maybe_convert_objects( - values: np.ndarray, - convert_dates: bool = True, - convert_numeric: bool = True, - convert_timedeltas: bool = True, - copy: bool = True, -): +def maybe_convert_objects(values: np.ndarray, convert_numeric: bool = True): """ if we have an object dtype, try to coerce dates and/or numbers """ - validate_bool_kwarg(convert_dates, "convert_dates") validate_bool_kwarg(convert_numeric, "convert_numeric") - validate_bool_kwarg(convert_timedeltas, "convert_timedeltas") - validate_bool_kwarg(copy, "copy") + + orig_values = values # convert dates - if convert_dates and values.dtype == np.object_: + if values.dtype == np.object_: values = lib.maybe_convert_objects(values, convert_datetime=True) # convert timedeltas - if convert_timedeltas and values.dtype == np.object_: + if values.dtype == np.object_: values = lib.maybe_convert_objects(values, convert_timedelta=True) # convert to numeric @@ -779,7 +772,8 @@ def maybe_convert_objects( # soft-conversion values = lib.maybe_convert_objects(values) - values = values.copy() if copy else values + if values is orig_values: + values = values.copy() return values @@ -799,7 +793,6 @@ def soft_convert_objects( validate_bool_kwarg(timedelta, "timedelta") validate_bool_kwarg(coerce, "coerce") validate_bool_kwarg(copy, "copy") - assert not coerce # we're only called from one place in internals.blocks conversion_count = sum((datetime, numeric, timedelta)) if conversion_count == 0: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 26aca34f20594..7250bc8af4c3b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -18,7 +18,6 @@ find_common_type, infer_dtype_from, infer_dtype_from_scalar, - maybe_convert_objects, maybe_downcast_to_dtype, maybe_infer_dtype_type, maybe_promote, @@ -2789,27 +2788,21 @@ def convert(self, *args, **kwargs): if args: raise NotImplementedError - by_item = kwargs.get("by_item", True) + by_item = kwargs.pop("by_item", True) new_inputs = ["coerce", "datetime", "numeric", "timedelta"] - new_style = False - for kw in new_inputs: - new_style |= kw in kwargs - if new_style: - fn = soft_convert_objects - fn_inputs = new_inputs - else: - fn = maybe_convert_objects - fn_inputs = ["convert_dates", "convert_numeric", "convert_timedeltas"] + fn_inputs = new_inputs fn_inputs += ["copy"] - fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs} + fn_kwargs = {key: kwargs.pop(key) for key in fn_inputs if key in kwargs} + if kwargs: + raise ValueError("Unrecognized keywords: {kwargs}".format(kwargs=kwargs)) # operate column-by-column def f(m, v, i): shape = v.shape - values = fn(v.ravel(), **fn_kwargs) + values = soft_convert_objects(v.ravel(), **fn_kwargs) if isinstance(values, np.ndarray): # TODO: allow EA once reshape is supported values = values.reshape(shape) diff --git a/pandas/tests/dtypes/cast/test_convert_objects.py b/pandas/tests/dtypes/cast/test_convert_objects.py index 45980dbd82736..a28d554acd312 100644 --- a/pandas/tests/dtypes/cast/test_convert_objects.py +++ b/pandas/tests/dtypes/cast/test_convert_objects.py @@ -5,9 +5,8 @@ @pytest.mark.parametrize("data", [[1, 2], ["apply", "banana"]]) -@pytest.mark.parametrize("copy", [True, False]) -def test_maybe_convert_objects_copy(data, copy): +def test_maybe_convert_objects_copy(data): arr = np.array(data) - out = maybe_convert_objects(arr, copy=copy) + out = maybe_convert_objects(arr) - assert (arr is out) is (not copy) + assert arr is not out diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 655e484bc34d1..b56251aae884e 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -584,10 +584,6 @@ def _compare(old_mgr, new_mgr): new_mgr = mgr.convert() _compare(mgr, new_mgr) - mgr = create_mgr("a, b: object; f: i8; g: f8") - new_mgr = mgr.convert() - _compare(mgr, new_mgr) - # convert mgr = create_mgr("a,b,foo: object; f: i8; g: f8") mgr.set("a", np.array(["1"] * N, dtype=np.object_)) From 6f28f0b970852ee9c16a49c69fd6ba589847d337 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 17 Jul 2019 18:10:30 -0700 Subject: [PATCH 03/10] cleanup --- pandas/core/internals/blocks.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7250bc8af4c3b..bc28cfe15d152 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2779,21 +2779,15 @@ def is_bool(self): return lib.is_bool_array(self.values.ravel()) # TODO: Refactor when convert_objects is removed since there will be 1 path - def convert(self, *args, **kwargs): + def convert(self, **kwargs): """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we ARE an ObjectBlock!!!!! can return multiple blocks! """ - - if args: - raise NotImplementedError by_item = kwargs.pop("by_item", True) - new_inputs = ["coerce", "datetime", "numeric", "timedelta"] - - fn_inputs = new_inputs - fn_inputs += ["copy"] + fn_inputs = ["coerce", "datetime", "numeric", "timedelta", "copy"] fn_kwargs = {key: kwargs.pop(key) for key in fn_inputs if key in kwargs} if kwargs: From c0be483797859bbca490bf0540c920d6f8b127fc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 17 Jul 2019 18:25:52 -0700 Subject: [PATCH 04/10] make kwargs explicit --- pandas/core/internals/blocks.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index bc28cfe15d152..2480efa6ad548 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2779,24 +2779,32 @@ def is_bool(self): return lib.is_bool_array(self.values.ravel()) # TODO: Refactor when convert_objects is removed since there will be 1 path - def convert(self, **kwargs): + def convert( + self, + by_item: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + copy: bool = True, + ): """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we ARE an ObjectBlock!!!!! can return multiple blocks! """ - by_item = kwargs.pop("by_item", True) - - fn_inputs = ["coerce", "datetime", "numeric", "timedelta", "copy"] - - fn_kwargs = {key: kwargs.pop(key) for key in fn_inputs if key in kwargs} - if kwargs: - raise ValueError("Unrecognized keywords: {kwargs}".format(kwargs=kwargs)) # operate column-by-column def f(m, v, i): shape = v.shape - values = soft_convert_objects(v.ravel(), **fn_kwargs) + values = soft_convert_objects( + v.ravel(), + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, + copy=copy, + ) if isinstance(values, np.ndarray): # TODO: allow EA once reshape is supported values = values.reshape(shape) @@ -2804,7 +2812,7 @@ def f(m, v, i): values = _block_shape(values, ndim=self.ndim) return values - if by_item and not self._is_single_block: + if by_item and self.ndim == 2: blocks = self.split_and_operate(None, f, False) else: values = f(None, self.values.ravel(), None) From dc3d1a1cc86ec3087b97f2f4d35b9a4910636c19 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 17 Jul 2019 19:43:07 -0700 Subject: [PATCH 05/10] remove by_item kwarg --- pandas/core/internals/blocks.py | 9 ++++----- pandas/core/internals/managers.py | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2480efa6ad548..643423ce0cb32 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -827,7 +827,7 @@ def replace( ) if convert: blocks = [ - b.convert(by_item=True, numeric=False, copy=not inplace) for b in blocks + b.convert(numeric=False, copy=not inplace) for b in blocks ] return blocks @@ -2781,7 +2781,6 @@ def is_bool(self): # TODO: Refactor when convert_objects is removed since there will be 1 path def convert( self, - by_item: bool = True, datetime: bool = True, numeric: bool = True, timedelta: bool = True, @@ -2812,7 +2811,7 @@ def f(m, v, i): values = _block_shape(values, ndim=self.ndim) return values - if by_item and self.ndim == 2: + if self.ndim == 2: blocks = self.split_and_operate(None, f, False) else: values = f(None, self.values.ravel(), None) @@ -3036,7 +3035,7 @@ def re_replacer(s): # convert block = self.make_block(new_values) if convert: - block = block.convert(by_item=True, numeric=False) + block = block.convert(numeric=False) return block def _replace_coerce( @@ -3076,7 +3075,7 @@ def _replace_coerce( ) if convert: block = [ - b.convert(by_item=True, numeric=False, copy=True) for b in block + b.convert(numeric=False, copy=True) for b in block ] return block return self diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2e7280eeae0e2..394c0773409f2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1551,7 +1551,6 @@ def index(self): def convert(self, **kwargs): """ convert the whole block as one """ - kwargs["by_item"] = False return self.apply("convert", **kwargs) @property From 37942b16455918e6d41244ca4a0457de3bb7f18d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 18 Jul 2019 07:42:56 -0700 Subject: [PATCH 06/10] blackify --- pandas/core/internals/blocks.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 643423ce0cb32..bc83c9c5950ec 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -826,9 +826,7 @@ def replace( convert=convert, ) if convert: - blocks = [ - b.convert(numeric=False, copy=not inplace) for b in blocks - ] + blocks = [b.convert(numeric=False, copy=not inplace) for b in blocks] return blocks def _replace_single(self, *args, **kwargs): @@ -3074,9 +3072,7 @@ def _replace_coerce( mask=mask, ) if convert: - block = [ - b.convert(numeric=False, copy=True) for b in block - ] + block = [b.convert(numeric=False, copy=True) for b in block] return block return self From 7137e98d03c202efdd35d44382bf802cb3d93ba5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 18 Jul 2019 10:19:24 -0700 Subject: [PATCH 07/10] mypy fixup --- pandas/core/internals/blocks.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index bc83c9c5950ec..3449cfee488e2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -668,7 +668,14 @@ def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs): ) return newb - def convert(self, copy=True, **kwargs): + def convert( + self, + copy: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + ): """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we are not an ObjectBlock here! @@ -2779,11 +2786,11 @@ def is_bool(self): # TODO: Refactor when convert_objects is removed since there will be 1 path def convert( self, + copy: bool = True, datetime: bool = True, numeric: bool = True, timedelta: bool = True, coerce: bool = False, - copy: bool = True, ): """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we ARE an ObjectBlock!!!!! From b17489aa1d5b1256f97c88425e45b7e179083237 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 20 Jul 2019 12:52:10 -0700 Subject: [PATCH 08/10] docstring, use is_object_dtype --- pandas/core/dtypes/cast.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7b343cd2b0c28..fd8536e38eee7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -729,21 +729,32 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): def maybe_convert_objects(values: np.ndarray, convert_numeric: bool = True): - """ if we have an object dtype, try to coerce dates and/or numbers """ + """ + If we have an object dtype array, try to coerce dates and/or numbers. + + Parameters + ---------- + values : ndarray + convert_numeric : bool, default True + + Returns + ------- + ndarray or DatetimeIndex + """ validate_bool_kwarg(convert_numeric, "convert_numeric") orig_values = values # convert dates - if values.dtype == np.object_: + if is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, convert_datetime=True) # convert timedeltas - if values.dtype == np.object_: + if is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, convert_timedelta=True) # convert to numeric - if values.dtype == np.object_: + if is_object_dtype(values.dtype): if convert_numeric: try: new_values = lib.maybe_convert_numeric( From 2e3eb9d99be1e03cb501373998c03a2e0ccf236f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 20 Jul 2019 13:22:56 -0700 Subject: [PATCH 09/10] Merge branch 'master' of https://github.com/pandas-dev/pandas into convert_objects From b32f6521b8c9c802049c88d3868eb532954c60d0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 20 Jul 2019 13:23:19 -0700 Subject: [PATCH 10/10] remove comment --- pandas/core/internals/blocks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3449cfee488e2..ace57938f948c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2783,7 +2783,6 @@ def is_bool(self): """ return lib.is_bool_array(self.values.ravel()) - # TODO: Refactor when convert_objects is removed since there will be 1 path def convert( self, copy: bool = True,