diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 7838ef8df4164..16a30408c021d 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -84,6 +84,7 @@ Other enhancements - :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`) - Added ``name`` parameter to :meth:`IntervalIndex.from_breaks`, :meth:`IntervalIndex.from_arrays` and :meth:`IntervalIndex.from_tuples` (:issue:`48911`) - Added :meth:`Index.infer_objects` analogous to :meth:`Series.infer_objects` (:issue:`50034`) +- Added ``copy`` parameter to :meth:`Series.infer_objects` and :meth:`DataFrame.infer_objects`, passing ``False`` will avoid making copies for series or columns that are already non-object or where no better dtype can be inferred (:issue:`50096`) - :meth:`DataFrame.plot.hist` now recognizes ``xlabel`` and ``ylabel`` arguments (:issue:`49793`) - diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 37b7af13fc7c4..c659f8d0d9a4d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6316,7 +6316,7 @@ def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT: return self.copy(deep=True) @final - def infer_objects(self: NDFrameT) -> NDFrameT: + def infer_objects(self: NDFrameT, copy: bool_t = True) -> NDFrameT: """ Attempt to infer better dtypes for object columns. @@ -6325,6 +6325,12 @@ def infer_objects(self: NDFrameT) -> NDFrameT: columns unchanged. The inference rules are the same as during normal Series/DataFrame construction. + Parameters + ---------- + copy : bool, default True + Whether to make a copy for non-object or non-inferrable columns + or Series. + Returns ------- converted : same type as input object @@ -6354,7 +6360,7 @@ def infer_objects(self: NDFrameT) -> NDFrameT: A int64 dtype: object """ - new_mgr = self._mgr.convert() + new_mgr = self._mgr.convert(copy=copy) return self._constructor(new_mgr).__finalize__(self, method="infer_objects") @final diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 918c70ff91da5..77617bce7ea41 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -374,19 +374,22 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors) - def convert(self: T) -> T: + def convert(self: T, copy: bool) -> T: def _convert(arr): if is_object_dtype(arr.dtype): # extract PandasArray for tests that patch PandasArray._typ arr = np.asarray(arr) - return lib.maybe_convert_objects( + result = lib.maybe_convert_objects( arr, convert_datetime=True, convert_timedelta=True, convert_period=True, ) + if result is arr and copy: + return arr.copy() + return result else: - return arr.copy() + return arr.copy() if copy else arr return self.apply(_convert) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 95300c888eede..4006d63b7b115 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1981,6 +1981,8 @@ def convert( convert_timedelta=True, convert_period=True, ) + if copy and res_values is values: + res_values = values.copy() res_values = ensure_block_shape(res_values, self.ndim) return [self.make_block(res_values)] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d1eee23f1908c..6275e04c30e08 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -441,10 +441,10 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: return self.apply("astype", dtype=dtype, copy=copy, errors=errors) - def convert(self: T) -> T: + def convert(self: T, copy: bool) -> T: return self.apply( "convert", - copy=True, + copy=copy, ) def replace(self: T, to_replace, value, inplace: bool) -> T: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index dc7960cde4a61..570a83eb6bf88 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -591,7 +591,7 @@ def _compare(old_mgr, new_mgr): # noops mgr = create_mgr("f: i8; g: f8") - new_mgr = mgr.convert() + new_mgr = mgr.convert(copy=True) _compare(mgr, new_mgr) # convert @@ -599,7 +599,7 @@ def _compare(old_mgr, new_mgr): mgr.iset(0, np.array(["1"] * N, dtype=np.object_)) mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) - new_mgr = mgr.convert() + new_mgr = mgr.convert(copy=True) assert new_mgr.iget(0).dtype == np.object_ assert new_mgr.iget(1).dtype == np.object_ assert new_mgr.iget(2).dtype == np.object_ @@ -612,7 +612,7 @@ def _compare(old_mgr, new_mgr): mgr.iset(0, np.array(["1"] * N, dtype=np.object_)) mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) - new_mgr = mgr.convert() + new_mgr = mgr.convert(copy=True) assert new_mgr.iget(0).dtype == np.object_ assert new_mgr.iget(1).dtype == np.object_ assert new_mgr.iget(2).dtype == np.object_ diff --git a/pandas/tests/series/methods/test_infer_objects.py b/pandas/tests/series/methods/test_infer_objects.py index 4710aaf54de31..9df91afbde70e 100644 --- a/pandas/tests/series/methods/test_infer_objects.py +++ b/pandas/tests/series/methods/test_infer_objects.py @@ -4,6 +4,19 @@ class TestInferObjects: + def test_copy(self, index_or_series): + # GH#50096 + # case where we don't need to do inference because it is already non-object + obj = index_or_series(np.array([1, 2, 3], dtype="int64")) + + result = obj.infer_objects(copy=False) + assert tm.shares_memory(result, obj) + + # case where we try to do inference but can't do better than object + obj2 = index_or_series(np.array(["foo", 2], dtype=object)) + result2 = obj2.infer_objects(copy=False) + assert tm.shares_memory(result2, obj2) + def test_infer_objects_series(self, index_or_series): # GH#11221 actual = index_or_series(np.array([1, 2, 3], dtype="O")).infer_objects()