diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 43d1244c15d8a..d5069c71d332b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -715,6 +715,36 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once df.apply(func, axis=1) + +.. _whatsnew_110.api_breaking.explode_infer_dtype: + +Infer dtypes in explode method for DataFrame and Series +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, :meth:`DataFrame.explode` and :meth:`Series.explode` always returned ``object`` dtype for the column being exploded. Now the dtype of the exploded column is inferred and returned accordingly. (:issue:`34923`) + +.. ipython:: python + + s = pd.Series([1, 2, 3]) + df = pd.DataFrame({'A': [s, s, s, s], 'B': 1}) + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.explode("A").dtypes + Out[3]: + A object + B int64 + dtype: object + +*New behavior*: + +.. ipython:: python + + df.explode("A").dtypes + + Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f52341ed782d8..372a989ce0ffd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7070,6 +7070,7 @@ def explode( else: result.index = self.index.take(result.index) result = result.reindex(columns=self.columns, copy=False) + result = result.infer_objects() return result diff --git a/pandas/core/series.py b/pandas/core/series.py index ef3be854bc3bb..b3e9be26038c8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3843,7 +3843,7 @@ def explode(self, ignore_index: bool = False) -> "Series": else: index = self.index.repeat(counts) - result = self._constructor(values, index=index, name=self.name) + result = self._constructor(values, index=index, name=self.name).infer_objects() return result diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index 
2bbe8ac2d5b81..064f6936d6d0e 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -25,7 +25,7 @@ def test_basic(): expected = pd.DataFrame( { "A": pd.Series( - [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=np.float64 ), "B": 1, } @@ -55,7 +55,7 @@ def test_multi_index_rows(): ("b", 2), ] ), - dtype=object, + dtype=np.float64, ), "B": 1, } @@ -74,7 +74,7 @@ def test_multi_index_columns(): ("A", 1): pd.Series( [0, 1, 2, np.nan, np.nan, 3, 4], index=pd.Index([0, 0, 0, 1, 2, 3, 3]), - dtype=object, + dtype=np.float64, ), ("A", 2): 1, } @@ -93,7 +93,7 @@ def test_usecase(): expected = pd.DataFrame( { "A": [11, 11, 11, 11, 11, 22, 22, 22], - "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object), + "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=np.int64), "C": [10, 10, 10, 10, 10, 20, 20, 20], }, columns=list("ABC"), @@ -160,7 +160,22 @@ def test_duplicate_index(input_dict, input_index, expected_dict, expected_index) # GH 28005 df = pd.DataFrame(input_dict, index=input_index) result = df.explode("col1") - expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) + expected = pd.DataFrame(expected_dict, index=expected_index, dtype=np.int64) + tm.assert_frame_equal(result, expected) + + +def test_inferred_dtype(): + # GH 34923 + s = pd.Series([1, None, 3]) + df = pd.DataFrame({"A": [s, s], "B": 1}) + result = df.explode("A") + expected = pd.DataFrame( + { + "A": np.array([1, np.nan, 3, 1, np.nan, 3], dtype=np.float64), + "B": np.array([1, 1, 1, 1, 1, 1], dtype=np.int64), + }, + index=[0, 0, 0, 1, 1, 1], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index 4b65e042f7b02..b720d7a23b375 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -9,7 +9,10 @@ def 
test_basic(): s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"), name="foo") result = s.explode() expected = pd.Series( - [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object, name="foo" + [0, 1, 2, np.nan, np.nan, 3, 4], + index=list("aaabcdd"), + dtype=np.float64, + name="foo", ) tm.assert_series_equal(result, expected) @@ -54,7 +57,7 @@ def test_multi_index(): names=["foo", "bar"], ) expected = pd.Series( - [0, 1, 2, np.nan, np.nan, 3, 4], index=index, dtype=object, name="foo" + [0, 1, 2, np.nan, np.nan, 3, 4], index=index, dtype=np.float64, name="foo" ) tm.assert_series_equal(result, expected) @@ -116,7 +119,7 @@ def test_duplicate_index(): # GH 28005 s = pd.Series([[1, 2], [3, 4]], index=[0, 0]) result = s.explode() - expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=object) + expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=np.int64) tm.assert_series_equal(result, expected) @@ -124,5 +127,5 @@ def test_ignore_index(): # GH 34932 s = pd.Series([[1, 2], [3, 4]]) result = s.explode(ignore_index=True) - expected = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3], dtype=object) + expected = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3], dtype=np.int64) tm.assert_series_equal(result, expected)