diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 0e591e180e078..707257a35983e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -500,6 +500,7 @@ ExtensionType Changes - :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) - :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). +- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) .. _whatsnew_0240.api.incompatibilities: diff --git a/pandas/core/base.py b/pandas/core/base.py index 7f14a68503973..00c049497c0d8 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -664,7 +664,7 @@ def transpose(self, *args, **kwargs): "definition self") @property - def _is_homogeneous(self): + def _is_homogeneous_type(self): """Whether the object has a single dtype. By definition, Series and Index are always considered homogeneous. @@ -673,8 +673,8 @@ def _is_homogeneous(self): See Also -------- - DataFrame._is_homogeneous - MultiIndex._is_homogeneous + DataFrame._is_homogeneous_type + MultiIndex._is_homogeneous_type """ return True diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e16f61d7f5f02..cc58674398b70 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -614,7 +614,7 @@ def shape(self): return len(self.index), len(self.columns) @property - def _is_homogeneous(self): + def _is_homogeneous_type(self): """ Whether all the columns in a DataFrame have the same type. @@ -624,16 +624,17 @@ def _is_homogeneous(self): Examples -------- - >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous + >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type True - >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous + >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type False Items with the same type but different sizes are considered different types. - >>> DataFrame({"A": np.array([1, 2], dtype=np.int32), - ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous + >>> DataFrame({ + ... "A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type False """ if self._data.any_extension_types: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ad38f037b6578..3e6b934e1e863 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -289,21 +289,23 @@ def levels(self): return self._levels @property - def _is_homogeneous(self): + def _is_homogeneous_type(self): """Whether the levels of a MultiIndex all have the same dtype. This looks at the dtypes of the levels. See Also -------- - Index._is_homogeneous - DataFrame._is_homogeneous + Index._is_homogeneous_type + DataFrame._is_homogeneous_type Examples -------- - >>> MultiIndex.from_tuples([('a', 'b'), ('a', 'c')])._is_homogeneous + >>> MultiIndex.from_tuples([ + ... ('a', 'b'), ('a', 'c')])._is_homogeneous_type True - >>> MultiIndex.from_tuples([('a', 1), ('a', 2)])._is_homogeneous + >>> MultiIndex.from_tuples([ + ... ('a', 1), ('a', 2)])._is_homogeneous_type False """ return len({x.dtype for x in self.levels}) <= 1 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 63738594799f5..2f29f1ae2509f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -12,9 +12,6 @@ from pandas.util._validators import validate_bool_kwarg from pandas.compat import range, map, zip -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, - PandasExtensionDtype) from pandas.core.dtypes.common import ( _NS_DTYPE, is_datetimelike_v_numeric, @@ -791,6 +788,11 @@ def _interleave(self): """ dtype = _interleaved_dtype(self.blocks) + if is_extension_array_dtype(dtype): + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this. + dtype = 'object' + result = np.empty(self.shape, dtype=dtype) if result.shape[0] == 0: @@ -906,14 +908,25 @@ def fast_xs(self, loc): # unique dtype = _interleaved_dtype(self.blocks) + n = len(items) - result = np.empty(n, dtype=dtype) + if is_extension_array_dtype(dtype): + # we'll eventually construct an ExtensionArray. + result = np.empty(n, dtype=object) + else: + result = np.empty(n, dtype=dtype) + for blk in self.blocks: # Such assignment may incorrectly coerce NaT to None # result[blk.mgr_locs] = blk._slice((slice(None), loc)) for i, rl in enumerate(blk.mgr_locs): result[rl] = blk._try_coerce_result(blk.iget((i, loc))) + if is_extension_array_dtype(dtype): + result = dtype.construct_array_type()._from_sequence( + result, dtype=dtype + ) + return result def consolidate(self): @@ -1855,16 +1868,22 @@ def _shape_compat(x): def _interleaved_dtype(blocks): - if not len(blocks): - return None + # type: (List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]] + """Find the common dtype for `blocks`. - dtype = find_common_type([b.dtype for b in blocks]) + Parameters + ---------- + blocks : List[Block] - # only numpy compat - if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)): - dtype = np.object + Returns + ------- + dtype : Optional[Union[np.dtype, ExtensionDtype]] + None is returned when `blocks` is empty. + """ + if not len(blocks): + return None - return dtype + return find_common_type([b.dtype for b in blocks]) def _consolidate(blocks): diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index ca4bd64659e06..c91370dc36770 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -836,8 +836,16 @@ def test_constructor_list_str_na(self, string_dtype): "B": pd.Categorical(['b', 'c'])}), False), ]) - def test_is_homogeneous(self, data, expected): - assert data._is_homogeneous is expected + def test_is_homogeneous_type(self, data, expected): + assert data._is_homogeneous_type is expected + + def test_asarray_homogenous(self): + df = pd.DataFrame({"A": pd.Categorical([1, 2]), + "B": pd.Categorical([1, 2])}) + result = np.asarray(df) + # may change from object in the future + expected = np.array([[1, 1], [2, 2]], dtype='object') + tm.assert_numpy_array_equal(result, expected) class TestDataFrameDatetimeWithTZ(TestData): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 761c633f89da3..0f524ca0aaac5 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1079,3 +1079,31 @@ def test_validate_indices_high(): def test_validate_indices_empty(): with tm.assert_raises_regex(IndexError, "indices are out"): validate_indices(np.array([0, 1]), 0) + + +def test_extension_array_cross_section(): + # A cross-section of a homogeneous EA should be an EA + df = pd.DataFrame({ + "A": pd.core.arrays.integer_array([1, 2]), + "B": pd.core.arrays.integer_array([3, 4]) + }, index=['a', 'b']) + expected = pd.Series(pd.core.arrays.integer_array([1, 3]), + index=['A', 'B'], name='a') + result = df.loc['a'] + tm.assert_series_equal(result, expected) + + result = df.iloc[0] + tm.assert_series_equal(result, expected) + + +def test_extension_array_cross_section_converts(): + df = pd.DataFrame({ + "A": pd.core.arrays.integer_array([1, 2]), + "B": np.array([1, 2]), + }, index=['a', 'b']) + result = df.loc['a'] + expected = pd.Series([1, 1], dtype=object, index=['A', 'B'], name='a') + tm.assert_series_equal(result, expected) + + result = df.iloc[0] + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index aefa8badf72e7..b8f80164e5402 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -738,8 +738,8 @@ def test_multiindex_contains_dropped(self): (MultiIndex.from_product([(1, 2), (3, 4)]), True), (MultiIndex.from_product([('a', 'b'), (1, 2)]), False), ]) - def test_multiindex_is_homogeneous(self, data, expected): - assert data._is_homogeneous is expected + def test_multiindex_is_homogeneous_type(self, data, expected): + assert data._is_homogeneous_type is expected class TestMultiIndexSlicers(object): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 83a458eedbd93..125dff9ecfa7c 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -509,7 +509,7 @@ def test_infer_objects_series(self): assert actual.dtype == 'object' tm.assert_series_equal(actual, expected) - def test_is_homogeneous(self): - assert Series()._is_homogeneous - assert Series([1, 2])._is_homogeneous - assert Series(pd.Categorical([1, 2]))._is_homogeneous + def test_is_homogeneous_type(self): + assert Series()._is_homogeneous_type + assert Series([1, 2])._is_homogeneous_type + assert Series(pd.Categorical([1, 2]))._is_homogeneous_type